diff --git a/.github/workflows/android-appcenter.yml b/.github/workflows/android-appcenter.yml index aff5c496..d0cbc773 100644 --- a/.github/workflows/android-appcenter.yml +++ b/.github/workflows/android-appcenter.yml @@ -25,109 +25,109 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Set up Node.js LTS - uses: actions/setup-node@v3 - with: - node-version: lts/* + - name: Set up Node.js LTS + uses: actions/setup-node@v3 + with: + node-version: lts/* - - name: Install AppCenter CLI - run: npm install -g appcenter-cli + - name: Install AppCenter CLI + run: npm install -g appcenter-cli - - name: set up JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' - - name: Copy test_resources - run: ./copy_test_resources.sh + - name: Copy test_resources + run: ./copy_test_resources.sh - - name: Inject AccessKey - run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties + - name: Inject AccessKey + run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties - - name: Inject Android keystore variables - run: | - echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyAlias=picovoice >> local.properties - echo storeFile=../picovoice.jks >> local.properties + - name: Inject Android keystore variables + run: | + echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyAlias=picovoice >> local.properties + echo storeFile=../picovoice.jks >> local.properties - - name: Setup Android keystore file - run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks + - name: Setup Android keystore file + run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks - - name: Grant execute permission for gradlew - run: chmod +x gradlew + - name: Grant execute permission for gradlew + run: chmod +x gradlew - - name: Build app - run: ./gradlew assembleDebug + - name: Build app + run: ./gradlew assembleDebug - - name: Build androidTest - run: ./gradlew assembleAndroidTest + - name: Build androidTest + run: ./gradlew assembleAndroidTest - - name: Run tests on AppCenter - run: appcenter test run espresso - --token ${{secrets.APPCENTERAPITOKEN}} - --app "Picovoice/Orca-Android" - --devices "Picovoice/android-min-max" - --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk - --test-series "orca-android" - --locale "en_US" - --build-dir orca-test-app/build/outputs/apk/androidTest/debug + - name: Run tests on AppCenter + run: appcenter test run espresso + --token ${{secrets.APPCENTERAPITOKEN}} + --app "Picovoice/Orca-Android" + --devices "Picovoice/android-min-max" + --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk + --test-series "orca-android" + --locale "en_US" + --build-dir orca-test-app/build/outputs/apk/androidTest/debug build-integ: name: Run Android Integration Tests on AppCenter runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - - name: Set up Node.js LTS - uses: actions/setup-node@v3 - with: - node-version: lts/* - - - name: Install AppCenter CLI - run: npm install -g appcenter-cli - - - name: set up 
JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' - - - name: Copy test_resources - run: ./copy_test_resources.sh - - - name: Inject AccessKey - run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties - - - name: Inject Android keystore variables - run: | - echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyAlias=picovoice >> local.properties - echo storeFile=../picovoice.jks >> local.properties - - - name: Setup Android keystore file - run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks - - - name: Grant execute permission for gradlew - run: chmod +x gradlew - - - name: Build app - run: ./gradlew assembleRelease - - - name: Build androidTest - run: ./gradlew assembleReleaseAndroidTest -DtestBuildType=integ - - - name: Run tests on AppCenter - run: appcenter test run espresso - --token ${{secrets.APPCENTERAPITOKEN}} - --app "Picovoice/Orca-Android" - --devices "Picovoice/android-min-max" - --app-path orca-test-app/build/outputs/apk/release/orca-test-app-release.apk - --test-series "orca-android" - --locale "en_US" - --build-dir orca-test-app/build/outputs/apk/androidTest/release \ No newline at end of file + - uses: actions/checkout@v3 + + - name: Set up Node.js LTS + uses: actions/setup-node@v3 + with: + node-version: lts/* + + - name: Install AppCenter CLI + run: npm install -g appcenter-cli + + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' + + - name: Copy test_resources + run: ./copy_test_resources.sh + + - name: Inject AccessKey + run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties + + - name: Inject Android keystore variables + run: | + echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyAlias=picovoice >> local.properties + echo storeFile=../picovoice.jks >> local.properties + + - name: Setup Android keystore file + run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks + + - name: Grant execute permission for gradlew + run: chmod +x gradlew + + - name: Build app + run: ./gradlew assembleRelease + + - name: Build androidTest + run: ./gradlew assembleReleaseAndroidTest -DtestBuildType=integ + + - name: Run tests on AppCenter + run: appcenter test run espresso + --token ${{secrets.APPCENTERAPITOKEN}} + --app "Picovoice/Orca-Android" + --devices "Picovoice/android-min-max" + --app-path orca-test-app/build/outputs/apk/release/orca-test-app-release.apk + --test-series "orca-android" + --locale "en_US" + --build-dir orca-test-app/build/outputs/apk/androidTest/release diff --git a/.github/workflows/android-demos.yml b/.github/workflows/android-demos.yml index 4c1d6708..b9d02ded 100644 --- a/.github/workflows/android-demos.yml +++ b/.github/workflows/android-demos.yml @@ -23,13 +23,13 @@ jobs: working-directory: demo/android/OrcaDemo steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: set up JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' - - name: Build - run: ./gradlew assembleDebug \ No newline at end of file + - name: 
Build + run: ./gradlew assembleDebug diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index c4a90226..8fae2bcd 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -26,70 +26,70 @@ jobs: strategy: matrix: - device: [single-android, 32bit-android] + device: [ single-android, 32bit-android ] include: - - device: single-android - procPerformanceThresholdSec: 3.0 - - device: 32bit-android - procPerformanceThresholdSec: 19.0 + - device: single-android + procPerformanceThresholdSec: 3.0 + - device: 32bit-android + procPerformanceThresholdSec: 19.0 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v3 - - name: Set up Node.js LTS - uses: actions/setup-node@v3 - with: - node-version: lts/* + - name: Set up Node.js LTS + uses: actions/setup-node@v3 + with: + node-version: lts/* - - name: Install AppCenter CLI - run: npm install -g appcenter-cli + - name: Install AppCenter CLI + run: npm install -g appcenter-cli - - name: set up JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'temurin' + - name: set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'temurin' - - name: Copy test_resources - run: ./copy_test_resources.sh + - name: Copy test_resources + run: ./copy_test_resources.sh - - name: Inject AccessKey - run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties + - name: Inject AccessKey + run: echo pvTestingAccessKey="${{secrets.PV_VALID_ACCESS_KEY}}" >> local.properties - - name: Inject Number of Iterations - run: echo numTestIterations="30" >> local.properties + - name: Inject Number of Iterations + run: echo numTestIterations="30" >> local.properties - - name: Inject Android keystore variables - run: | - echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties - echo keyAlias=picovoice >> local.properties - echo storeFile=../picovoice.jks >> local.properties + - name: Inject Android keystore variables + run: | + echo storePassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyPassword="${{secrets.ANDROID_RELEASE_KEYSTORE_PASSWORD}}" >> local.properties + echo keyAlias=picovoice >> local.properties + echo storeFile=../picovoice.jks >> local.properties - - name: Setup Android keystore file - run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks + - name: Setup Android keystore file + run: echo "${{secrets.ANDROID_RELEASE_KEYSTORE_FILE_B64}}" | base64 -d > picovoice.jks - - name: Inject Init Performance Threshold - run: echo initPerformanceThresholdSec="${{ matrix.initPerformanceThresholdSec }}" >> local.properties + - name: Inject Init Performance Threshold + run: echo initPerformanceThresholdSec="${{ matrix.initPerformanceThresholdSec }}" >> local.properties - - name: Inject Proc Performance Threshold - run: echo procPerformanceThresholdSec="${{ matrix.procPerformanceThresholdSec }}" >> local.properties + - name: Inject Proc Performance Threshold + run: echo procPerformanceThresholdSec="${{ matrix.procPerformanceThresholdSec }}" >> local.properties - - name: Grant execute permission for gradlew - run: chmod +x gradlew + - name: Grant execute permission for gradlew + run: chmod +x gradlew - - name: Build app - run: ./gradlew assembleDebug + - name: Build app + run: ./gradlew assembleDebug - - name: Build androidTest - run: ./gradlew 
assembleAndroidTest -DtestBuildType=perf + - name: Build androidTest + run: ./gradlew assembleAndroidTest -DtestBuildType=perf - - name: Run tests on AppCenter - run: appcenter test run espresso - --token ${{secrets.APPCENTERAPITOKEN}} - --app "Picovoice/Orca-Android" - --devices "Picovoice/${{ matrix.device }}" - --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk - --test-series "orca-android" - --locale "en_US" - --build-dir orca-test-app/build/outputs/apk/androidTest/debug \ No newline at end of file + - name: Run tests on AppCenter + run: appcenter test run espresso + --token ${{secrets.APPCENTERAPITOKEN}} + --app "Picovoice/Orca-Android" + --devices "Picovoice/${{ matrix.device }}" + --app-path orca-test-app/build/outputs/apk/debug/orca-test-app-debug.apk + --test-series "orca-android" + --locale "en_US" + --build-dir orca-test-app/build/outputs/apk/androidTest/debug diff --git a/.github/workflows/c-demos.yml b/.github/workflows/c-demos.yml index 119d6bc3..9b7f54b7 100644 --- a/.github/workflows/c-demos.yml +++ b/.github/workflows/c-demos.yml @@ -49,7 +49,7 @@ jobs: make_file: "MinGW Makefiles" - os: macos-latest platform: mac - arch: x86_64 + arch: undetermined make_file: "Unix Makefiles" steps: @@ -66,10 +66,12 @@ jobs: run: cmake -G "${{ matrix.make_file }}" -B ./build - name: Build demo - run: cmake --build ./build --target orca_demo + run: | + cmake --build ./build --target orca_demo + cmake --build ./build --target orca_demo_streaming - name: Test - run: python test/test_orca_c.py ${{secrets.PV_VALID_ACCESS_KEY}} ${{ matrix.platform }} ${{ matrix.arch }} + run: python3 test/test_orca_c.py ${{secrets.PV_VALID_ACCESS_KEY}} ${{ matrix.platform }} ${{ matrix.arch }} build-demo-self-hosted: runs-on: ${{ matrix.machine }} @@ -106,7 +108,9 @@ jobs: run: cmake -B ./build - name: Build demo - run: cmake --build ./build --target orca_demo + run: | + cmake --build ./build --target orca_demo + cmake --build ./build --target orca_demo_streaming - name: Test run: python3 test/test_orca_c.py ${{secrets.PV_VALID_ACCESS_KEY}} ${{ matrix.platform }} ${{ matrix.arch }} diff --git a/.github/workflows/ios-appcenter.yml b/.github/workflows/ios-appcenter.yml index c1d361c0..7a3318ee 100644 --- a/.github/workflows/ios-appcenter.yml +++ b/.github/workflows/ios-appcenter.yml @@ -73,4 +73,4 @@ jobs: --devices "Picovoice/ios-min-max" --test-series "orca-ios" --locale "en_US" - --build-dir ddp/Build/Products/Debug-iphoneos \ No newline at end of file + --build-dir ddp/Build/Products/Debug-iphoneos diff --git a/.github/workflows/ios-perf.yml b/.github/workflows/ios-perf.yml index 3f6848da..98c3df9a 100644 --- a/.github/workflows/ios-perf.yml +++ b/.github/workflows/ios-perf.yml @@ -26,10 +26,10 @@ jobs: strategy: matrix: - device: [ios-perf] + device: [ ios-perf ] include: - - device: ios-perf - performanceThresholdSec: 0.5 + - device: ios-perf + performanceThresholdSec: 0.5 steps: - name: Checkout @@ -88,4 +88,4 @@ jobs: --devices "Picovoice/${{ matrix.device }}" --test-series "orca-ios" --locale "en_US" - --build-dir ddp/Build/Products/Debug-iphoneos \ No newline at end of file + --build-dir ddp/Build/Products/Debug-iphoneos diff --git a/.github/workflows/python-demo.yml b/.github/workflows/python-demo.yml index 47630e87..0cf761a9 100644 --- a/.github/workflows/python-demo.yml +++ b/.github/workflows/python-demo.yml @@ -25,8 +25,13 @@ jobs: strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ['3.7', '3.8', '3.9', '3.10'] + os: [ ubuntu-latest, 
windows-latest, macos-latest ] + python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ] + include: + - os: ubuntu-latest + install_dep: sudo apt install libportaudio2 + - os: windows-latest + - os: macos-latest steps: - uses: actions/checkout@v3 @@ -37,14 +42,22 @@ jobs: python-version: ${{ matrix.python-version }} - name: Pre-build dependencies - run: python -m pip install --upgrade pip + run: python3 -m pip install --upgrade pip - name: Install dependencies - run: pip install -r requirements.txt + run: | + ${{matrix.install_dep}} + pip install -r requirements.txt - - name: Test + - name: Test streaming run: > - python orca_demo.py + python3 orca_demo_streaming.py + --access_key ${{secrets.PV_VALID_ACCESS_KEY}} + --text_to_stream "Hello, I am Orca!" + + - name: Test single + run: > + python3 orca_demo.py --access_key ${{secrets.PV_VALID_ACCESS_KEY}} --text "Hello, I am Orca!" --output_path ./tmp.wav @@ -54,7 +67,7 @@ jobs: strategy: matrix: - machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson] + machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson ] steps: - uses: actions/checkout@v3 @@ -62,9 +75,15 @@ jobs: - name: Install dependencies run: pip3 install -r requirements.txt - - name: Test + - name: Test streaming + run: > + python3 orca_demo_streaming.py + --access_key ${{secrets.PV_VALID_ACCESS_KEY}} + --text_to_stream "Hello, I am Orca!" + + - name: Test single run: > python3 orca_demo.py --access_key ${{secrets.PV_VALID_ACCESS_KEY}} --text "Hello, I am Orca!" - --output_path ./tmp.wav \ No newline at end of file + --output_path ./tmp.wav diff --git a/.github/workflows/python-perf.yml b/.github/workflows/python-perf.yml index 43b5f826..64c8a6cd 100644 --- a/.github/workflows/python-perf.yml +++ b/.github/workflows/python-perf.yml @@ -45,11 +45,11 @@ jobs: os: [ubuntu-latest, windows-latest, macos-latest] include: - os: ubuntu-latest - proc_performance_threshold_sec: 1.5 + proc_performance_threshold_rtf: 5.0 - os: windows-latest - proc_performance_threshold_sec: 1.5 + proc_performance_threshold_rtf: 3.0 - os: macos-latest - proc_performance_threshold_sec: 2.5 + proc_performance_threshold_rtf: 3.0 steps: - uses: actions/checkout@v3 @@ -60,7 +60,7 @@ jobs: python-version: '3.10' - name: Pre-build dependencies - run: python -m pip install --upgrade pip + run: python3 -m pip install --upgrade pip - name: Install dependencies run: pip install -r requirements.txt @@ -70,7 +70,7 @@ jobs: python3 test_orca_perf.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} --num-test-iterations 10 - --proc-performance-threshold-sec ${{matrix.proc_performance_threshold_sec}} + --proc-performance-threshold-rtf ${{matrix.proc_performance_threshold_rtf}} perf-self-hosted: runs-on: ${{ matrix.machine }} @@ -81,17 +81,17 @@ jobs: machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson] include: - machine: rpi3-32 - proc_performance_threshold_sec: 10.0 + proc_performance_threshold_rtf: 1.0 - machine: rpi3-64 - proc_performance_threshold_sec: 6.0 + proc_performance_threshold_rtf: 1.0 - machine: rpi4-32 - proc_performance_threshold_sec: 5.0 + proc_performance_threshold_rtf: 2.0 - machine: rpi4-64 - proc_performance_threshold_sec: 4.0 + proc_performance_threshold_rtf: 2.0 - machine: rpi5-64 - proc_performance_threshold_sec: 2.0 + proc_performance_threshold_rtf: 2.0 - machine: jetson - proc_performance_threshold_sec: 4.0 + proc_performance_threshold_rtf: 2.0 steps: - uses: actions/checkout@v3 @@ -108,8 +108,8 @@ jobs: python3 test_orca_perf.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} 
--num-test-iterations 10 - --proc-performance-threshold-sec ${{matrix.proc_performance_threshold_sec}} + --proc-performance-threshold-rtf ${{matrix.proc_performance_threshold_rtf}} - name: Machine state after working-directory: resources/.scripts - run: bash machine-state.sh \ No newline at end of file + run: bash machine-state.sh diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index fab6801d..d52b714b 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -37,8 +37,8 @@ jobs: strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ['3.7', '3.8', '3.9', '3.10'] + os: [ ubuntu-latest, windows-latest, macos-latest ] + python-version: [ '3.8' , '3.9', '3.10', '3.11', '3.12' ] steps: - uses: actions/checkout@v3 @@ -49,20 +49,20 @@ jobs: python-version: ${{ matrix.python-version }} - name: Pre-build dependencies - run: python -m pip install --upgrade pip + run: python3 -m pip install --upgrade pip - name: Install dependencies run: pip install -r requirements.txt - name: Test - run: python test_orca.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} + run: python3 test_orca.py --access-key ${{secrets.PV_VALID_ACCESS_KEY}} build-self-hosted: runs-on: ${{ matrix.machine }} strategy: matrix: - machine: [rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson] + machine: [ rpi3-32, rpi3-64, rpi4-32, rpi4-64, rpi5-64, jetson ] steps: - uses: actions/checkout@v3 diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..41039694 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "demo/c/dr_libs"] + path = demo/c/dr_libs + url = ../../mackron/dr_libs.git \ No newline at end of file diff --git a/README.md b/README.md index 63d3c35d..b732e60d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,8 @@ Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) [![Twitter URL](https://img.shields.io/twitter/url?label=%40AiPicovoice&style=social&url=https%3A%2F%2Ftwitter.com%2FAiPicovoice)](https://twitter.com/AiPicovoice) [![YouTube Channel Views](https://img.shields.io/youtube/channel/views/UCAdi9sTCXLosG1XeqDwLx7w?label=YouTube&style=social)](https://www.youtube.com/channel/UCAdi9sTCXLosG1XeqDwLx7w) -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency +voice assistants. Orca is: - Private; All voice processing runs locally. 
- Cross-Platform: @@ -28,25 +29,26 @@ Orca may undergo changes as we continually enhance and refine the engine to prov - [Orca](#orca) - [Table of Contents](#table-of-contents) - - [Language Support](#language-support) - [Overview](#overview) + - [Orca streaming text synthesis](#orca-input-and-output-streaming-synthesis) + - [Text input](#text-input) - [Custom pronunciations](#custom-pronunciations) - [Voices](#voices) - [Speech control](#speech-control) - [Audio output](#audio-output) + - [AccessKey](#accesskey) - [Demos](#demos) - - [Python](#python-demos) - - [iOS](#ios-demo) - - [C](#c-demos) - - [Web](#web-demos) - - [Android](#android-demo) + - [Python Demos](#python-demos) + - [iOS Demo](#ios-demo) + - [C Demos](#c-demos) + - [Web Demos](#web-demos) + - [Android Demo](#android-demo) - [SDKs](#sdks) - [Python](#python) - [iOS](#ios) - [C](#c) - [Web](#web) - [Android](#android) - - [AccessKey](#accesskey) - [Releases](#releases) - [FAQ](#faq) @@ -58,10 +60,21 @@ Orca may undergo changes as we continually enhance and refine the engine to prov ## Overview +### Orca input and output streaming synthesis + +Orca is a text-to-speech engine designed specifically for LLMs. It can process +incoming text streams in real-time, generating audio continuously, i.e., as the LLM produces tokens, +Orca generates speech in parallel. +This enables seamless conversations with voice assistants, eliminating any audio delays. + +![](https://github.com/Picovoice/orca/blob/orca-prepare-v0.2/resources/assets/orca_streaming_animation.gif) + +Orca also supports single synthesis mode, where a complete text is synthesized in a single call to the Orca engine. + ### Text input -Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, as well as -common punctuation marks. You can get a list of all supported characters by calling the +Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers, +basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the `valid_characters()` method provided in the Orca SDK you are using. Pronunciations of characters or words not supported by this list can be achieved with [custom pronunciations](#custom-pronunciations). @@ -96,6 +109,7 @@ Orca provides a set of parameters to control the synthesized speech. The followi | Parameter | Default | Description | |:-----------:|:-------:|:--------------------------------------------------------------------------------------------------------------------------:| | speech rate | 1.0 | Speed of generated speech. Valid values are within [0.7, 1.3]. Higher (lower) values generate faster (slower) speech. | +| random state| random | Sets the random state for sampling during synthesis. Valid values are all non-negative integers. If not provided, a random seed will be chosen. | ### Audio output @@ -117,12 +131,20 @@ AccessKey also verifies that your usage is within the limits of your account. 
Ev ### Python Demos -To run the Python demo, run the following in the console: +Install the demo package: ```console pip3 install pvorcademo ``` +Run the streaming demo: + +```console +orca_demo_streaming --access_key ${ACCESS_KEY} --text_to_stream ${TEXT} +``` + +Run the single synthesis demo: + ```console orca_demo --access_key ${ACCESS_KEY} --text ${TEXT} --output_path ${WAV_OUTPUT_PATH} ``` @@ -130,6 +152,8 @@ orca_demo --access_key ${ACCESS_KEY} --text ${TEXT} --output_path ${WAV_OUTPUT_P Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${TEXT}` with the text to be synthesized, and `${WAV_OUTPUT_PATH}` with a path to an output WAV file. +For more information about Python demos go to [demo/python](demo/python). + ### iOS Demo Run the following from [demo/ios](demo/ios) to install the Orca-iOS CocoaPod: @@ -147,16 +171,16 @@ For more information about iOS demos go to [demo/ios](demo/ios). ### C Demos -Build the demo: +Build the streaming demo: ```console -cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo +cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo_streaming ``` Run the demo: ```console -./demo/c/build/orca_demo -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH} +./demo/c/build/orca_demo_streaming -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH} ``` ### Web Demos @@ -196,17 +220,49 @@ Install the Python SDK: pip3 install pvorca ``` -Create an instance of the engine and generate speech: +Create an instance of the engine: ```python import pvorca orca = pvorca.create(access_key='${ACCESS_KEY}') -pcm = orca.synthesize('${TEXT}') ``` -Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${TEXT}` with -the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). +Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). + +#### Streaming synthesis + +To synthesize a text stream, create an Orca Stream object and add text to it one-by-one: + +```python +stream = orca.stream_open() + +for text_chunk in text_generator(): + pcm = stream.synthesize(text_chunk) + if pcm is not None: + # handle pcm + +pcm = stream.flush() +if pcm is not None: + # handle pcm +``` + +The `text_generator()` function can be any stream generating text, for example an LLM response. +When done with streaming text synthesis, the stream object needs to be closed: + +```python +stream.close() +``` + +#### Single synthesis + +Use single synthesis mode if the complete text is known in advance: + +```python +pcm, alignments = orca.synthesize('${TEXT}') +``` + +Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). Finally, when done make sure to explicitly release the resources: @@ -218,7 +274,7 @@ For more details see [Python SDK](./binding/python/README.md). ### iOS -Create an instance of the engine and synthesize: +Create an instance of the engine: ```swift import Orca @@ -230,18 +286,52 @@ let modelPath = Bundle(for: type(of: self)).path( do { let orca = try Orca(accessKey: "${ACCESS_KEY}", modelPath: modelPath) } catch {} +``` + +Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/) and `${MODEL_FILE}` +with the model file name for Orca. 
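+
+The PCM returned by `synthesize` is raw, single-channel, 16-bit audio. As a minimal illustrative
+sketch (this helper is not part of the Orca API, and the playback plumbing is an assumption), the
+samples can be auditioned by converting them to floats and scheduling them on an `AVAudioEngine`:
+
+```swift
+import AVFoundation
+
+// Hypothetical helper: plays mono 16-bit PCM at the given sample rate.
+// In real code, keep the engine and player alive for the duration of playback.
+func play(pcm: [Int16], sampleRate: Double) throws {
+    let engine = AVAudioEngine()
+    let player = AVAudioPlayerNode()
+    let format = AVAudioFormat(
+        commonFormat: .pcmFormatFloat32,
+        sampleRate: sampleRate,
+        channels: 1,
+        interleaved: false)!
+
+    engine.attach(player)
+    engine.connect(player, to: engine.mainMixerNode, format: format)
+    try engine.start()
+
+    let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(pcm.count))!
+    buffer.frameLength = AVAudioFrameCount(pcm.count)
+    for (i, sample) in pcm.enumerated() {
+        // Scale 16-bit integer samples to [-1.0, 1.0] floats.
+        buffer.floatChannelData![0][i] = Float(sample) / Float(Int16.max)
+    }
+
+    player.scheduleBuffer(buffer, completionHandler: nil)
+    player.play()
+}
+```
+
+The sample rate to pass comes from the engine itself; the other SDKs in this README expose it as
+`sampleRate`, and the accessor name on iOS is assumed to match.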
+ +#### Streaming synthesis + +To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one via the `synthesize` method. +Call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object: + +```swift +let orcaStream = try orca.streamOpen() + +for textChunk in textGenerator() { + let pcm = orcaStream.synthesize(textChunk) + if pcm != nil { + // handle pcm + } +} +let pcm = orcaStream.flush() +if pcm != nil { + // handle pcm +} + +orcaStream.close() +``` + +`textGenerator()` can be any stream generating text, for example an LLM response. + +#### Single synthesis + +```swift do { - let pcm = try orca.synthesize(text: "${TEXT}") + let (pcm, wordArray) = try orca.synthesize(text: "${TEXT}") } catch {} ``` -Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/), `${MODEL_FILE}` -with the model file name for Orca and `${TEXT}` with -the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). +Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations). + +#### Release resources When done be sure to explicitly release the resources using `orca.delete()`. +For more details, see the [iOS SDK](./binding/ios/). + ### C The header file [include/pv_orca.h](./include/pv_orca.h) contains relevant information on Orca's C SDK. @@ -249,9 +339,9 @@ The header file [include/pv_orca.h](./include/pv_orca.h) contains relevant infor Build an instance of the object: ```c -pv_orca_t *handle = NULL; +pv_orca_t *orca = NULL; const char *model_path = "${MODEL_PATH}"; -pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &handle); +pv_status_t status = pv_orca_init("${ACCESS_KEY}", model_path, &orca); if (status != PV_STATUS_SUCCESS) { // error handling logic } @@ -268,27 +358,112 @@ status = pv_orca_synthesize_params_init(&synthesize_params); // change the default parameters of synthesize_params as desired ``` -Now, the `handle` and `synthesize_params` object can be used to synthesize speech: +#### Streaming synthesis + +To synthesize a text stream, create an `orca_stream` object using `synthesize_params`: + +```c +pv_orca_stream_t *orca_stream = NULL; +status = pv_orca_stream_open(orca, synthesize_params, &orca_stream); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +``` + +Add text to `orca_stream` one-by-one and handle the synthesized audio: + +```c +extern char *get_next_text_chunk(void); + +int32_t num_samples_chunk = 0; +int16_t *pcm_chunk = NULL; +status = pv_orca_stream_synthesize( + orca_stream, + get_next_text_chunk(), + &num_samples_chunk, + &pcm_chunk); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +if (num_samples_chunk > 0) { + // handle pcm_chunk +} +``` + +Once the text stream is complete, call the flush method to synthesize the remaining text: + +```c +status = pv_orca_stream_flush(orca_stream, &num_samples_chunk, &pcm_chunk); +if (status != PV_STATUS_SUCCESS) { + // error handling logic +} +if (num_samples_chunk > 0) { + // handle pcm_chunk +} +``` + +Once the PCM chunks are handled, make sure to release the acquired resources for each chunk with: + +```c +pv_orca_pcm_delete(pcm_chunk); +``` + +Finally, when done make sure to close the stream: + +```c +pv_orca_stream_close(orca_stream); +``` + +#### Single synthesis + +If the text is known in advance, single synthesis mode can be used: ```c int32_t num_samples = 0; int16_t *synthesized_pcm = NULL; +int32_t 
num_alignments = 0;
+pv_orca_word_alignment_t **alignments = NULL;
 status = pv_orca_synthesize(
-        handle,
+        orca,
         "${TEXT}",
         synthesize_params,
         &num_samples,
-        &synthesized_pcm);
+        &synthesized_pcm,
+        &num_alignments,
+        &alignments);
```

Replace `${TEXT}` with the text to be synthesized including potential [custom pronunciations](#custom-pronunciations).

+Print the metadata of the synthesized audio:
+
+```c
+for (int32_t i = 0; i < num_alignments; i++) {
+    fprintf(
+            stdout,
+            "[%s]\t.start_sec = %.2f .end_sec = %.2f\n",
+            alignments[i].word,
+            alignments[i].start_sec,
+            alignments[i].end_sec);
+    for (int32_t j = 0; j < alignments[i].num_phonemes; j++) {
+        fprintf(
+                stdout,
+                "\t[%s]\t.start_sec = %.2f .end_sec = %.2f\n",
+                alignments[i].phonemes[j].phoneme,
+                alignments[i].phonemes[j].start_sec,
+                alignments[i].phonemes[j].end_sec);
+    }
+}
+```
+
Finally, when done make sure to release the acquired resources:

```c
-pv_orca_delete_pcm(pcm);
+pv_orca_word_alignments_delete(num_alignments, alignments);
+pv_orca_pcm_delete(synthesized_pcm);
pv_orca_synthesize_params_delete(synthesize_params);
-pv_orca_delete(handle);
+pv_orca_delete(orca);
```

### Web

@@ -315,12 +490,48 @@ const orca = await OrcaWorker.create(
  "${ACCESS_KEY}",
  { base64: orcaParams }
);
+```
+
+Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/).
+
+#### Streaming synthesis
+
+To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one via the `synthesize` method.
+Call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object:
+
+```typescript
+const orcaStream = await orca.streamOpen();

-const speechPcm = await orca.synthesize("${TEXT}")
+function* textStream(): IterableIterator<string> {
+  ... // yield text chunks e.g. from an LLM response
+}
+
+for (const textChunk of textStream()) {
+  const pcm = await orcaStream.synthesize(textChunk);
+  if (pcm !== null) {
+    // handle pcm
+  }
+}
+
+const flushedPcm = orcaStream.flush();
+if (flushedPcm !== null) {
+  // handle pcm
+}
+
+orcaStream.close();
```

-Replace `${ACCESS_KEY}` with yours obtained from [Picovoice Console](https://console.picovoice.ai/). Finally, when done
-release the resources using `orca.release()`.
+#### Single synthesis
+
+```typescript
+const { speechPcm, alignments } = await orca.synthesize("${TEXT}")
+```
+
+#### Release resources
+
+Finally, when done release the resources using `orca.release()`.
+
+For more details, see the [Web SDK](./binding/web/).

### Android

@@ -345,11 +556,6 @@ try {
        .setAccessKey(accessKey)
        .setModelPath(modelPath)
        .build(appContext);
-
-    short[] pcm = orca.synthesize(
-            "${TEXT}",
-            new OrcaSynthesizeParams.Builder().build());
-
} catch (OrcaException ex) { }
```

Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${MODEL_FILE}` with the default or your custom trained
Orca [voice model file](./lib/common) and `${TEXT}` with the text to be synthesized
including potential [custom pronunciations](#custom-pronunciations).

+#### Streaming synthesis
+
+To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one via the `synthesize` method.
+Call `flush` to synthesize any remaining text, and `close` to delete the `OrcaStream` object: + +```java +Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build()); + +for (String textChunk : textGenerator()) { + short[] pcm = orcaStream.synthesize(textChunk); + if (pcm != null) { + // handle pcm + } +} + +short[] flushedPcm = orcaStream.flush(); +if (flushedPcm != null) { + // handle pcm +} +``` + +#### Single synthesis + +```java +OrcaAudio audio = orca.synthesize( + "${TEXT}", + new OrcaSynthesizeParams.Builder().build()); +``` + Finally, when done make sure to explicitly release the resources: ```java @@ -367,6 +602,14 @@ For more details, see the [Android SDK](./binding/android/README.md). ## Releases +### v0.2.0 - May 3rd, 2024 + +- Support for streaming synthesis +- Reduced model size +- Improved performance +- Support for word alignments +- Improved naturalness and pronunciations + ### v0.1.0 - January 24th, 2024 - Beta release diff --git a/binding/android/Orca/orca/build.gradle b/binding/android/Orca/orca/build.gradle index 150f441a..de554471 100644 --- a/binding/android/Orca/orca/build.gradle +++ b/binding/android/Orca/orca/build.gradle @@ -2,7 +2,7 @@ apply plugin: 'com.android.library' ext { PUBLISH_GROUP_ID = 'ai.picovoice' - PUBLISH_VERSION = '0.1.0' + PUBLISH_VERSION = '0.2.0' PUBLISH_ARTIFACT_ID = 'orca-android' } @@ -14,8 +14,8 @@ android { defaultConfig { minSdkVersion 21 targetSdkVersion defaultTargetSdkVersion - versionCode 1 - versionName "0.1" + versionCode 2 + versionName "0.2" consumerProguardFiles "consumer-rules.pro" } diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java index ba3c687e..2493f1e1 100644 --- a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/Orca.java @@ -35,6 +35,91 @@ public class Orca { private long handle; + private int maxCharacterLimit; + private int sampleRate; + private String[] validCharacters; + + /** + * OrcaStream object that converts a stream of text to a stream of audio. + */ + public class OrcaStream { + private long stream; + + public OrcaStream(long stream) { + this.stream = stream; + } + + /** + * Adds a chunk of text to the Stream object and generates audio if enough text has been added. + * This function is expected to be called multiple times with consecutive chunks of text from a text stream. + * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + * buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + * for the remaining text that has not yet been synthesized. + * + * @param text A chunk of text from a text input stream, comprised of valid characters. + * Valid characters can be retrieved by calling `.getValidCharacters()`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * They need to be added in a single call to this function. + * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + * @throws OrcaException if there is an error while synthesizing audio. 
+ */ + public short[] synthesize(String text) throws OrcaException { + if (handle == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream synthesize after delete." + ); + } + + if (stream == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream synthesize without an open stream." + ); + } + + short[] pcm = OrcaNative.streamSynthesize(stream, text); + + return pcm.length == 0 ? null : pcm; + } + + /** + * Generates audio for all the buffered text that was added to the OrcaStream object + * via `OrcaStream.synthesize()`. + * + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + * @throws OrcaException if there is an error while flushing audio. + */ + public short[] flush() throws OrcaException { + if (handle == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream flush after delete." + ); + } + + if (stream == 0) { + throw new OrcaInvalidStateException( + "Attempted to call OrcaStream flush without an open stream." + ); + } + + short[] pcm = OrcaNative.streamFlush(stream); + + return pcm.length == 0 ? null : pcm; + } + + /** + * Releases the resources acquired by the OrcaStream object. + */ + public void close() { + if (handle != 0 && stream != 0) { + OrcaNative.streamClose(stream); + stream = 0; + } + } + } + /** * Constructor. * @@ -47,6 +132,9 @@ private Orca(String accessKey, String modelPath) throws OrcaException { handle = OrcaNative.init( accessKey, modelPath); + maxCharacterLimit = OrcaNative.getMaxCharacterLimit(handle); + sampleRate = OrcaNative.getSampleRate(handle); + validCharacters = OrcaNative.getValidCharacters(handle); } public static void setSdk(String sdk) { @@ -93,10 +181,11 @@ public void delete() { * syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format, * e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. * @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details. - * @return The output audio. + * @return An object containing the generated audio as a sequence of 16-bit linearly-encoded integers + * and an array of OrcaWord objects representing the word alignments. * @throws OrcaException if there is an error while synthesizing audio. */ - public short[] synthesize(String text, OrcaSynthesizeParams params) throws OrcaException { + public OrcaAudio synthesize(String text, OrcaSynthesizeParams params) throws OrcaException { if (handle == 0) { throw new OrcaInvalidStateException( "Attempted to call Orca synthesize after delete." @@ -106,7 +195,8 @@ public short[] synthesize(String text, OrcaSynthesizeParams params) throws OrcaE return OrcaNative.synthesize( handle, text, - params.getSpeechRate()); + params.getSpeechRate(), + params.getRandomState()); } /** @@ -121,9 +211,10 @@ public short[] synthesize(String text, OrcaSynthesizeParams params) throws OrcaE * @param outputPath Absolute path to the output audio file. The output file is saved as * `WAV (.wav)` and consists of a single mono channel. * @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details. + * @return An array of OrcaWord objects representing the word alignments. * @throws OrcaException if there is an error while synthesizing audio to file. 
*/ - public void synthesizeToFile( + public OrcaWord[] synthesizeToFile( String text, String outputPath, OrcaSynthesizeParams params) throws OrcaException { @@ -133,11 +224,34 @@ public void synthesizeToFile( ); } - OrcaNative.synthesizeToFile( + OrcaAudio result = OrcaNative.synthesizeToFile( handle, text, outputPath, - params.getSpeechRate()); + params.getSpeechRate(), + params.getRandomState()); + + return result.getWordArray(); + } + + /** + * @param params Global parameters for synthesized text. See 'OrcaSynthesizeParams' for details. + * @return OrcaStream object. + * @throws OrcaException if there is an error while opening OrcaStream. + */ + public OrcaStream streamOpen(OrcaSynthesizeParams params) throws OrcaException { + if (handle == 0) { + throw new OrcaInvalidStateException( + "Attempted to call Orca streamOpen after delete." + ); + } + + long stream = OrcaNative.streamOpen( + handle, + params.getSpeechRate(), + params.getRandomState()); + + return new OrcaStream(stream); } /** @@ -155,7 +269,7 @@ public String getVersion() { * @return The maximum number of characters that can be synthesized at once. */ public int getMaxCharacterLimit() { - return OrcaNative.getMaxCharacterLimit(); + return maxCharacterLimit; } /** @@ -163,14 +277,8 @@ public int getMaxCharacterLimit() { * * @return Audio sampling rate of the audio produced by Orca. */ - public int getSampleRate() throws OrcaException { - if (handle == 0) { - throw new OrcaInvalidStateException( - "Attempted to call Orca getSampleRate after delete." - ); - } - - return OrcaNative.getSampleRate(handle); + public int getSampleRate() { + return sampleRate; } /** @@ -178,14 +286,8 @@ public int getSampleRate() throws OrcaException { * * @return Array of characters that are accepted as input to Orca synthesize functions. */ - public String[] getValidCharacters() throws OrcaException { - if (handle == 0) { - throw new OrcaInvalidStateException( - "Attempted to call Orca getValidCharacters after delete." - ); - } - - return OrcaNative.getValidCharacters(handle); + public String[] getValidCharacters() { + return validCharacters; } /** diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaAudio.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaAudio.java new file mode 100644 index 00000000..6f0d3311 --- /dev/null +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaAudio.java @@ -0,0 +1,48 @@ +/* + Copyright 2024 Picovoice Inc. + + You may not use this file except in compliance with the license. A copy of the license is + located in the "LICENSE" file accompanying this source. + + Unless required by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + express or implied. See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ai.picovoice.orca; + +public class OrcaAudio { + + private final short[] pcm; + private final OrcaWord[] wordArray; + + /** + * Constructor. + * + * @param pcm Synthesized audio. + * @param wordArray Synthesized words and their associated metadata. + */ + public OrcaAudio(short[] pcm, OrcaWord[] wordArray) { + this.pcm = pcm; + this.wordArray = wordArray; + } + + /** + * Getter for the synthesized audio. + * + * @return Synthesized audio. + */ + public short[] getPcm() { + return pcm; + } + + /** + * Getter for synthesized words and their associated metadata. 
+ * + * @return Synthesized words and their associated metadata. + */ + public OrcaWord[] getWordArray() { + return wordArray; + } +} diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java index 50307ea3..6a57cbdc 100644 --- a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaNative.java @@ -26,16 +26,31 @@ class OrcaNative { static native String[] getValidCharacters(long object) throws OrcaException; - static native int getMaxCharacterLimit(); + static native int getMaxCharacterLimit(long object) throws OrcaException; - static native short[] synthesize( + static native OrcaAudio synthesize( long object, String text, - float speechRate) throws OrcaException; + float speechRate, + long randomState) throws OrcaException; - static native void synthesizeToFile( + static native OrcaAudio synthesizeToFile( long object, String text, String outputPath, - float speechRate) throws OrcaException; + float speechRate, + long randomState) throws OrcaException; + + static native long streamOpen( + long object, + float speechRate, + long randomState) throws OrcaException; + + static native short[] streamSynthesize( + long object, + String text) throws OrcaException; + + static native short[] streamFlush(long object) throws OrcaException; + + static native void streamClose(long object); } diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaPhoneme.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaPhoneme.java new file mode 100644 index 00000000..d38b56a4 --- /dev/null +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaPhoneme.java @@ -0,0 +1,60 @@ +/* + Copyright 2024 Picovoice Inc. + + You may not use this file except in compliance with the license. A copy of the license is + located in the "LICENSE" file accompanying this source. + + Unless required by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + express or implied. See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ai.picovoice.orca; + +public class OrcaPhoneme { + + private final String phoneme; + private final float startSec; + private final float endSec; + + /** + * Constructor. + * + * @param phoneme Synthesized phoneme. + * @param startSec Start time of the phoneme in seconds. + * @param endSec End time of the phoneme in seconds. + */ + public OrcaPhoneme(String phoneme, float startSec, float endSec) { + this.phoneme = phoneme; + this.startSec = startSec; + this.endSec = endSec; + } + + /** + * Getter for the synthesized phoneme. + * + * @return Synthesized phoneme. + */ + public String getPhoneme() { + return phoneme; + } + + /** + * Getter for the start time of the phoneme in seconds. + * + * @return Start time of the phoneme in seconds. + */ + public float getStartSec() { + return startSec; + } + + /** + * Getter for the end time of the phoneme in seconds. + * + * @return End time of the phoneme in seconds. 
+ */ + public float getEndSec() { + return endSec; + } +} diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java index df787e2a..d749d7d9 100644 --- a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaSynthesizeParams.java @@ -18,12 +18,14 @@ public class OrcaSynthesizeParams { private final float speechRate; + private final long randomState; /** * Constructor. */ - private OrcaSynthesizeParams(float speechRate) { + private OrcaSynthesizeParams(float speechRate, long randomState) { this.speechRate = speechRate; + this.randomState = randomState; } /** @@ -35,12 +37,22 @@ public float getSpeechRate() { return this.speechRate; } + /** + * Getter for the random state (i.e. the random state for the synthesized speech). + * + * @return Random State. + */ + public long getRandomState() { + return this.randomState; + } + /** * Builder for creating instance of OrcaSynthesizeParams. */ public static class Builder { private float speechRate = 1.0f; + private long randomState = -1; /** * Sets the speech rate. @@ -53,6 +65,17 @@ public Builder setSpeechRate(float speechRate) { return this; } + /** + * Sets the random state. + * + * @param randomState The random state for the synthesized speech. + * @return Modified builder object. + */ + public Builder setRandomState(long randomState) { + this.randomState = randomState; + return this; + } + /** * Validates properties and creates an instance of OrcaSynthesizeParams. * @@ -66,7 +89,7 @@ public OrcaSynthesizeParams build() throws OrcaInvalidArgumentException { ); } - return new OrcaSynthesizeParams(speechRate); + return new OrcaSynthesizeParams(speechRate, randomState); } } } diff --git a/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaWord.java b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaWord.java new file mode 100644 index 00000000..cb9b9868 --- /dev/null +++ b/binding/android/Orca/orca/src/main/java/ai/picovoice/orca/OrcaWord.java @@ -0,0 +1,72 @@ +/* + Copyright 2024 Picovoice Inc. + + You may not use this file except in compliance with the license. A copy of the license is + located in the "LICENSE" file accompanying this source. + + Unless required by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + express or implied. See the License for the specific language governing permissions and + limitations under the License. +*/ + +package ai.picovoice.orca; + +public class OrcaWord { + + private final String word; + private final float startSec; + private final float endSec; + private final OrcaPhoneme[] phonemeArray; + + /** + * Constructor. + * + * @param word Synthesized word. + * @param startSec Start time of the word in seconds. + * @param endSec End time of the word in seconds. + * @param phonemeArray Synthesized phonemes and their associated metadata. + */ + public OrcaWord(String word, float startSec, float endSec, OrcaPhoneme[] phonemeArray) { + this.word = word; + this.startSec = startSec; + this.endSec = endSec; + this.phonemeArray = phonemeArray; + } + + /** + * Getter for the synthesized word. + * + * @return Synthesized word. + */ + public String getWord() { + return word; + } + + /** + * Getter for the start time of the word in seconds. 
+ * + * @return Start time of the word in seconds. + */ + public float getStartSec() { + return startSec; + } + + /** + * Getter for the end time of the word in seconds. + * + * @return End time of the word in seconds. + */ + public float getEndSec() { + return endSec; + } + + /** + * Getter for synthesized phonemes and their associated metadata. + * + * @return Synthesized phonemes and their associated metadata. + */ + public OrcaPhoneme[] getPhonemeArray() { + return phonemeArray; + } +} diff --git a/binding/android/OrcaTestApp/copy_test_resources.sh b/binding/android/OrcaTestApp/copy_test_resources.sh index 5b967362..511e0eed 100755 --- a/binding/android/OrcaTestApp/copy_test_resources.sh +++ b/binding/android/OrcaTestApp/copy_test_resources.sh @@ -10,5 +10,11 @@ cp ../../../lib/common/*.pv ./orca-test-app/src/androidTest/assets/test_resource echo "Copying test data file..." cp ../../../resources/.test/test_data.json ./orca-test-app/src/androidTest/assets/test_resources -echo "Copying test model files ..." -cp ../../../resources/.test/models/*.pv ./orca-test-app/src/androidTest/assets/test_resources/model_files +if [ ! -d "./orca-test-app/src/androidTest/assets/test_resources/wav" ] +then + echo "Creating test model files directory..." + mkdir -p ./orca-test-app/src/androidTest/assets/test_resources/wav +fi + +echo "Copying test wav files..." +cp ../../../resources/.test/wav/*.wav ./orca-test-app/src/androidTest/assets/test_resources/wav diff --git a/binding/android/OrcaTestApp/orca-test-app/build.gradle b/binding/android/OrcaTestApp/orca-test-app/build.gradle index 5225afa2..b2fa669b 100644 --- a/binding/android/OrcaTestApp/orca-test-app/build.gradle +++ b/binding/android/OrcaTestApp/orca-test-app/build.gradle @@ -106,7 +106,7 @@ dependencies { implementation 'androidx.appcompat:appcompat:1.6.1' implementation 'com.google.android.material:material:1.8.0' implementation 'androidx.constraintlayout:constraintlayout:2.1.4' - implementation 'ai.picovoice:orca-android:0.1.0' + implementation 'ai.picovoice:orca-android:0.2.0' // Espresso UI Testing androidTestImplementation 'androidx.test.ext:junit:1.1.5' diff --git a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java index 35f6c177..23bac17e 100644 --- a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java +++ b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/BaseTest.java @@ -28,13 +28,23 @@ import java.io.BufferedInputStream; import java.io.BufferedOutputStream; +import java.io.ByteArrayOutputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import static org.junit.Assert.assertEquals; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; + +import ai.picovoice.orca.OrcaWord; +import ai.picovoice.orca.OrcaPhoneme; public class BaseTest { @@ -46,7 +56,6 @@ public class BaseTest { AssetManager assetManager; String testResourcesPath; JsonObject testJson; - String leopardModelPath; String accessKey; @Before @@ -89,35 +98,64 @@ public static String[] getModelFiles() { }; } - protected static float getWordErrorRate( - String transcript, - String expectedTranscript, - boolean useCER) { - String splitter = (useCER) ? 
"" : " "; - return (float) levenshteinDistance( - transcript.split(splitter), - expectedTranscript.split(splitter)) / transcript.length(); + protected static boolean compareArrays(short[] arr1, short[] arr2, int step) { + for (int i = 0; i < arr1.length - step; i += step) { + if (!(Math.abs(arr1[i] - arr2[i]) <= 500)) { + return false; + } + } + return true; } - private static int levenshteinDistance(String[] words1, String[] words2) { - int[][] res = new int[words1.length + 1][words2.length + 1]; - for (int i = 0; i <= words1.length; i++) { - res[i][0] = i; - } - for (int j = 0; j <= words2.length; j++) { - res[0][j] = j; + protected static short[] concatArrays(short[] existingArray, short[] arrayToAdd) { + short[] result = new short[existingArray.length + arrayToAdd.length]; + + System.arraycopy(existingArray, 0, result, 0, existingArray.length); + System.arraycopy(arrayToAdd, 0, result, existingArray.length, arrayToAdd.length); + + return result; + } + + protected static short[] readAudioFile(String audioFile) throws Exception { + FileInputStream audioInputStream = new FileInputStream(audioFile); + ByteArrayOutputStream audioByteBuffer = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; + for (int length; (length = audioInputStream.read(buffer)) != -1; ) { + audioByteBuffer.write(buffer, 0, length); } - for (int i = 1; i <= words1.length; i++) { - for (int j = 1; j <= words2.length; j++) { - res[i][j] = Math.min( - Math.min( - res[i - 1][j] + 1, - res[i][j - 1] + 1), - res[i - 1][j - 1] + (words1[i - 1].equalsIgnoreCase(words2[j - 1]) ? 0 : 1) - ); + byte[] rawData = audioByteBuffer.toByteArray(); + + short[] pcm = new short[rawData.length / 2]; + ByteBuffer pcmBuff = ByteBuffer.wrap(rawData).order(ByteOrder.LITTLE_ENDIAN); + pcmBuff.asShortBuffer().get(pcm); + pcm = Arrays.copyOfRange(pcm, 22, pcm.length); + + return pcm; + } + + protected void validateMetadata( + OrcaWord[] words, + OrcaWord[] expectedWords, + boolean isExpectExact + ) { + assertEquals(words.length, expectedWords.length); + for (int i = 0; i < words.length; i++) { + assertEquals(words[i].getWord(), expectedWords[i].getWord()); + if (isExpectExact) { + assertEquals(words[i].getStartSec(), expectedWords[i].getStartSec(), 0.1); + assertEquals(words[i].getEndSec(), expectedWords[i].getEndSec(), 0.1); + } + OrcaPhoneme[] phonemes = words[i].getPhonemeArray(); + OrcaPhoneme[] expectedPhonemes = expectedWords[i].getPhonemeArray(); + assertEquals(phonemes.length, expectedPhonemes.length); + for (int j = 0; j < phonemes.length; j++) { + assertEquals(phonemes[j].getPhoneme(), expectedPhonemes[j].getPhoneme()); + if (isExpectExact) { + assertEquals(phonemes[j].getStartSec(), expectedPhonemes[j].getStartSec(), 0.1); + assertEquals(phonemes[j].getEndSec(), expectedPhonemes[j].getEndSec(), 0.1); + } } } - return res[words1.length][words2.length]; } private void extractAssetsRecursively(String path) throws IOException { diff --git a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java index 45ab09c5..0f9bda25 100644 --- a/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java +++ b/binding/android/OrcaTestApp/orca-test-app/src/androidTest/java/ai/picovoice/orca/testapp/OrcaTest.java @@ -12,12 +12,14 @@ package ai.picovoice.orca.testapp; +import static org.junit.Assert.assertArrayEquals; import static 
org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;

+import com.google.gson.JsonArray;
 import com.google.gson.JsonObject;

 import org.junit.After;
@@ -28,19 +30,20 @@ import org.junit.runners.Parameterized;

 import java.io.File;
+
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
+import java.util.Objects;

-import ai.picovoice.leopard.Leopard;
-
-import ai.picovoice.leopard.LeopardTranscript;
 import ai.picovoice.orca.Orca;
+import ai.picovoice.orca.OrcaAudio;
 import ai.picovoice.orca.OrcaException;
 import ai.picovoice.orca.OrcaInvalidArgumentException;
 import ai.picovoice.orca.OrcaSynthesizeParams;
-
+import ai.picovoice.orca.OrcaWord;
+import ai.picovoice.orca.OrcaPhoneme;

 @RunWith(Enclosed.class)
 public class OrcaTest {
@@ -101,9 +104,14 @@ public static Collection<Object[]> initParameters() {
         String text;
         String textNoPunctuation;
         String textCustomPronunciation;
+        String textAlignment;
+        static JsonArray textInvalid;

-        float werThreshold;
-        String leopardModelPath;
+        long randomState;
+        static JsonArray alignments;
+
+        String modelFileUsed;
+        String EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER = "female";

         Orca orca;

@@ -115,10 +123,13 @@ public void Setup() throws Exception {
             text = testSentences.get("text").getAsString();
             textNoPunctuation = testSentences.get("text_no_punctuation").getAsString();
             textCustomPronunciation = testSentences.get("text_custom_pronunciation").getAsString();
-            werThreshold = testJson.get("wer_threshold").getAsFloat();
-            leopardModelPath = new File(
-                    testResourcesPath,
-                    "model_files/leopard_params.pv").getAbsolutePath();
+            textAlignment = testSentences.get("text_alignment").getAsString();
+            textInvalid = testSentences.get("text_invalid").getAsJsonArray();
+
+            randomState = testJson.get("random_state").getAsLong();
+            alignments = testJson.getAsJsonArray("alignments");
+
+            modelFileUsed = modelFile.contains("female") ? "female" : "male";

             orca = new Orca.Builder()
                     .setAccessKey(accessKey)
@@ -133,11 +144,6 @@ public void TearDown() {
             }
         }

-        @Test
-        public void testMaxCharacterLimit() {
-            assertTrue(orca.getMaxCharacterLimit() > 0);
-        }
-
         @Test
         public void testVersion() {
             final String version = orca.getVersion();
@@ -146,97 +152,244 @@
         }

         @Test
-        public void testSampleRate() throws OrcaException {
+        public void testSampleRate() {
             assertTrue(orca.getSampleRate() > 0);
         }

         @Test
-        public void testValidCharacters() throws OrcaException {
+        public void testMaxCharacterLimit() {
+            assertTrue(orca.getMaxCharacterLimit() > 0);
+        }
+
+        @Test
+        public void testValidCharacters() {
             String[] characters = orca.getValidCharacters();
             assertTrue(characters.length > 0);
             assertTrue(Arrays.asList(characters).contains(","));
         }

         @Test
-        public void testSynthesize() throws Exception {
-            Leopard leopard = new Leopard.Builder()
-                    .setAccessKey(accessKey)
-                    .setModelPath(leopardModelPath)
-                    .build(appContext);
+        public void testStreaming() throws Exception {
+            Orca.OrcaStream orcaStream = orca.streamOpen(
+                    new OrcaSynthesizeParams.Builder()
+                            .setRandomState(randomState)
+                            .build());
+
+            short[] fullPcm = new short[0];
+            for (char c : text.toCharArray()) {
+                short[] pcm = orcaStream.synthesize(String.valueOf(c));
+                if (pcm != null && pcm.length > 0) {
+                    fullPcm = concatArrays(fullPcm, pcm);
+                }
+            }
+
+            short[] flushedPcm = orcaStream.flush();
+            if (flushedPcm != null && flushedPcm.length > 0) {
+                fullPcm = concatArrays(fullPcm, flushedPcm);
+            }
+
+            orcaStream.close();
+            short[] testFilePcm = readAudioFile(String.format(
+                    "%s/wav/orca_params_%s_stream.wav", testResourcesPath, modelFileUsed));

-            final short[] pcm = orca.synthesize(
+            assertTrue(compareArrays(fullPcm, testFilePcm, 1));
+        }
+
+        @Test
+        public void testSynthesize() throws Exception {
+            final OrcaAudio pcm = orca.synthesize(
                     text,
-                    new OrcaSynthesizeParams.Builder().build());
+                    new OrcaSynthesizeParams.Builder()
+                            .setRandomState(randomState)
+                            .build());

-            LeopardTranscript leopardTranscript = leopard.process(pcm);
-            leopard.delete();
-            final float wer = getWordErrorRate(
-                    leopardTranscript.getTranscriptString(),
-                    textNoPunctuation,
-                    false);
-            assertTrue(wer < werThreshold);
+            short[] testFilePcm = readAudioFile(String.format(
+                    "%s/wav/orca_params_%s_single.wav", testResourcesPath, modelFileUsed));
+
+            assertTrue(compareArrays(pcm.getPcm(), testFilePcm, 1));
         }

         @Test
         public void testSynthesizeToFile() throws Exception {
-            Leopard leopard = new Leopard.Builder()
-                    .setAccessKey(accessKey)
-                    .setModelPath(leopardModelPath)
-                    .build(appContext);
-
             final File outputFile = new File(
                     appContext.getFilesDir(),
                     "text.wav");
             orca.synthesizeToFile(
                     text,
                     outputFile.getAbsolutePath(),
-                    new OrcaSynthesizeParams.Builder().build());
+                    new OrcaSynthesizeParams.Builder()
+                            .setRandomState(randomState)
+                            .build());
+
+            short[] outputFilePcm = readAudioFile(outputFile.getAbsolutePath());
+            short[] testFilePcm = readAudioFile(String.format(
+                    "%s/wav/orca_params_%s_single.wav", testResourcesPath, modelFileUsed));

-            LeopardTranscript leopardTranscript = leopard.processFile(outputFile.getAbsolutePath());
+            assertTrue(compareArrays(outputFilePcm, testFilePcm, 1));
             outputFile.delete();
-            leopard.delete();
-            final float wer = getWordErrorRate(
-                    leopardTranscript.getTranscriptString(),
+        }
+
+        @Test
+        public void testSynthesizeNoPronunciation() throws OrcaException {
+            final OrcaAudio result = orca.synthesize(
                     textNoPunctuation,
-                    false);
-            assertTrue(wer < werThreshold);
+                    new
OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + assertTrue(result.getPcm().length > 0); } @Test public void testSynthesizeCustomPronunciation() throws OrcaException { - final short[] pcm = orca.synthesize( + final OrcaAudio result = orca.synthesize( textCustomPronunciation, - new OrcaSynthesizeParams.Builder().build()); - assertTrue(pcm.length > 0); + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + assertTrue(result.getPcm().length > 0); + } + + @Test + public void testSynthesizeAlignment() throws OrcaException { + final OrcaAudio result = orca.synthesize( + textAlignment, + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + final OrcaWord[] synthesizeTestData = new OrcaWord[alignments.size()]; + for (int i = 0; i < alignments.size(); i++) { + final JsonObject testData = alignments.get(i).getAsJsonObject(); + final String word = testData.get("word").getAsString(); + final float startSec = testData.get("start_sec").getAsFloat(); + final float endSec = testData.get("end_sec").getAsFloat(); + final JsonArray phonemesJson = testData.getAsJsonArray("phonemes"); + final OrcaPhoneme[] phonemes = new OrcaPhoneme[phonemesJson.size()]; + for (int j = 0; j < phonemesJson.size(); j++) { + final JsonObject phonemeJson = phonemesJson.get(j).getAsJsonObject(); + phonemes[j] = new OrcaPhoneme( + phonemeJson.get("phoneme").getAsString(), + phonemeJson.get("start_sec").getAsFloat(), + phonemeJson.get("end_sec").getAsFloat()); + } + synthesizeTestData[i] = new OrcaWord( + word, + startSec, + endSec, + phonemes); + } + validateMetadata( + result.getWordArray(), + synthesizeTestData, + Objects.equals(modelFileUsed, EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER)); + } + + @Test + public void testSynthesizeToFileAlignment() throws OrcaException { + final File outputFile = new File( + appContext.getFilesDir(), + "text.wav"); + OrcaWord[] result = orca.synthesizeToFile( + textAlignment, + outputFile.getAbsolutePath(), + new OrcaSynthesizeParams.Builder() + .setRandomState(randomState) + .build()); + outputFile.delete(); + + final OrcaWord[] synthesizeTestData = new OrcaWord[alignments.size()]; + for (int i = 0; i < alignments.size(); i++) { + final JsonObject testData = alignments.get(i).getAsJsonObject(); + final String word = testData.get("word").getAsString(); + final float startSec = testData.get("start_sec").getAsFloat(); + final float endSec = testData.get("end_sec").getAsFloat(); + final JsonArray phonemesJson = testData.getAsJsonArray("phonemes"); + final OrcaPhoneme[] phonemes = new OrcaPhoneme[phonemesJson.size()]; + for (int j = 0; j < phonemesJson.size(); j++) { + final JsonObject phonemeJson = phonemesJson.get(j).getAsJsonObject(); + phonemes[j] = new OrcaPhoneme( + phonemeJson.get("phoneme").getAsString(), + phonemeJson.get("start_sec").getAsFloat(), + phonemeJson.get("end_sec").getAsFloat()); + } + synthesizeTestData[i] = new OrcaWord( + word, + startSec, + endSec, + phonemes); + } + validateMetadata( + result, + synthesizeTestData, + Objects.equals(modelFileUsed, EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER)); } @Test public void testSynthesizeSpeechRate() throws OrcaException { - final short[] pcmSlow = orca.synthesize( + final OrcaAudio slow = orca.synthesize( textCustomPronunciation, new OrcaSynthesizeParams.Builder() .setSpeechRate(0.7f) + .setRandomState(randomState) .build()); - assertTrue(pcmSlow.length > 0); + assertTrue(slow.getPcm().length > 0); - final short[] pcmFast = orca.synthesize( + final OrcaAudio fast = 
orca.synthesize(
                     textCustomPronunciation,
                     new OrcaSynthesizeParams.Builder()
                             .setSpeechRate(1.3f)
+                            .setRandomState(randomState)
                             .build());
-            assertTrue(pcmFast.length > 0);
-            assertTrue(pcmFast.length < pcmSlow.length);
+            assertTrue(fast.getPcm().length > 0);
+            assertTrue(fast.getPcm().length < slow.getPcm().length);

             try {
                 orca.synthesize(
                         textCustomPronunciation,
                         new OrcaSynthesizeParams.Builder()
                                 .setSpeechRate(9999f)
+                                .setRandomState(randomState)
                                 .build());
                 fail();
             } catch (OrcaInvalidArgumentException e) {
                 assertNotNull(e);
             }
         }
+
+        @Test
+        public void testSynthesizeRandomState() throws OrcaException {
+            final OrcaAudio randomState1 = orca.synthesize(
+                    text,
+                    new OrcaSynthesizeParams.Builder()
+                            .setRandomState(1)
+                            .build());
+            assertTrue(randomState1.getPcm().length > 0);
+            assertTrue(randomState1.getWordArray().length > 0);
+
+            final OrcaAudio randomState2 = orca.synthesize(
+                    text,
+                    new OrcaSynthesizeParams.Builder()
+                            .setRandomState(2)
+                            .build());
+            assertTrue(randomState2.getPcm().length > 0);
+            assertTrue(randomState2.getWordArray().length > 0);
+
+            assertTrue(!Arrays.equals(randomState1.getPcm(), randomState2.getPcm()));
+            assertNotEquals(randomState1.getWordArray(), randomState2.getWordArray());
+
+            final OrcaAudio randomStateNull = orca.synthesize(
+                    text,
+                    new OrcaSynthesizeParams.Builder()
+                            .build());
+            assertTrue(randomStateNull.getPcm().length > 0);
+            assertTrue(randomStateNull.getWordArray().length > 0);
+
+            final OrcaAudio randomStateMaxValue = orca.synthesize(
+                    text,
+                    new OrcaSynthesizeParams.Builder()
+                            .setRandomState(Long.MAX_VALUE)
+                            .build());
+            assertTrue(randomStateMaxValue.getPcm().length > 0);
+            assertTrue(randomStateMaxValue.getWordArray().length > 0);
+        }
     }
 }
diff --git a/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java b/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java
index 25ae1c27..d9349f5f 100644
--- a/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java
+++ b/binding/android/OrcaTestApp/orca-test-app/src/main/java/ai/picovoice/orca/testapp/MainActivity.java
@@ -27,6 +27,7 @@ import java.util.HashMap;

 import ai.picovoice.orca.Orca;
+import ai.picovoice.orca.OrcaAudio;
 import ai.picovoice.orca.OrcaException;
 import ai.picovoice.orca.OrcaSynthesizeParams;

@@ -102,8 +103,8 @@ public void runTest() {
         result = new TestResult();
         result.testName = "Test Synthesize";
         try {
-            short[] pcm = orca.synthesize("Hello", new OrcaSynthesizeParams.Builder().build());
-            if (pcm.length > 0) {
+            OrcaAudio orcaAudio = orca.synthesize("Hello", new OrcaSynthesizeParams.Builder().build());
+            if (orcaAudio.getPcm().length > 0 && orcaAudio.getWordArray() != null) {
                 result.success = true;
             } else {
                 result.success = false;
@@ -138,6 +139,30 @@ public void runTest() {
             results.add(result);
         }

+        result = new TestResult();
+        result.testName = "Test Streaming";
+        try {
+            Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());
+            short[] pcm = orcaStream.synthesize("Hello");
+            short[] flushedPcm = orcaStream.flush();
+            orcaStream.close();
+
+            short[] pcm1 = pcm == null ? new short[0] : pcm;
+            short[] pcm2 = flushedPcm == null ?
new short[0] : flushedPcm;
+            if (pcm1.length + pcm2.length > 0) {
+                result.success = true;
+            } else {
+                result.success = false;
+                result.errorMessage = "Stream returned invalid result.";
+            }
+        } catch (Exception e) {
+            result.success = false;
+            result.errorMessage = String.format("Failed to stream with '%s'", e);
+        } finally {
+            results.add(result);
+        }
+
         result = new TestResult();
         result.testName = "Test Exception";
         try {
diff --git a/binding/android/README.md b/binding/android/README.md
index 2b7b1690..48ceeb51 100644
--- a/binding/android/README.md
+++ b/binding/android/README.md
@@ -1,10 +1,11 @@
 # Orca Binding for Android

-## Orca Text-to-Speech Engine
+## Orca Streaming Text-to-Speech Engine

 Made in Vancouver, Canada by [Picovoice](https://picovoice.ai)

-Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is:
+Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency
+voice assistants. Orca is:

 - Private; All voice processing runs locally.
 - Cross-Platform:
@@ -19,7 +20,8 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp

 ## Installation

-Orca can be found on Maven Central. To include the package in your Android project, ensure you have included `mavenCentral()` in your top-level `build.gradle` file and then add the following to your app's `build.gradle`:
+Orca can be found on Maven Central. To include the package in your Android project, ensure you have
+included `mavenCentral()` in your top-level `build.gradle` file and then add the following to your app's `build.gradle`:

 ```groovy
 dependencies {
@@ -37,13 +39,20 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you

 ## Permissions

 To enable AccessKey validation, you must add the following line to your `AndroidManifest.xml` file:
+
 ```xml
-    <uses-permission android:name="android.permission.INTERNET" />
+<uses-permission android:name="android.permission.INTERNET" />
+
 ```

 ## Usage

-Create an instance of the engine with the Orca Builder class by passing in the accessKey, modelPath and Android app context:
+Orca supports two modes of operation: streaming and single synthesis. In the streaming synthesis mode, Orca processes an
+incoming text stream in real-time and generates audio in parallel. In the single synthesis mode, a complete text is
+synthesized in a single call to the Orca engine.
+
+Create an instance of the engine with the Orca Builder class by passing in the accessKey, modelPath and Android app
+context:

 ```java
 import ai.picovoice.orca.*;
@@ -58,24 +67,72 @@ try {
 } catch (OrcaException ex) { }
 ```

-You can synthesize speech by calling one of the available `synthesize` methods:
+To synthesize a text stream, create an `OrcaStream` object and add text to it one-by-one:
+
+```java
+Orca.OrcaStream orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build());
+
+for (String textChunk : textGenerator()) {
+    short[] pcm = orcaStream.synthesize(textChunk);
+    if (pcm != null) {
+        // handle pcm
+    }
+}
+
+short[] flushedPcm = orcaStream.flush();
+if (flushedPcm != null) {
+    // handle pcm
+}
+```
+
+The `textGenerator()` function can be any stream generating text, for example, an LLM response.
+Orca produces audio chunks in parallel to the incoming text stream, and returns the raw PCM whenever enough context has
+been added via `orcaStream.synthesize()`.
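+
+As one way to consume these chunks — a minimal sketch, not part of the Orca API — the PCM can be written to Android's
+`AudioTrack` configured with Orca's sample rate. The snippet assumes the `orca` and `orcaStream` instances from above
+and the same `textGenerator()` placeholder for the incoming text stream:
+
+```java
+import android.media.AudioAttributes;
+import android.media.AudioFormat;
+import android.media.AudioTrack;
+
+// Build a streaming audio sink that matches Orca's output format:
+// single-channel, 16-bit linear PCM at orca.getSampleRate().
+AudioTrack audioTrack = new AudioTrack.Builder()
+        .setAudioAttributes(new AudioAttributes.Builder()
+                .setUsage(AudioAttributes.USAGE_MEDIA)
+                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+                .build())
+        .setAudioFormat(new AudioFormat.Builder()
+                .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
+                .setSampleRate(orca.getSampleRate())
+                .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
+                .build())
+        .setTransferMode(AudioTrack.MODE_STREAM)
+        .build();
+audioTrack.play();
+
+for (String textChunk : textGenerator()) {
+    short[] pcm = orcaStream.synthesize(textChunk);
+    if (pcm != null) {
+        // Blocking write of the 16-bit PCM chunk to the audio sink.
+        audioTrack.write(pcm, 0, pcm.length);
+    }
+}
+```
+
+When playback is finished, `audioTrack.stop()` and `audioTrack.release()` should be called to free the audio sink.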
+To ensure smooth transitions between chunks, the `orcaStream.synthesize()` function returns an audio chunk that only
+includes the audio for a portion of the text that has been added.
+To generate the audio for the remaining text, `orcaStream.flush()` needs to be invoked.
+When done with streaming text synthesis, the `OrcaStream` object needs to be closed:
+
+```java
+orcaStream.close();
+```
+
+If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to
+Orca:

 ```java
 OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder().build();

-// Return raw PCM
-short[] pcm = orca.synthesize("${TEXT}", params);
+// Return raw PCM and alignments
+OrcaAudio audio = orca.synthesize("${TEXT}", params);

 // Save the generated audio to a WAV file directly
-orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
+OrcaWord[] orcaWords = orca.synthesizeToFile("${TEXT}", "${OUTPUT_PATH}", params);
+```
+
+Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a
+single-channel 16-bit PCM WAV file.
+In single synthesis mode, Orca returns metadata of the synthesized audio in the form of an array of `OrcaWord`
+objects.
+
+When done, make sure to explicitly release the resources using:
+
+```java
+orca.delete();
 ```

-Replace `${TEXT}` with the text to be synthesized (must be fewer characters than `.getMaxCharacterLimit()`). When using `synthesize`, the generated pcm has a sample rate equal to the one returned by `getSampleRate()`. When using `synthesizeToFile`, replace `${OUTPUT_PATH}` with the path to save the generated audio as a single-channel 16-bit PCM WAV file. When done make sure to explicitly release the resources with `orca.delete()`.
+### Text input

-### Text Input
+Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, as well as numbers,
+basic symbols, and common punctuation marks. You can get a list of all supported characters by calling the
+`getValidCharacters()` method provided in the Orca SDK you are using.
+Pronunciations of characters or words not supported by this list can be achieved with
+[custom pronunciations](#custom-pronunciations).

-Orca accepts any character found in the list returned by the `getValidCharacters()` method.
-Pronunciations of characters or words not supported by this list can be achieved by embedding custom pronunciations in the text via the syntax: `{word|pronunciation}`. The pronunciation is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:
+### Custom pronunciations
+
+Orca allows you to embed custom pronunciations in the text via the syntax: `{word|pronunciation}`.\
+The pronunciation is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for example:

 - "This is a {custom|K AH S T AH M} pronunciation"
 - "{read|R IY D} this as {read|R EH D}, please."
@@ -89,23 +146,44 @@ in [lib/common](../../lib/common).

 To add the Orca model file to your Android application:

 - Download the desired voice model from the [Orca GitHub repository](../../lib/common).
-- Add the model file as a bundled resource by placing it under the assets directory of your Android project (`src/main/assets/`).
+- Add the model file as a bundled resource by placing it under the assets directory of your Android
+  project (`src/main/assets/`).

 ### Additional Synthesis Controls

-Orca allows you to control the synthesized speech via the `OrcaSynthesizeParams` class.
You can pass in additional settings by using the nested Builder class: +Orca allows you to control the synthesized speech via the `OrcaSynthesizeParams` class. You can pass in additional +settings by using the nested Builder class: ```java import ai.picovoice.orca.*; OrcaSynthesizeParams params = new OrcaSynthesizeParams.Builder() .setSpeechRate(1.2f) + .setRandomState(1) .build(); ``` - `setSpeechRate()`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher value produces speech that is faster. The default is `1.0`. +- `setRandomState()`: Sets the random state for sampling during synthesis. This can be used to ensure that the + synthesized speech is deterministic across different runs. + +### Alignment Metadata + +Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode. +The `OrcaWord` object has the following properties: + +- **Word:** String representation of the word. +- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds. +- **Phonemes:** An array of `OrcaPhoneme` objects. + +The `OrcaPhoneme` object has the following properties: + +- **Phoneme:** String representation of the phoneme. +- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds. ## Demos -To see Orca used in an app, refer to our [Android demo app](../../demo/android/OrcaDemo). \ No newline at end of file +To see Orca used in an app, refer to our [Android demo app](../../demo/android/OrcaDemo). diff --git a/binding/ios/Orca-iOS.podspec b/binding/ios/Orca-iOS.podspec index dcd7efaf..f7127f37 100644 --- a/binding/ios/Orca-iOS.podspec +++ b/binding/ios/Orca-iOS.podspec @@ -1,7 +1,7 @@ Pod::Spec.new do |s| s.name = 'Orca-iOS' s.module_name = 'Orca' - s.version = '0.1.0' + s.version = '0.2.0' s.license = {:type => 'Apache 2.0'} s.summary = 'iOS binding for Picovoice\'s Orca Text-to-Speech Engine.' s.description = @@ -18,7 +18,7 @@ Pod::Spec.new do |s| DESC s.homepage = 'https://github.com/Picovoice/orca/tree/main/binding/ios' s.author = { 'Picovoice' => 'hello@picovoice.ai' } - s.source = { :git => "https://github.com/Picovoice/orca.git", :tag => "Orca-iOS-v0.1.0" } + s.source = { :git => "https://github.com/Picovoice/orca.git", :tag => "Orca-iOS-v0.2.0" } s.ios.deployment_target = '13.0' s.swift_version = '5.0' s.vendored_frameworks = 'lib/ios/PvOrca.xcframework' diff --git a/binding/ios/Orca.swift b/binding/ios/Orca.swift index 1599990a..04cf5ea2 100644 --- a/binding/ios/Orca.swift +++ b/binding/ios/Orca.swift @@ -9,51 +9,193 @@ import PvOrca +public struct OrcaPhoneme { + + /// Synthesized phoneme. + public let phoneme: String + + /// Start of phoneme in seconds. + public let startSec: Float + + /// End of phoneme in seconds. + public let endSec: Float + + /// Constructor. + /// + /// - Parameters: + /// - phoneme: Synthesized phoneme. + /// - startSec: Start of phoneme in seconds. + /// - endSec: End of phoneme in seconds. + public init( + phoneme: String, + startSec: Float, + endSec: Float) { + self.phoneme = phoneme + self.startSec = startSec + self.endSec = endSec + } +} + +public struct OrcaWord { + + /// Synthesized word. + public let word: String + + /// Start of word in seconds. + public let startSec: Float + + /// End of word in seconds. 
+ public let endSec: Float + + /// Array of phonemes. + public let phonemeArray: [OrcaPhoneme] + + /// Constructor. + /// + /// - Parameters: + /// - word: Synthesized word. + /// - startSec: Start of word in seconds. + /// - endSec: End of word in seconds. + /// - phonemeArray: Array of phonemes. + public init( + word: String, + startSec: Float, + endSec: Float, + phonemeArray: [OrcaPhoneme]) { + self.word = word + self.startSec = startSec + self.endSec = endSec + self.phonemeArray = phonemeArray + } +} + /// iOS (Swift) binding for Orca Text-to-Speech engine. Provides a Swift interface to the Orca library. public class Orca { private var handle: OpaquePointer? + + private var stream: OpaquePointer? /// Orca valid symbols private var _validCharacters: Set? /// Orca sample rate private var _sampleRate: Int32? /// Maximum number of characters allowed in a single synthesis request. - public static let maxCharacterLimit = Int32(pv_orca_max_character_limit()) + private var _maxCharacterLimit: Int32? /// Orca version string public static let version = String(cString: pv_orca_version()) private static var sdk = "ios" + /// OrcaStream object that converts a stream of text to a stream of audio. + public class OrcaStream { + + private var orca: Orca + + private var stream: OpaquePointer? + + /// Adds a chunk of text to the OrcaStream object and generates audio if enough text has been added. + /// This function is expected to be called multiple times with consecutive chunks of text from a text stream. + /// The incoming text is buffered as it arrives until the length is long enough to convert a chunk of the + /// buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + /// for the remaining text that has not yet been synthesized. + /// + /// - Parameters: + /// - text: A chunk of text from a text input stream, comprised of valid characters. + /// Valid characters can be retrieved by calling `.validCharacters`. + /// Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + /// They need to be added in a single call to this function. + /// The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". + /// - Returns: The generated audio as a sequence of 16-bit linearly-encoded integers, `nil` if no + /// audio chunk has been produced. + /// - Throws: OrcaError + public func synthesize(text: String) throws -> [Int16]? { + if stream == nil { + throw OrcaInvalidStateError("Unable to synthesize - stream not open") + } + + var cNumSamples: Int32 = 0 + var cPcm: UnsafeMutablePointer? + + let status = pv_orca_stream_synthesize( + stream, + text, + &cNumSamples, + &cPcm) + if status != PV_STATUS_SUCCESS { + let messageStack = try orca.getMessageStack() + throw orca.pvStatusToOrcaError(status, "Unable to synthesize streaming speech", messageStack) + } + + let buffer = UnsafeBufferPointer(start: cPcm, count: Int(cNumSamples)) + let pcm = Array(buffer) + + pv_orca_pcm_delete(cPcm) + + return pcm.isEmpty ? nil : pcm + } + + /// Generates audio for all the buffered text that was added to the OrcaStream object + /// via `OrcaStream.synthesize()`. + /// + /// - Returns: The generated audio as a sequence of 16-bit linearly-encoded integers, `nil` if no + /// audio chunk has been produced. + /// - Throws: OrcaError + public func flush() throws -> [Int16]? 
{ + if stream == nil { + throw OrcaInvalidStateError("Unable to flush - stream not open") + } + + var cNumSamples: Int32 = 0 + var cPcm: UnsafeMutablePointer? + + let status = pv_orca_stream_flush( + stream, + &cNumSamples, + &cPcm) + if status != PV_STATUS_SUCCESS { + let messageStack = try orca.getMessageStack() + throw orca.pvStatusToOrcaError(status, "Unable to flush streaming speech", messageStack) + } + + let buffer = UnsafeBufferPointer(start: cPcm, count: Int(cNumSamples)) + let pcm = Array(buffer) + + pv_orca_pcm_delete(cPcm) + + return pcm.isEmpty ? nil : pcm + } + + /// Releases the resources acquired by the OrcaStream object. + public func close() { + if stream != nil { + pv_orca_stream_close(stream) + stream = nil + } + } + + public init(orca: Orca, stream: OpaquePointer) { + self.orca = orca + self.stream = stream + } + } + public static func setSdk(sdk: String) { self.sdk = sdk } /// Set of characters supported by Orca. - public var validCharacters: Set { - get throws { - if _validCharacters == nil { - _validCharacters = try getValidCharacters() - } - return _validCharacters! - } + public var validCharacters: Set? { + return self._validCharacters } /// Audio sample rate of generated audio. - public var sampleRate: Int32 { - get throws { - if _sampleRate == nil { - var cSampleRate: Int32 = 0 - let status = pv_orca_sample_rate(handle, &cSampleRate) - if status != PV_STATUS_SUCCESS { - let messageStack = try getMessageStack() - throw pvStatusToOrcaError(status, "Orca failed to get sample rate", messageStack) - } - - _sampleRate = cSampleRate - } + public var sampleRate: Int32? { + return self._sampleRate + } - return _sampleRate! - } + /// Maximum number of characters allowed per call to `synthesize()`. + public var maxCharacterLimit: Int32? { + return self._maxCharacterLimit } /// Constructor. @@ -73,11 +215,44 @@ public class Orca { pv_set_sdk(Orca.sdk) - let status = pv_orca_init(accessKey, modelPathArg, &handle) - if status != PV_STATUS_SUCCESS { + let initStatus = pv_orca_init(accessKey, modelPathArg, &handle) + if initStatus != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() - throw pvStatusToOrcaError(status, "Orca init failed", messageStack) + throw pvStatusToOrcaError(initStatus, "Orca init failed", messageStack) } + + var cNumCharacters: Int32 = 0 + var cCharacters: UnsafeMutablePointer?>? + let validCharactersStatus = pv_orca_valid_characters(handle, &cNumCharacters, &cCharacters) + if validCharactersStatus != PV_STATUS_SUCCESS { + let messageStack = try getMessageStack() + throw pvStatusToOrcaError(validCharactersStatus, "Unable to get Orca valid characters", messageStack) + } + var validCharacters: Set = [] + for i in 0.. [Int16] { + public func synthesize( + text: String, + speechRate: Double? = nil, + randomState: Int64? = nil + ) throws -> (pcm: [Int16], wordArray: [OrcaWord]) { if handle == nil { throw OrcaInvalidStateError("Unable to synthesize - resources have been released") } - if text.count > Orca.maxCharacterLimit { + if text.count > self._maxCharacterLimit! { throw OrcaInvalidArgumentError( - "Text length (\(text.count)) must be smaller than \(Orca.maxCharacterLimit)") + "Text length (\(text.count)) must be smaller than \(self._maxCharacterLimit!)") } - let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate) + let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate, randomState: randomState) var cNumSamples: Int32 = 0 var cPcm: UnsafeMutablePointer? 
- let status = pv_orca_synthesize(handle, text, cSynthesizeParams, &cNumSamples, &cPcm) + + var cNumAlignments: Int32 = 0 + var cAlignments: UnsafeMutablePointer?>? + + let status = pv_orca_synthesize( + handle, + text, + cSynthesizeParams, + &cNumSamples, + &cPcm, + &cNumAlignments, + &cAlignments) if status != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() throw pvStatusToOrcaError(status, "Unable to synthesize speech", messageStack) @@ -140,10 +332,40 @@ public class Orca { let buffer = UnsafeBufferPointer(start: cPcm, count: Int(cNumSamples)) let pcm = Array(buffer) - pv_orca_delete_pcm(cPcm) + var wordArray = [OrcaWord]() + if let cAlignments = cAlignments { + for alignmentIndex in 0.. [OrcaWord] { if handle == nil { throw OrcaInvalidStateError("Unable to synthesize - resources have been released") } - if text.count > Orca.maxCharacterLimit { + if text.count > self._maxCharacterLimit! { throw OrcaInvalidArgumentError( - "Text length (\(text.count)) must be smaller than \(Orca.maxCharacterLimit)") + "Text length (\(text.count)) must be smaller than \(self._maxCharacterLimit!)") } - let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate) + let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate, randomState: randomState) - let status = pv_orca_synthesize_to_file(handle, text, cSynthesizeParams, outputPath) + var cNumAlignments: Int32 = 0 + var cAlignments: UnsafeMutablePointer?>? + + let status = pv_orca_synthesize_to_file( + handle, + text, + cSynthesizeParams, + outputPath, + &cNumAlignments, + &cAlignments) if status != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() throw pvStatusToOrcaError(status, "Unable to synthesize speech to file", messageStack) } + var wordArray = [OrcaWord]() + if let cAlignments = cAlignments { + for alignmentIndex in 0.. [OrcaWord] { + try synthesizeToFile(text: text, outputPath: outputURL.path, speechRate: speechRate, randomState: randomState) } - private func getCSynthesizeParams(speechRate: Double? = nil) throws -> OpaquePointer? { + private func getCSynthesizeParams(speechRate: Double? = nil, randomState: Int64? = nil) throws -> OpaquePointer? { var cParams: OpaquePointer? var status = pv_orca_synthesize_params_init(&cParams) @@ -210,49 +487,60 @@ public class Orca { } } + if randomState != nil { + status = pv_orca_synthesize_params_set_random_state(cParams, randomState!) + if status != PV_STATUS_SUCCESS { + let messageStack = try getMessageStack() + throw pvStatusToOrcaError(status, "Unable to set Orca random state", messageStack) + } + } + return cParams } - private func getValidCharacters() throws -> Set { + /// Opens a stream for streaming text synthesis. + /// + /// - Parameters: + /// - speechRate: Rate of speech of the generated audio. Valid values are within [0.7, 1.3]. + /// - randomState: Random seed for the synthesis process. + /// - Returns: An instance of the OrcaStream class. + /// - Throws: OrcaError + public func streamOpen(speechRate: Double? = nil, randomState: Int64? = nil) throws -> OrcaStream { if handle == nil { - throw OrcaInvalidStateError("Unable to get valid characters - resources have been released") + throw OrcaInvalidStateError("Unable to synthesize - resources have been released") } - var cNumCharacters: Int32 = 0 - var cCharacters: UnsafePointer?>? 
- let status = pv_orca_valid_characters(handle, &cNumCharacters, &cCharacters) + let cSynthesizeParams = try getCSynthesizeParams(speechRate: speechRate, randomState: randomState) + + let status = pv_orca_stream_open( + handle, + cSynthesizeParams, + &stream) if status != PV_STATUS_SUCCESS { let messageStack = try getMessageStack() - throw pvStatusToOrcaError(status, "Unable to get Orca valid characters", messageStack) - } - - var characters: Set = [] - for i in 0.. String { - if let resourcePath = Bundle(for: type(of: self)).resourceURL?.appendingPathComponent(filePath).path { - if FileManager.default.fileExists(atPath: resourcePath) { - return resourcePath - } + /// + /// - Parameters: + /// - filePath: relative path of a file in the bundle. + /// - Throws: OrcaIOError + /// - Returns: The full path of the resource. + private func getResourcePath(_ filePath: String) throws -> String { + if let resourcePath = Bundle(for: type(of: self)).resourceURL?.appendingPathComponent(filePath).path { + if FileManager.default.fileExists(atPath: resourcePath) { + return resourcePath } - - throw OrcaIOError("Could not find file at path '\(filePath)'. " + - "If this is a packaged asset, ensure you have added it to your xcode project.") } + throw OrcaIOError("Could not find file at path '\(filePath)'. " + + "If this is a packaged asset, ensure you have added it to your xcode project.") + } + private func pvStatusToOrcaError( _ status: pv_status_t, _ message: String, diff --git a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj index 215b4006..045c2669 100644 --- a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj +++ b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/project.pbxproj @@ -18,10 +18,10 @@ 1EAEDDE12B745E6A003B8C18 /* BaseTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1EAEDDDF2B745E6A003B8C18 /* BaseTest.swift */; }; 1EAEDDE32B76A9DB003B8C18 /* test_resources in Resources */ = {isa = PBXBuildFile; fileRef = 1EAEDDE22B76A9DB003B8C18 /* test_resources */; }; 1EAEDDE42B76A9DB003B8C18 /* test_resources in Resources */ = {isa = PBXBuildFile; fileRef = 1EAEDDE22B76A9DB003B8C18 /* test_resources */; }; - 392C5FC2C59B4299F5FB7D3B /* libPods-OrcaAppTestUITests.a in Frameworks */ = {isa = PBXBuildFile; fileRef = B340446EF573C72BC1349E8E /* libPods-OrcaAppTestUITests.a */; }; - 50CC58C08AAD59C8922AC105 /* libPods-PerformanceTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C6572CB2CB09D2183B5C6617 /* libPods-PerformanceTest.a */; }; 6A9164E4B0B1626D27DBA0A1 /* BuildFile in Frameworks */ = {isa = PBXBuildFile; }; - 837665FCC740E76ED8323395 /* libPods-OrcaAppTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 8FFAEF3E81D2B0623D6C31C0 /* libPods-OrcaAppTest.a */; }; + 6E98C462BF64583E878F8D23 /* libPods-OrcaAppTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 687BAF06D6520BA1D344AE33 /* libPods-OrcaAppTest.a */; }; + A24BE54871C74F1054CEE31C /* libPods-PerformanceTest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = B3584F517C501F073B3A2F40 /* libPods-PerformanceTest.a */; }; + C81664FA3D1463F091F643C7 /* libPods-OrcaAppTestUITests.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 73E1CBF84AA60FD3ED122CAC /* libPods-OrcaAppTestUITests.a */; }; /* End PBXBuildFile section */ /* Begin PBXContainerItemProxy section */ @@ -42,7 +42,6 @@ /* End PBXContainerItemProxy section */ /* Begin PBXFileReference section */ - 0402F655F9B10F18A7CE7EE2 /* Pods-PerformanceTest.debug.xcconfig */ = {isa = 
PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-PerformanceTest.debug.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.debug.xcconfig"; sourceTree = ""; }; 1E00644827CEDF9B006FF6E9 /* OrcaAppTest.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = OrcaAppTest.app; sourceTree = BUILT_PRODUCTS_DIR; }; 1E00644B27CEDF9B006FF6E9 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = ""; }; 1E00644F27CEDF9B006FF6E9 /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = ""; }; @@ -58,14 +57,15 @@ 1E5B7AEF2800B2E300F8BDDB /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; 1EAEDDDF2B745E6A003B8C18 /* BaseTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BaseTest.swift; sourceTree = ""; }; 1EAEDDE22B76A9DB003B8C18 /* test_resources */ = {isa = PBXFileReference; lastKnownFileType = folder; path = test_resources; sourceTree = ""; }; - 239D8F7DFD3E66DB68DD74C7 /* Pods-OrcaAppTestUITests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.release.xcconfig"; sourceTree = ""; }; - 2C1B5037B36B1A9C62AE0816 /* Pods-OrcaAppTest.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.debug.xcconfig"; sourceTree = ""; }; - 7F000751C879143D6999BEC4 /* Pods-OrcaAppTestUITests.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.debug.xcconfig"; sourceTree = ""; }; - 8FFAEF3E81D2B0623D6C31C0 /* libPods-OrcaAppTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - B340446EF573C72BC1349E8E /* libPods-OrcaAppTestUITests.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTestUITests.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - B9DD6D6C983DB5CCF60FA7F1 /* Pods-PerformanceTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-PerformanceTest.release.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.release.xcconfig"; sourceTree = ""; }; - C6572CB2CB09D2183B5C6617 /* libPods-PerformanceTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-PerformanceTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; - FBAD61DE59260B66EB60F57A /* Pods-OrcaAppTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.release.xcconfig"; sourceTree = ""; }; + 25F6359237FFDE5259713A21 /* Pods-PerformanceTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = 
"Pods-PerformanceTest.release.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.release.xcconfig"; sourceTree = ""; }; + 687BAF06D6520BA1D344AE33 /* libPods-OrcaAppTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + 73E1CBF84AA60FD3ED122CAC /* libPods-OrcaAppTestUITests.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaAppTestUITests.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + 7682693483C5AA5FF0787ED0 /* Pods-OrcaAppTestUITests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.release.xcconfig"; sourceTree = ""; }; + B3584F517C501F073B3A2F40 /* libPods-PerformanceTest.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-PerformanceTest.a"; sourceTree = BUILT_PRODUCTS_DIR; }; + B4088BB1D8C7D5A5FF83708E /* Pods-OrcaAppTest.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.debug.xcconfig"; sourceTree = ""; }; + C155C82B6A142F4C976635FF /* Pods-OrcaAppTestUITests.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTestUITests.debug.xcconfig"; path = "Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests.debug.xcconfig"; sourceTree = ""; }; + D1F64E4744AD0659F95DE5B5 /* Pods-OrcaAppTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaAppTest.release.xcconfig"; path = "Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest.release.xcconfig"; sourceTree = ""; }; + D541FCBB3EA63023F5190039 /* Pods-PerformanceTest.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-PerformanceTest.debug.xcconfig"; path = "Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest.debug.xcconfig"; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -73,7 +73,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 837665FCC740E76ED8323395 /* libPods-OrcaAppTest.a in Frameworks */, + 6E98C462BF64583E878F8D23 /* libPods-OrcaAppTest.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -82,7 +82,7 @@ buildActionMask = 2147483647; files = ( 6A9164E4B0B1626D27DBA0A1 /* BuildFile in Frameworks */, - 392C5FC2C59B4299F5FB7D3B /* libPods-OrcaAppTestUITests.a in Frameworks */, + C81664FA3D1463F091F643C7 /* libPods-OrcaAppTestUITests.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -90,7 +90,7 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 50CC58C08AAD59C8922AC105 /* libPods-PerformanceTest.a in Frameworks */, + A24BE54871C74F1054CEE31C /* libPods-PerformanceTest.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -106,7 +106,7 @@ 1E00646B27CEDF9C006FF6E9 /* OrcaAppTestUITests */, 1E00644927CEDF9B006FF6E9 /* Products */, FA7D97C92E04F06D3273CCF3 /* Pods */, - D7256B2AE1CB33B60277231D /* Frameworks */, + F38640B51414B75BCC2F786D /* Frameworks */, ); sourceTree = ""; }; @@ -152,12 +152,12 @@ path = 
PerformanceTest; sourceTree = ""; }; - D7256B2AE1CB33B60277231D /* Frameworks */ = { + F38640B51414B75BCC2F786D /* Frameworks */ = { isa = PBXGroup; children = ( - 8FFAEF3E81D2B0623D6C31C0 /* libPods-OrcaAppTest.a */, - C6572CB2CB09D2183B5C6617 /* libPods-PerformanceTest.a */, - B340446EF573C72BC1349E8E /* libPods-OrcaAppTestUITests.a */, + 687BAF06D6520BA1D344AE33 /* libPods-OrcaAppTest.a */, + 73E1CBF84AA60FD3ED122CAC /* libPods-OrcaAppTestUITests.a */, + B3584F517C501F073B3A2F40 /* libPods-PerformanceTest.a */, ); name = Frameworks; sourceTree = ""; @@ -165,12 +165,12 @@ FA7D97C92E04F06D3273CCF3 /* Pods */ = { isa = PBXGroup; children = ( - 2C1B5037B36B1A9C62AE0816 /* Pods-OrcaAppTest.debug.xcconfig */, - FBAD61DE59260B66EB60F57A /* Pods-OrcaAppTest.release.xcconfig */, - 7F000751C879143D6999BEC4 /* Pods-OrcaAppTestUITests.debug.xcconfig */, - 239D8F7DFD3E66DB68DD74C7 /* Pods-OrcaAppTestUITests.release.xcconfig */, - 0402F655F9B10F18A7CE7EE2 /* Pods-PerformanceTest.debug.xcconfig */, - B9DD6D6C983DB5CCF60FA7F1 /* Pods-PerformanceTest.release.xcconfig */, + B4088BB1D8C7D5A5FF83708E /* Pods-OrcaAppTest.debug.xcconfig */, + D1F64E4744AD0659F95DE5B5 /* Pods-OrcaAppTest.release.xcconfig */, + C155C82B6A142F4C976635FF /* Pods-OrcaAppTestUITests.debug.xcconfig */, + 7682693483C5AA5FF0787ED0 /* Pods-OrcaAppTestUITests.release.xcconfig */, + D541FCBB3EA63023F5190039 /* Pods-PerformanceTest.debug.xcconfig */, + 25F6359237FFDE5259713A21 /* Pods-PerformanceTest.release.xcconfig */, ); path = Pods; sourceTree = ""; @@ -182,11 +182,11 @@ isa = PBXNativeTarget; buildConfigurationList = 1E00647227CEDF9C006FF6E9 /* Build configuration list for PBXNativeTarget "OrcaAppTest" */; buildPhases = ( - EC36C0D4EA212BA0832C79AF /* [CP] Check Pods Manifest.lock */, + 8E1BD0B920D35BE249B07D2C /* [CP] Check Pods Manifest.lock */, 1E00644427CEDF9B006FF6E9 /* Sources */, 1E00644527CEDF9B006FF6E9 /* Frameworks */, 1E00644627CEDF9B006FF6E9 /* Resources */, - 676B5C752F03BA675A1EB67A /* [CP] Embed Pods Frameworks */, + 21A9820ABBFC1F00206743E6 /* [CP] Embed Pods Frameworks */, ); buildRules = ( ); @@ -201,11 +201,11 @@ isa = PBXNativeTarget; buildConfigurationList = 1E00647827CEDF9C006FF6E9 /* Build configuration list for PBXNativeTarget "OrcaAppTestUITests" */; buildPhases = ( - 111DBCFB7BC184413D3B26CE /* [CP] Check Pods Manifest.lock */, + 0709E7DDD8FF7E7E665392C0 /* [CP] Check Pods Manifest.lock */, 1E00646427CEDF9C006FF6E9 /* Sources */, 1E00646527CEDF9C006FF6E9 /* Frameworks */, 1E00646627CEDF9C006FF6E9 /* Resources */, - 29E6F72AA55D35729B883608 /* [CP] Embed Pods Frameworks */, + 0F4644102E1332551FF466EA /* [CP] Embed Pods Frameworks */, ); buildRules = ( ); @@ -221,11 +221,11 @@ isa = PBXNativeTarget; buildConfigurationList = 1E5B7AE92800B29F00F8BDDB /* Build configuration list for PBXNativeTarget "PerformanceTest" */; buildPhases = ( - 1E5B7AE12800B29F00F8BDDB /* [CP] Check Pods Manifest.lock */, + D99C6A3E9E6311DFD1CA9C59 /* [CP] Check Pods Manifest.lock */, 1E5B7AE22800B29F00F8BDDB /* Sources */, 1E5B7AE42800B29F00F8BDDB /* Frameworks */, 1E5B7AE62800B29F00F8BDDB /* Resources */, - 1E5B7AE82800B29F00F8BDDB /* [CP] Embed Pods Frameworks */, + 7C76F0EC41B925B2353F2779 /* [CP] Embed Pods Frameworks */, ); buildRules = ( ); @@ -310,7 +310,7 @@ /* End PBXResourcesBuildPhase section */ /* Begin PBXShellScriptBuildPhase section */ - 111DBCFB7BC184413D3B26CE /* [CP] Check Pods Manifest.lock */ = { + 0709E7DDD8FF7E7E665392C0 /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 
2147483647; files = ( @@ -332,80 +332,80 @@ shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; showEnvVarsInLog = 0; }; - 1E5B7AE12800B29F00F8BDDB /* [CP] Check Pods Manifest.lock */ = { + 0F4644102E1332551FF466EA /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); - inputPaths = ( - "${PODS_PODFILE_DIR_PATH}/Podfile.lock", - "${PODS_ROOT}/Manifest.lock", - ); - name = "[CP] Check Pods Manifest.lock"; + name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( - ); - outputPaths = ( - "$(DERIVED_FILE_DIR)/Pods-PerformanceTest-checkManifestLockResult.txt", + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; + shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks.sh\"\n"; showEnvVarsInLog = 0; }; - 1E5B7AE82800B29F00F8BDDB /* [CP] Embed Pods Frameworks */ = { + 21A9820ABBFC1F00206743E6 /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", + "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks.sh\"\n"; + shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks.sh\"\n"; showEnvVarsInLog = 0; }; - 29E6F72AA55D35729B883608 /* [CP] Embed Pods Frameworks */ = { + 7C76F0EC41B925B2353F2779 /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-input-files.xcfilelist", + "${PODS_ROOT}/Target Support 
Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks-${CONFIGURATION}-output-files.xcfilelist", + "${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTestUITests/Pods-OrcaAppTestUITests-frameworks.sh\"\n"; + shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PerformanceTest/Pods-PerformanceTest-frameworks.sh\"\n"; showEnvVarsInLog = 0; }; - 676B5C752F03BA675A1EB67A /* [CP] Embed Pods Frameworks */ = { + 8E1BD0B920D35BE249B07D2C /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( ); inputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); - name = "[CP] Embed Pods Frameworks"; + inputPaths = ( + "${PODS_PODFILE_DIR_PATH}/Podfile.lock", + "${PODS_ROOT}/Manifest.lock", + ); + name = "[CP] Check Pods Manifest.lock"; outputFileListPaths = ( - "${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks-${CONFIGURATION}-output-files.xcfilelist", + ); + outputPaths = ( + "$(DERIVED_FILE_DIR)/Pods-OrcaAppTest-checkManifestLockResult.txt", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; - shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OrcaAppTest/Pods-OrcaAppTest-frameworks.sh\"\n"; + shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. 
Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; showEnvVarsInLog = 0; }; - EC36C0D4EA212BA0832C79AF /* [CP] Check Pods Manifest.lock */ = { + D99C6A3E9E6311DFD1CA9C59 /* [CP] Check Pods Manifest.lock */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( @@ -420,7 +420,7 @@ outputFileListPaths = ( ); outputPaths = ( - "$(DERIVED_FILE_DIR)/Pods-OrcaAppTest-checkManifestLockResult.txt", + "$(DERIVED_FILE_DIR)/Pods-PerformanceTest-checkManifestLockResult.txt", ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; @@ -608,18 +608,20 @@ }; 1E00647327CEDF9C006FF6E9 /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 2C1B5037B36B1A9C62AE0816 /* Pods-OrcaAppTest.debug.xcconfig */; + baseConfigurationReference = B4088BB1D8C7D5A5FF83708E /* Pods-OrcaAppTest.debug.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = OrcaAppTest/Info.plist; + INFOPLIST_KEY_CFBundleDisplayName = OrcaDemoApp; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; INFOPLIST_KEY_UIMainStoryboardFile = Main; + INFOPLIST_KEY_UIRequiredDeviceCapabilities = armv7; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; LD_RUNPATH_SEARCH_PATHS = ( @@ -637,18 +639,20 @@ }; 1E00647427CEDF9C006FF6E9 /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = FBAD61DE59260B66EB60F57A /* Pods-OrcaAppTest.release.xcconfig */; + baseConfigurationReference = D1F64E4744AD0659F95DE5B5 /* Pods-OrcaAppTest.release.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_FILE = OrcaAppTest/Info.plist; + INFOPLIST_KEY_CFBundleDisplayName = OrcaDemoApp; INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen; INFOPLIST_KEY_UIMainStoryboardFile = Main; + INFOPLIST_KEY_UIRequiredDeviceCapabilities = armv7; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; LD_RUNPATH_SEARCH_PATHS = ( @@ -666,11 +670,11 @@ }; 1E00647927CEDF9C006FF6E9 /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 7F000751C879143D6999BEC4 /* Pods-OrcaAppTestUITests.debug.xcconfig */; + baseConfigurationReference = C155C82B6A142F4C976635FF /* Pods-OrcaAppTestUITests.debug.xcconfig */; buildSettings = { 
CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -689,11 +693,11 @@ }; 1E00647A27CEDF9C006FF6E9 /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 239D8F7DFD3E66DB68DD74C7 /* Pods-OrcaAppTestUITests.release.xcconfig */; + baseConfigurationReference = 7682693483C5AA5FF0787ED0 /* Pods-OrcaAppTestUITests.release.xcconfig */; buildSettings = { CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -712,12 +716,12 @@ }; 1E5B7AEA2800B29F00F8BDDB /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 0402F655F9B10F18A7CE7EE2 /* Pods-PerformanceTest.debug.xcconfig */; + baseConfigurationReference = D541FCBB3EA63023F5190039 /* Pods-PerformanceTest.debug.xcconfig */; buildSettings = { CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", @@ -737,12 +741,12 @@ }; 1E5B7AEB2800B29F00F8BDDB /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = B9DD6D6C983DB5CCF60FA7F1 /* Pods-PerformanceTest.release.xcconfig */; + baseConfigurationReference = 25F6359237FFDE5259713A21 /* Pods-PerformanceTest.release.xcconfig */; buildSettings = { CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = ""; GENERATE_INFOPLIST_FILE = YES; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", diff --git a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme index c351e8e5..5aadb9b4 100644 --- a/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme +++ b/binding/ios/OrcaAppTest/OrcaAppTest.xcodeproj/xcshareddata/xcschemes/OrcaAppTest.xcscheme @@ -60,6 +60,13 @@ ReferencedContainer = "container:OrcaAppTest.xcodeproj"> + + + + CFBundleDevelopmentRegion $(DEVELOPMENT_LANGUAGE) CFBundleDisplayName - PorcupineDemoApp + OrcaDemoApp CFBundleExecutable $(EXECUTABLE_NAME) CFBundleIdentifier diff --git a/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift b/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift index 5269e9fa..4228c9fc 100644 --- a/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift +++ b/binding/ios/OrcaAppTest/OrcaAppTestUITests/BaseTest.swift @@ -14,60 +14,34 @@ import Orca struct TestData: Decodable { var test_sentences: TestSentences - var wer_threshold: Float32 + var random_state: Int64 + var alignments: [TestAlignments] } struct TestSentences: Decodable { var text: String var text_no_punctuation: String var text_custom_pronunciation: String + var text_alignment: String var text_invalid: [String] } -extension String { - subscript(index: Int) -> Character { - return self[self.index(self.startIndex, offsetBy: index)] - } +struct TestAlignments: Decodable { + var word: String + var start_sec: Float + var end_sec: Float + var phonemes: [TestPhonemes] } -extension String { - public func levenshtein(_ other: String) -> Int { - let sCount = self.count - let oCount = other.count - - guard sCount != 0 else { - return oCount - } - - guard oCount != 0 else { - return 
sCount
-        }
-
-        let line: [Int] = Array(repeating: 0, count: oCount + 1)
-        var mat: [[Int]] = Array(repeating: line, count: sCount + 1)
-
-        for i in 0...sCount {
-            mat[i][0] = i
-        }
-
-        for j in 0...oCount {
-            mat[0][j] = j
-        }
-
-        for j in 1...oCount {
-            for i in 1...sCount {
-                if self[i - 1] == other[j - 1] {
-                    mat[i][j] = mat[i - 1][j - 1] // no operation
-                } else {
-                    let del = mat[i - 1][j] + 1 // deletion
-                    let ins = mat[i][j - 1] + 1 // insertion
-                    let sub = mat[i - 1][j - 1] + 1 // substitution
-                    mat[i][j] = min(min(del, ins), sub)
-                }
-            }
-        }
+struct TestPhonemes: Decodable {
+    var phoneme: String
+    var start_sec: Float
+    var end_sec: Float
+}

-        return mat[sCount][oCount]
+extension String {
+    subscript(index: Int) -> Character {
+        return self[self.index(self.startIndex, offsetBy: index)]
    }
}
@@ -81,6 +55,15 @@ class BaseTest: XCTestCase {
    var orcas: [Orca] = []
    var testData: TestData?

+    let testAudioMaleSingle = Bundle(for: BaseTest.self)
+        .url(forResource: "test_resources/wav/orca_params_male_single", withExtension: "wav")!
+    let testAudioMaleStream = Bundle(for: BaseTest.self)
+        .url(forResource: "test_resources/wav/orca_params_male_stream", withExtension: "wav")!
+    let testAudioFemaleSingle = Bundle(for: BaseTest.self)
+        .url(forResource: "test_resources/wav/orca_params_female_single", withExtension: "wav")!
+    let testAudioFemaleStream = Bundle(for: BaseTest.self)
+        .url(forResource: "test_resources/wav/orca_params_female_stream", withExtension: "wav")!
+
    override func setUp() async throws {
        try await super.setUp()
@@ -118,7 +101,45 @@ class BaseTest: XCTestCase {
        return testData
    }

-    func characterErrorRate(transcript: String, expectedTranscript: String) -> Float {
-        return Float(transcript.levenshtein(expectedTranscript)) / Float(expectedTranscript.count)
+    func compareArrays(arr1: [Int16], arr2: [Int16], step: Int) -> Bool {
+        for i in stride(from: 0, to: arr1.count - step, by: step) where !(abs(arr1[i] - arr2[i]) <= 500) {
+            return false
+        }
+        return true
+    }
+
+    func getPcm(fileUrl: URL) throws -> [Int16] {
+        let data = try Data(contentsOf: fileUrl)
+        let pcmData = data.withUnsafeBytes { (ptr: UnsafePointer<Int16>) -> [Int16] in
+            let count = data.count / MemoryLayout<Int16>.size
+            return Array(UnsafeBufferPointer(start: ptr.advanced(by: 22), count: count - 22))
+        }
+        return pcmData
+    }
+
+    func validateMetadata(words: [OrcaWord], expectedWords: [OrcaWord], isExpectExact: Bool) {
+        XCTAssertEqual(words.count, expectedWords.count)
+
+        for i in 0..
0 { results.append(totalNSec) } - orca?.delete() + orca.delete() } let avgNSec = results.reduce(0.0, +) / Double(numTestIterations) diff --git a/binding/ios/OrcaAppTest/Podfile b/binding/ios/OrcaAppTest/Podfile index 0cafd440..f7606386 100644 --- a/binding/ios/OrcaAppTest/Podfile +++ b/binding/ios/OrcaAppTest/Podfile @@ -2,16 +2,13 @@ source 'https://cdn.cocoapods.org/' platform :ios, '13.0' target 'OrcaAppTest' do - pod 'Orca-iOS', '~> 0.1.0' - pod 'Leopard-iOS', '~> 2.0.1' + pod 'Orca-iOS', '~> 0.2.0' end target 'OrcaAppTestUITests' do - pod 'Orca-iOS', '~> 0.1.0' - pod 'Leopard-iOS', '~> 2.0.1' + pod 'Orca-iOS', '~> 0.2.0' end target 'PerformanceTest' do - pod 'Orca-iOS', '~> 0.1.0' - pod 'Leopard-iOS', '~> 2.0.1' + pod 'Orca-iOS', '~> 0.2.0' end diff --git a/binding/ios/OrcaAppTest/Podfile.lock b/binding/ios/OrcaAppTest/Podfile.lock index 9776fe52..41e5ed08 100644 --- a/binding/ios/OrcaAppTest/Podfile.lock +++ b/binding/ios/OrcaAppTest/Podfile.lock @@ -1,20 +1,16 @@ PODS: - - Leopard-iOS (2.0.1) - - Orca-iOS (0.1.0) + - Orca-iOS (0.2.0) DEPENDENCIES: - - Leopard-iOS (~> 2.0.1) - - Orca-iOS (~> 0.1.0) + - Orca-iOS (~> 0.2.0) SPEC REPOS: trunk: - - Leopard-iOS - Orca-iOS SPEC CHECKSUMS: - Leopard-iOS: 8c94dcf886800b4ed361c1c6af763780ef16f722 - Orca-iOS: 808b4c77678454905ea0a0c1408eff8f9255e3ac + Orca-iOS: 01bbf44ba52a102104fc09aded6bfda7beb4865e -PODFILE CHECKSUM: 1ab9a668595c361f16dadb12876e074b4092d531 +PODFILE CHECKSUM: 01d0a4d9f05893e5371be0f1775f4f59ed59da27 -COCOAPODS: 1.11.3 +COCOAPODS: 1.15.2 diff --git a/binding/ios/OrcaAppTest/copy_test_resources.sh b/binding/ios/OrcaAppTest/copy_test_resources.sh index deee665b..2c850b78 100755 --- a/binding/ios/OrcaAppTest/copy_test_resources.sh +++ b/binding/ios/OrcaAppTest/copy_test_resources.sh @@ -6,9 +6,9 @@ echo "Copying test model files..." mkdir -p ${ASSETS_DIR}/model_files cp ${LIB_DIR}/common/*.pv ${ASSETS_DIR}/model_files -echo "Copying Leopard model files..." -mkdir -p ${ASSETS_DIR}/model_files -cp ${RESOURCE_DIR}/.test/models/*.pv ${ASSETS_DIR}/model_files +echo "Copying wav files..." +mkdir -p ${ASSETS_DIR}/wav +cp ${RESOURCE_DIR}/.test/wav/*.wav ${ASSETS_DIR}/wav echo "Copying test data file..." -cp ${RESOURCE_DIR}/.test/test_data.json ${ASSETS_DIR} \ No newline at end of file +cp ${RESOURCE_DIR}/.test/test_data.json ${ASSETS_DIR} diff --git a/binding/ios/README.md b/binding/ios/README.md index 7bd8dea2..a232528f 100644 --- a/binding/ios/README.md +++ b/binding/ios/README.md @@ -1,8 +1,9 @@ -# Orca Text-to-Speech Engine +# Orca Streaming Text-to-Speech Engine Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency voice +assistants. Orca is: - Private; All voice processing runs locally. - Cross-Platform: @@ -18,7 +19,8 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp ## Installation -The Orca iOS binding is available via [Cocoapods](https://cocoapods.org/pods/Orca-iOS). To import it into your iOS project, add the following line to your Podfile and run `pod install`: +The Orca iOS binding is available via [Cocoapods](https://cocoapods.org/pods/Orca-iOS). 
To import it into your iOS +project, add the following line to your Podfile and run `pod install`: ```ruby @@ -27,13 +29,18 @@ pod 'Orca-iOS' ## AccessKey -Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca SDKs. +Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca +SDKs. You can get your `AccessKey` for free. Make sure to keep your `AccessKey` secret. Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get your `AccessKey`. ## Usage -Create an instance of the engine: +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, a complete text is synthesized in a single call to the Orca engine. + +Create an instance of the Orca engine: ```swift import Orca @@ -51,25 +58,65 @@ do { Alternatively, you can provide `modelPath` as an absolute path to the model file on device. -You can synthesize speech by calling one of the `synthesize` methods: +To synthesize a text stream, create an `Orca.OrcaStream` object and add text to it one-by-one: + +```swift +let orcaStream = try orca.streamOpen() + +for textChunk in textGenerator() { + let pcm = orcaStream.synthesize(textChunk) + if pcm != nil { + // handle pcm + } +} + +let pcm = orcaStream.flush() +if pcm != nil { + // handle pcm +} +``` + +The `textGenerator()` function can be any stream generating text, for example an LLM response. +Orca produces audio chunks in parallel to the incoming text stream, and returns the raw PCM whenever enough context has +been added via `orcaStream.synthesize()`. +To ensure smooth transitions between chunks, the `orcaStream.synthesize()` function returns an audio chunk that only +includes the audio for a portion of the text that has been added. +To generate the audio for the remaining text, `orcaStream.flush()` needs to be invoked. +When done with streaming text synthesis, the `Orca.OrcaStream` object needs to be closed: + +```swift +orcaStream.close() +``` + +If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to +Orca: ```swift -// return raw pcm -let pcm = try orca.synthesize(text: "${TEXT}") +// Return raw PCM and alignments +let (pcm, wordArray) = try orca.synthesize(text: "${TEXT}") -// save to a file -try orca.synthesizeToFile(text: "${TEXT}", outputPath: "${OUTPUT_PATH}") +// Save the generated audio to a WAV file directly +let wordArray = try orca.synthesizeToFile(text: "${TEXT}", outputPath: "${OUTPUT_PATH}") ``` Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a single-channel 16-bit PCM WAV file. - +In single synthesis mode, Orca returns metadata of the synthesized audio in the form of an array of `OrcaWord` +objects. When done, resources have to be released explicitly: ```swift orca.delete() ``` +### Text input + +Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers, +basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the +`validCharacters()` method provided in the Orca SDK you are using. +Pronunciations of characters or words not supported by this list can be achieved with +[custom pronunciations](#custom-pronunciations). 
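As a quick illustration of the character check described above, the sketch below strips unsupported characters before synthesis. It is written in Python for consistency with the Python binding later in this diff; the Swift `validCharacters()` API is analogous. The `sanitize` helper is hypothetical, not part of the SDK.

```python
import pvorca

# Assumes a valid AccessKey; see the AccessKey section above.
orca = pvorca.create(access_key='${ACCESS_KEY}')

def sanitize(text: str) -> str:
    # Drop any character Orca cannot synthesize instead of letting the
    # synthesize call fail with an invalid-argument error.
    return ''.join(ch for ch in text if ch in orca.valid_characters)

pcm, _ = orca.synthesize(text=sanitize('Hello, world! §'))
orca.delete()
```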
+
### Custom pronunciations

Orca allows embedding custom pronunciations in the text via the syntax: `{word|pronunciation}`.\
@@ -99,26 +146,48 @@ and replace `${MODEL_FILE_PATH}` or `${MODEL_FILE_URL}` with the path to the mod

### Speech control

-Orca allows for keyword arguments to be provided to the `synthesize` methods to control the synthesized speech:
+Orca allows for keyword arguments to control the synthesized speech. They can be provided to the `streamOpen`
+method or the single synthesis methods `synthesize` and `synthesizeToFile`:

- `speechRate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher (lower) value
  produces speech that is faster (slower). The default is `1.0`.
+- `randomState`: Sets the random state for sampling during synthesis. This can be used to ensure that the synthesized
+  speech is deterministic across different runs.

```swift
let pcm = orca.synthesize(
    text: "${TEXT}",
-    speechRate: 1.0)
+    speechRate: 1.0,
+    randomState: 1)
```

### Orca properties

-To obtain the set of valid punctuation symbols, call `Orca.validPunctuationSymbols`.
+To obtain the set of valid characters, call `Orca.validCharacters`.
To retrieve the maximum number of characters allowed, call `Orca.maxCharacterLimit`.
The sample rate of Orca is `Orca.sampleRate`.

+### Alignment Metadata
+
+Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode.
+The `OrcaWord` object has the following properties:
+
+- **Word:** String representation of the word.
+- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds.
+- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds.
+- **Phonemes:** An array of `OrcaPhoneme` objects.
+
+The `OrcaPhoneme` object has the following properties:
+
+- **Phoneme:** String representation of the phoneme.
+- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds.
+- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds.
+
## Running Unit Tests

-Copy your `AccessKey` into the `accessKey` variable in [`OrcaAppTestUITests.swift`](OrcaAppTest/OrcaAppTestUITests/OrcaAppTestUITests.swift). Open `OrcaAppTest.xcworkspace` with XCode and run the tests with `Product > Test`.
+Copy your `AccessKey` into the `accessKey` variable
+in [`OrcaAppTestUITests.swift`](OrcaAppTest/OrcaAppTestUITests/OrcaAppTestUITests.swift). Open `OrcaAppTest.xcworkspace`
+with Xcode and run the tests with `Product > Test`.

## Demo App

diff --git a/binding/python/README.md b/binding/python/README.md
index 6dcb13de..39b598a1 100644
--- a/binding/python/README.md
+++ b/binding/python/README.md
@@ -1,10 +1,11 @@
# Orca Binding for Python

-## Orca Text-to-Speech Engine
+## Orca Streaming Text-to-Speech Engine

Made in Vancouver, Canada by [Picovoice](https://picovoice.ai)

-Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is:
+Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency
+voice assistants. Orca is:

- Private; All voice processing runs locally.
- Cross-Platform:
@@ -15,7 +16,7 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp

## Compatibility

-- Python 3.7+
+- Python 3.8+
- Runs on Linux (x86_64), macOS (x86_64, arm64), Windows (x86_64), Raspberry Pi (5, 4, 3), and NVIDIA Jetson Nano.
## Installation @@ -32,6 +33,10 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you ## Usage +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, a complete text is synthesized in a single call to the Orca engine. + Create an instance of the Orca engine: ```python @@ -42,24 +47,67 @@ orca = pvorca.create(access_key='${ACCESS_KEY}') Replace the `${ACCESS_KEY}` with your AccessKey obtained from [Picovoice Console](https://console.picovoice.ai/). -You can synthesize speech by calling one of the `synthesize` methods: +To synthesize a text stream, create an `Orca.OrcaStream` object and add text to it one-by-one: + +```python +stream = orca.stream_open() + +for text_chunk in text_generator(): + pcm = stream.synthesize(text_chunk) + if pcm is not None: + # handle pcm + +pcm = stream.flush() +if pcm is not None: + # handle pcm +``` + +The `text_generator()` function can be any stream generating text, for example an LLM response. +Orca produces audio chunks in parallel to the incoming text stream, and returns the raw PCM whenever enough context has +been added via `stream.synthesize()`. +To ensure smooth transitions between chunks, the `stream.synthesize()` function returns an audio chunk that only +includes the audio for a portion of the text that has been added. +To generate the audio for the remaining text, `stream.flush()` needs to be invoked. +When done with streaming text synthesis, the `Orca.OrcaStream` object needs to be closed: + +```python +stream.close() +``` + +If the complete text is known before synthesis, single synthesis mode can be used to generate speech in a single call to +Orca: ```python # Return raw PCM -pcm = orca.synthesize(text='${TEXT}') +pcm, alignments = orca.synthesize(text='${TEXT}') # Save the generated audio to a WAV file directly -orca.synthesize_to_file(text='${TEXT}', path='${OUTPUT_PATH}') +alignments = orca.synthesize_to_file(text='${TEXT}', path='${OUTPUT_PATH}') ``` Replace `${TEXT}` with the text to be synthesized and `${OUTPUT_PATH}` with the path to save the generated audio as a -single-channel 16-bit PCM WAV file.\ -When done make sure to explicitly release the resources with `orca.delete()`. +single-channel 16-bit PCM WAV file. +In single synthesis mode, Orca returns metadata of the synthesized audio in the form of a list of `Orca.WordAlignment` +objects. +You can print the metadata with: + +```python +for token in alignments: + print(f"word=\"{token.word}\", start_sec={token.start_sec:.2f}, end_sec={token.end_sec:.2f}") + for phoneme in token.phonemes: + print(f"\tphoneme=\"{phoneme.phoneme}\", start_sec={phoneme.start_sec:.2f}, end_sec={phoneme.end_sec:.2f}") +``` + +When done make sure to explicitly release the resources using: + +```python +orca.delete() +``` ### Text input -Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, as well as -common punctuation marks. You can get a list of all supported characters by calling the +Orca accepts the 26 lowercase (a-z) and 26 uppercase (A-Z) letters of the English alphabet, numbers, +basic symbols, as well as common punctuation marks. You can get a list of all supported characters by calling the `valid_characters()` method provided in the Orca SDK you are using. 
Pronunciations of characters or words not supported by this list can be achieved with [custom pronunciations](#custom-pronunciations). @@ -87,10 +135,14 @@ and replace `${MODEL_PATH}` with the path to the model file with the desired voi ### Speech control -Orca allows for keyword arguments to be provided to the `synthesize` methods to control the synthesized speech: +Orca allows for keyword arguments to control the synthesized speech. They can be provided to the `stream_open` +method or the single synthesis methods `synthesize` and `synthesize_to_file`: - `speech_rate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher (lower) value produces speech that is faster (slower). The default is `1.0`. +- `random_state`: Sets the random state for sampling during synthesis. This can be used to ensure that the synthesized + speech is deterministic across different runs. Valid values are all non-negative integers. If not provided, a random + seed will be chosen and the synthesis process will be non-deterministic. ### Orca properties @@ -98,8 +150,23 @@ To obtain the set of valid characters, call `orca.valid_characters`.\ To retrieve the maximum number of characters allowed, call `orca.max_character_limit`.\ The sample rate of Orca is `orca.sample_rate`. +### Alignment Metadata + +Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode. +The `Orca.WordAlignment` object has the following properties: + +- **Word:** String representation of the word. +- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds. +- **Phonemes:** A list of `Orca.PhonemeAlignment` objects. + +The `Orca.PhonemeAlignment` object has the following properties: + +- **Phoneme:** String representation of the phoneme. +- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds. + ## Demos [pvorcademo](https://pypi.org/project/pvorcademo/) provides command-line utilities for synthesizing audio using Orca. 
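As a worked example of consuming the alignment metadata described above, the sketch below turns word alignments into simple caption tuples. It assumes an `orca` instance created as shown earlier; `to_captions` is a hypothetical helper, not part of the SDK.

```python
def to_captions(alignments):
    # Each caption is (start_sec, end_sec, word). Per the properties above,
    # phoneme timings are nested within each word's time span.
    return [(word.start_sec, word.end_sec, word.word) for word in alignments]

pcm, alignments = orca.synthesize(text='Hello world.')
for start, end, word in to_captions(alignments):
    print(f"[{start:5.2f}s - {end:5.2f}s] {word}")
```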
- diff --git a/binding/python/_orca.py b/binding/python/_orca.py index 53a5030d..15c37aa1 100644 --- a/binding/python/_orca.py +++ b/binding/python/_orca.py @@ -1,10 +1,12 @@ import os +from collections import namedtuple from ctypes import * from enum import Enum from typing import ( Optional, Sequence, - Set) + Set, + Tuple) class OrcaError(Exception): @@ -75,45 +77,152 @@ class OrcaActivationRefusedError(OrcaError): pass -class Orca(object): +class PicovoiceStatuses(Enum): + SUCCESS = 0 + OUT_OF_MEMORY = 1 + IO_ERROR = 2 + INVALID_ARGUMENT = 3 + STOP_ITERATION = 4 + KEY_ERROR = 5 + INVALID_STATE = 6 + RUNTIME_ERROR = 7 + ACTIVATION_ERROR = 8 + ACTIVATION_LIMIT_REACHED = 9 + ACTIVATION_THROTTLED = 10 + ACTIVATION_REFUSED = 11 + + +_PICOVOICE_STATUS_TO_EXCEPTION = { + PicovoiceStatuses.OUT_OF_MEMORY: OrcaMemoryError, + PicovoiceStatuses.IO_ERROR: OrcaIOError, + PicovoiceStatuses.INVALID_ARGUMENT: OrcaInvalidArgumentError, + PicovoiceStatuses.STOP_ITERATION: OrcaStopIterationError, + PicovoiceStatuses.KEY_ERROR: OrcaKeyError, + PicovoiceStatuses.INVALID_STATE: OrcaInvalidStateError, + PicovoiceStatuses.RUNTIME_ERROR: OrcaRuntimeError, + PicovoiceStatuses.ACTIVATION_ERROR: OrcaActivationError, + PicovoiceStatuses.ACTIVATION_LIMIT_REACHED: OrcaActivationLimitError, + PicovoiceStatuses.ACTIVATION_THROTTLED: OrcaActivationThrottledError, + PicovoiceStatuses.ACTIVATION_REFUSED: OrcaActivationRefusedError, +} + + +class COrcaPhonemeAlignment(Structure): + _fields_ = [ + ("phoneme", c_char_p), + ("start_sec", c_float), + ("end_sec", c_float), + ] + + +class COrcaWordAlignment(Structure): + _fields_ = [ + ("word", c_char_p), + ("start_sec", c_float), + ("end_sec", c_float), + ("num_phonemes", c_int32), + ("phonemes", POINTER(POINTER(COrcaPhonemeAlignment))), + ] + + +class Orca: """ Python binding for Orca Text-to-Speech engine. """ - class PicovoiceStatuses(Enum): - SUCCESS = 0 - OUT_OF_MEMORY = 1 - IO_ERROR = 2 - INVALID_ARGUMENT = 3 - STOP_ITERATION = 4 - KEY_ERROR = 5 - INVALID_STATE = 6 - RUNTIME_ERROR = 7 - ACTIVATION_ERROR = 8 - ACTIVATION_LIMIT_REACHED = 9 - ACTIVATION_THROTTLED = 10 - ACTIVATION_REFUSED = 11 - - _PICOVOICE_STATUS_TO_EXCEPTION = { - PicovoiceStatuses.OUT_OF_MEMORY: OrcaMemoryError, - PicovoiceStatuses.IO_ERROR: OrcaIOError, - PicovoiceStatuses.INVALID_ARGUMENT: OrcaInvalidArgumentError, - PicovoiceStatuses.STOP_ITERATION: OrcaStopIterationError, - PicovoiceStatuses.KEY_ERROR: OrcaKeyError, - PicovoiceStatuses.INVALID_STATE: OrcaInvalidStateError, - PicovoiceStatuses.RUNTIME_ERROR: OrcaRuntimeError, - PicovoiceStatuses.ACTIVATION_ERROR: OrcaActivationError, - PicovoiceStatuses.ACTIVATION_LIMIT_REACHED: OrcaActivationLimitError, - PicovoiceStatuses.ACTIVATION_THROTTLED: OrcaActivationThrottledError, - PicovoiceStatuses.ACTIVATION_REFUSED: OrcaActivationRefusedError, - } - class COrca(Structure): pass class COrcaSynthesizeParams(Structure): pass + class COrcaStream(Structure): + pass + + class OrcaStream: + """ + Orca Stream object that converts a stream of text to a stream of audio. + """ + + def __init__(self, handle: POINTER('Orca.COrcaStream'), orca: 'Orca') -> None: + self._handle = handle + self._orca = orca + + def synthesize(self, text: str) -> Optional[Sequence[int]]: + """ + Adds a chunk of text to the Stream object and generates audio if enough text has been added. + This function is expected to be called multiple times with consecutive chunks of text from a text stream. 
+ The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + buffered text into audio. The caller needs to use `pv_orca_stream_flush()` to generate the audio chunk + for the remaining text that has not yet been synthesized. + The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`. + + :param text: A chunk of text from a text input stream, comprised of valid characters. + Valid characters can be retrieved by calling `pv_orca_valid_characters()`. + Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + They need to be added in a single call to this function. + The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + :return: The generated audio as a sequence of 16-bit linearly-encoded integers, `None` if no + audio chunk has been produced. + """ + + c_num_samples = c_int32() + c_pcm = POINTER(c_int16)() + + status = self._orca._stream_synthesize_func( + self._handle, + text.encode("utf-8"), + byref(c_num_samples), + byref(c_pcm) + ) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to synthesize text in Orca stream", + message_stack=self._orca._get_error_stack()) + + pcm = None + if c_num_samples.value > 0: + pcm = [c_pcm[i] for i in range(c_num_samples.value)] + + self._orca._pcm_delete_func(c_pcm) + + return pcm + + def flush(self) -> Optional[Sequence[int]]: + """ + Generates audio for all the buffered text that was added to the OrcaStream object + via `pv_orca_stream_synthesize()`. + The caller is responsible for deleting the generated audio with `pv_orca_pcm_delete()`. + + :return: The generated audio as a sequence of 16-bit linearly-encoded integers, `None` if no + audio chunk has been produced. + """ + + c_num_samples = c_int32() + c_pcm = POINTER(c_int16)() + + status = self._orca._stream_flush_func( + self._handle, + byref(c_num_samples), + byref(c_pcm) + ) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to flush Orca stream", + message_stack=self._orca._get_error_stack()) + + pcm = [c_pcm[i] for i in range(c_num_samples.value)] + self._orca._pcm_delete_func(c_pcm) + + return pcm + + def close(self) -> None: + """ + Releases the resources acquired by the OrcaStream object. + """ + + self._orca._stream_close_func(self._handle) + def __init__(self, access_key: str, model_path: str, library_path: str) -> None: """ Constructor. 
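# Usage sketch for the OrcaStream wrapper defined above, assuming an
# initialized `orca: Orca` instance and a hypothetical `text_chunks()`
# generator (e.g. tokens arriving from an LLM response):
#
#     import struct
#     import wave
#
#     stream = orca.stream_open()
#     with wave.open('out.wav', 'wb') as f:
#         f.setnchannels(1)                 # Orca produces mono audio
#         f.setsampwidth(2)                 # 16-bit linearly-encoded samples
#         f.setframerate(orca.sample_rate)
#         for text in text_chunks():
#             pcm = stream.synthesize(text)
#             if pcm is not None:
#                 f.writeframes(struct.pack('%dh' % len(pcm), *pcm))
#         pcm = stream.flush()
#         if pcm is not None:
#             f.writeframes(struct.pack('%dh' % len(pcm), *pcm))
#     stream.close()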
@@ -142,7 +251,7 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: self._get_error_stack_func = library.pv_get_error_stack self._get_error_stack_func.argtypes = [POINTER(POINTER(c_char_p)), POINTER(c_int)] - self._get_error_stack_func.restype = self.PicovoiceStatuses + self._get_error_stack_func.restype = PicovoiceStatuses self._free_error_stack_func = library.pv_free_error_stack self._free_error_stack_func.argtypes = [POINTER(c_char_p)] @@ -150,12 +259,12 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: init_func = library.pv_orca_init init_func.argtypes = [c_char_p, c_char_p, POINTER(POINTER(self.COrca))] - init_func.restype = self.PicovoiceStatuses + init_func.restype = PicovoiceStatuses self._handle = POINTER(self.COrca)() status = init_func(access_key.encode(), model_path.encode(), byref(self._handle)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message='Initialization failed', message_stack=self._get_error_stack()) @@ -163,27 +272,58 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: self._delete_func.argtypes = [POINTER(self.COrca)] self._delete_func.restype = None - self._valid_characters_func = library.pv_orca_valid_characters - self._valid_characters_func.argtypes = [ + valid_characters_func = library.pv_orca_valid_characters + valid_characters_func.argtypes = [ POINTER(self.COrca), POINTER(c_int32), POINTER(POINTER(POINTER(c_char_p))), ] - self._valid_characters_func.restype = self.PicovoiceStatuses + valid_characters_func.restype = PicovoiceStatuses + + valid_characters_delete_func = library.pv_orca_valid_characters_delete + valid_characters_delete_func.argtypes = [POINTER(POINTER(c_char_p))] + valid_characters_delete_func.restype = None + + c_num_characters = c_int32() + c_characters = POINTER(POINTER(c_char_p))() + status = valid_characters_func(self._handle, byref(c_num_characters), byref(c_characters)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to get Orca valid characters", + message_stack=self._get_error_stack()) + + num_characters = c_num_characters.value + characters_array_pointer = cast(c_characters, POINTER(c_char_p * num_characters)) + self._valid_characters = set([symbol.decode('utf-8') for symbol in list(characters_array_pointer.contents)]) + valid_characters_delete_func(c_characters) - self._valid_characters_delete_func = library.pv_orca_valid_characters_delete - self._valid_characters_delete_func.argtypes = [POINTER(POINTER(c_char_p))] - self._valid_characters_delete_func.restype = None + sample_rate_func = library.pv_orca_sample_rate + sample_rate_func.argtypes = [POINTER(self.COrca), POINTER(c_int32)] + sample_rate_func.restype = PicovoiceStatuses - self._sample_rate_func = library.pv_orca_sample_rate - self._sample_rate_func.argtypes = [POINTER(self.COrca), POINTER(c_int32)] - self._sample_rate_func.restype = self.PicovoiceStatuses + c_sample_rate = c_int32() + status = sample_rate_func(self._handle, byref(c_sample_rate)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to get Orca sample rate", + message_stack=self._get_error_stack()) + self._sample_rate = c_sample_rate.value - self._max_character_limit = library.pv_orca_max_character_limit() + max_character_limit_func = 
library.pv_orca_max_character_limit + max_character_limit_func.argtypes = [POINTER(self.COrca), POINTER(c_int32)] + max_character_limit_func.restype = PicovoiceStatuses + + c_max_character_limit = c_int32() + status = max_character_limit_func(self._handle, byref(c_max_character_limit)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to get Orca maximum character limit", + message_stack=self._get_error_stack()) + self._max_character_limit = c_max_character_limit.value self._synthesize_params_init_func = library.pv_orca_synthesize_params_init self._synthesize_params_init_func.argtypes = [POINTER(POINTER(self.COrcaSynthesizeParams))] - self._synthesize_params_init_func.restype = self.PicovoiceStatuses + self._synthesize_params_init_func.restype = PicovoiceStatuses self._synthesize_params_delete_func = library.pv_orca_synthesize_params_delete self._synthesize_params_delete_func.argtypes = [POINTER(self.COrcaSynthesizeParams)] @@ -191,7 +331,11 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: self._synthesize_params_set_speech_rate_func = library.pv_orca_synthesize_params_set_speech_rate self._synthesize_params_set_speech_rate_func.argtypes = [POINTER(self.COrcaSynthesizeParams), c_float] - self._synthesize_params_set_speech_rate_func.restype = self.PicovoiceStatuses + self._synthesize_params_set_speech_rate_func.restype = PicovoiceStatuses + + self._synthesize_params_set_random_state_func = library.pv_orca_synthesize_params_set_random_state + self._synthesize_params_set_random_state_func.argtypes = [POINTER(self.COrcaSynthesizeParams), c_int64] + self._synthesize_params_set_random_state_func.restype = PicovoiceStatuses self._synthesize_func = library.pv_orca_synthesize self._synthesize_func.argtypes = [ @@ -200,8 +344,10 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: POINTER(self.COrcaSynthesizeParams), POINTER(c_int32), POINTER(POINTER(c_int16)), + POINTER(c_int32), + POINTER(POINTER(POINTER(COrcaWordAlignment))), ] - self._synthesize_func.restype = self.PicovoiceStatuses + self._synthesize_func.restype = PicovoiceStatuses self._synthesize_to_file_func = library.pv_orca_synthesize_to_file self._synthesize_to_file_func.argtypes = [ @@ -209,18 +355,56 @@ def __init__(self, access_key: str, model_path: str, library_path: str) -> None: c_char_p, POINTER(self.COrcaSynthesizeParams), c_char_p, + POINTER(c_int32), + POINTER(POINTER(POINTER(COrcaWordAlignment))), ] - self._synthesize_to_file_func.restype = self.PicovoiceStatuses + self._synthesize_to_file_func.restype = PicovoiceStatuses + + self._word_alignments_delete_func = library.pv_orca_word_alignments_delete + self._word_alignments_delete_func.argtypes = [c_int32, POINTER(POINTER(COrcaWordAlignment))] + self._word_alignments_delete_func.restype = PicovoiceStatuses - self._delete_pcm_func = library.pv_orca_delete_pcm - self._delete_pcm_func.argtypes = [POINTER(c_int16)] - self._delete_pcm_func.restype = None + self._pcm_delete_func = library.pv_orca_pcm_delete + self._pcm_delete_func.argtypes = [POINTER(c_int16)] + self._pcm_delete_func.restype = None + + self._stream_open_func = library.pv_orca_stream_open + self._stream_open_func.argtypes = [ + POINTER(self.COrca), + POINTER(self.COrcaSynthesizeParams), + POINTER(POINTER(self.COrcaStream)) + ] + self._stream_open_func.restype = PicovoiceStatuses + + self._stream_synthesize_func = library.pv_orca_stream_synthesize + self._stream_synthesize_func.argtypes = [ + 
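            # args: (stream handle, UTF-8 text, out: number of samples, out: PCM sample buffer)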
POINTER(self.COrcaStream), + c_char_p, + POINTER(c_int32), + POINTER(POINTER(c_int16)) + ] + self._stream_synthesize_func.restype = PicovoiceStatuses + + self._stream_flush_func = library.pv_orca_stream_flush + self._stream_flush_func.argtypes = [ + POINTER(self.COrcaStream), + POINTER(c_int32), + POINTER(POINTER(c_int16)) + ] + self._stream_flush_func.restype = PicovoiceStatuses + + self._stream_close_func = library.pv_orca_stream_close + self._stream_close_func.argtypes = [POINTER(self.COrcaStream)] + self._stream_close_func.restype = None version_func = library.pv_orca_version version_func.argtypes = [] version_func.restype = c_char_p self._version = version_func().decode("utf-8") + PhonemeAlignment = namedtuple('Phoneme', ['phoneme', 'start_sec', 'end_sec']) + WordAlignment = namedtuple('Word', ['word', 'start_sec', 'end_sec', 'phonemes']) + def delete(self) -> None: """Releases resources acquired by Orca.""" @@ -230,36 +414,13 @@ def delete(self) -> None: def valid_characters(self) -> Set[str]: """Set of characters supported by Orca.""" - c_num_characters = c_int32() - c_characters = POINTER(POINTER(c_char_p))() - - status = self._valid_characters_func(self._handle, byref(c_num_characters), byref(c_characters)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( - message="Unable to get Orca valid characters", - message_stack=self._get_error_stack()) - - num_characters = c_num_characters.value - characters_array_pointer = cast(c_characters, POINTER(c_char_p * num_characters)) - characters = set([symbol.decode('utf-8') for symbol in list(characters_array_pointer.contents)]) - - self._valid_characters_delete_func(c_characters) - - return characters + return self._valid_characters @property def sample_rate(self) -> int: """Audio sample rate of generated audio.""" - c_sample_rate = c_int32() - - status = self._sample_rate_func(self._handle, byref(c_sample_rate)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( - message="Unable to get Orca sample rate", - message_stack=self._get_error_stack()) - - return c_sample_rate.value + return self._sample_rate @property def max_character_limit(self) -> int: @@ -270,7 +431,8 @@ def max_character_limit(self) -> int: def synthesize( self, text: str, - speech_rate: Optional[float] = None) -> Sequence[int]: + speech_rate: Optional[float] = None, + random_state: Optional[int] = None) -> Tuple[Sequence[int], Sequence[WordAlignment]]: """ Generates audio from text. The returned audio contains the speech representation of the text. @@ -278,37 +440,49 @@ def synthesize( `self.max_character_limit`. Allowed characters can be retrieved by calling `self.pv_orca_valid_characters`. Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". - :param speech_rate: Rate of speech of the synthesized audio. - :return: The generated audio, stored as a sequence of 16-bit linearly-encoded integers. + :param speech_rate: Rate of speech of the synthesized audio. Higher numbers correspond to faster speech. + Valid values are within [0.7, 1.3]. + :param random_state: Random seed for the synthesis process. Valid values are all non-negative integer. If not + provided, a random seed will be chosen. 
+ :return: A tuple containing the generated audio as a sequence of 16-bit linearly-encoded integers + and a sequence of OrcaWordAlignment objects representing the word alignments. """ - c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate) + c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate, random_state=random_state) c_num_samples = c_int32() c_pcm = POINTER(c_int16)() + c_num_alignments = c_int32() + c_alignments = POINTER(POINTER(COrcaWordAlignment))() + status = self._synthesize_func( self._handle, text.encode("utf-8"), c_synthesize_params, byref(c_num_samples), - byref(c_pcm)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + byref(c_pcm), + byref(c_num_alignments), + byref(c_alignments)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to synthesize speech", message_stack=self._get_error_stack()) pcm = [c_pcm[i] for i in range(c_num_samples.value)] + self._pcm_delete_func(c_pcm) + + alignments = self._get_alignments(c_num_alignments=c_num_alignments, c_alignments=c_alignments) - self._delete_pcm_func(c_pcm) self._synthesize_params_delete_func(c_synthesize_params) - return pcm + return pcm, alignments def synthesize_to_file( self, text: str, output_path: str, - speech_rate: Optional[float] = None) -> None: + speech_rate: Optional[float] = None, + random_state: Optional[int] = None) -> Sequence[WordAlignment]: """ Generates audio from text. The returned audio contains the speech representation of the text. @@ -319,44 +493,127 @@ def synthesize_to_file( :param output_path: Absolute path to the output audio file. The output file is saved as `WAV (.wav)` and consists of a single mono channel. :param speech_rate: Rate of speech of the generated audio. + :param random_state: Random seed for the synthesis process. + :return: A sequence of OrcaWordAlignment objects representing the word alignments. """ - c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate) + c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate, random_state=random_state) + + c_num_alignments = c_int32() + c_alignments = POINTER(POINTER(COrcaWordAlignment))() status = self._synthesize_to_file_func( self._handle, text.encode("utf-8"), c_synthesize_params, - output_path.encode("utf-8")) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + output_path.encode("utf-8"), + byref(c_num_alignments), + byref(c_alignments)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to synthesize speech", message_stack=self._get_error_stack()) + alignments = self._get_alignments(c_num_alignments=c_num_alignments, c_alignments=c_alignments) + self._synthesize_params_delete_func(c_synthesize_params) + return alignments + + def stream_open(self, speech_rate: Optional[float] = None, random_state: Optional[int] = None) -> 'Orca.OrcaStream': + """ + Opens a stream for streaming text synthesis. + + :param speech_rate: Rate of speech of the generated audio. + :param random_state: Random seed for the synthesis process. + :return: An instance of Orca.OrcaStream. 
+ """ + + c_synthesize_params = self._get_c_synthesize_params(speech_rate=speech_rate, random_state=random_state) + + stream_handle = POINTER(Orca.COrcaStream)() + status = self._stream_open_func( + self._handle, + c_synthesize_params, + byref(stream_handle)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to open Orca stream", + message_stack=self._get_error_stack()) + + self._synthesize_params_delete_func(c_synthesize_params) + + return self.OrcaStream(stream_handle, self) + @property def version(self) -> str: """Version.""" return self._version - def _get_c_synthesize_params(self, speech_rate: Optional[float] = None) -> POINTER(COrcaSynthesizeParams): + def _get_alignments( + self, + c_num_alignments: c_int32, + c_alignments: POINTER(POINTER(COrcaWordAlignment))) -> Sequence[WordAlignment]: + alignments = [] + for i in range(c_num_alignments.value): + word_alignment = c_alignments[i].contents + word = word_alignment.word.decode("utf-8") + start_sec = word_alignment.start_sec + end_sec = word_alignment.end_sec + num_phonemes = word_alignment.num_phonemes + phoneme_alignments = [] + for j in range(num_phonemes): + phoneme_alignment = word_alignment.phonemes[j].contents + phoneme = phoneme_alignment.phoneme.decode("utf-8") + phoneme_start_sec = phoneme_alignment.start_sec + phoneme_end_sec = phoneme_alignment.end_sec + phoneme_alignment = self.PhonemeAlignment( + phoneme=phoneme, + start_sec=phoneme_start_sec, + end_sec=phoneme_end_sec) + phoneme_alignments.append(phoneme_alignment) + word_alignment = self.WordAlignment( + word=word, + start_sec=start_sec, + end_sec=end_sec, + phonemes=phoneme_alignments) + alignments.append(word_alignment) + + status = self._word_alignments_delete_func(c_num_alignments.value, c_alignments) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to delete Orca word alignments", + message_stack=self._get_error_stack()) + + return alignments + + def _get_c_synthesize_params( + self, + speech_rate: Optional[float] = None, + random_state: Optional[int] = None) -> POINTER(COrcaSynthesizeParams): c_params = POINTER(self.COrcaSynthesizeParams)() status = self._synthesize_params_init_func(byref(c_params)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to create Orca synthesize params object", message_stack=self._get_error_stack()) if speech_rate is not None: status = self._synthesize_params_set_speech_rate_func(c_params, c_float(speech_rate)) - if status is not self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status]( + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( message="Unable to set Orca speech rate", message_stack=self._get_error_stack()) + if random_state is not None: + status = self._synthesize_params_set_random_state_func(c_params, c_int64(random_state)) + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status]( + message="Unable to set Orca random state", + message_stack=self._get_error_stack()) + return c_params def _get_error_stack(self) -> Sequence[str]: @@ -364,8 +621,8 @@ def _get_error_stack(self) -> Sequence[str]: message_stack_depth = c_int() status = self._get_error_stack_func(byref(message_stack_ref), byref(message_stack_depth)) - if status is not 
self.PicovoiceStatuses.SUCCESS: - raise self._PICOVOICE_STATUS_TO_EXCEPTION[status](message="Unable to get Orca error state") + if status is not PicovoiceStatuses.SUCCESS: + raise _PICOVOICE_STATUS_TO_EXCEPTION[status](message="Unable to get Orca error state") message_stack = list() for i in range(message_stack_depth.value): diff --git a/binding/python/requirements.txt b/binding/python/requirements.txt index 01726e58..e69de29b 100644 --- a/binding/python/requirements.txt +++ b/binding/python/requirements.txt @@ -1,2 +0,0 @@ -editdistance>=0.6.1 -pvleopard==2.0.1 \ No newline at end of file diff --git a/binding/python/setup.py b/binding/python/setup.py index 92c73b84..f0bd08cb 100644 --- a/binding/python/setup.py +++ b/binding/python/setup.py @@ -49,10 +49,10 @@ setuptools.setup( name="pvorca", - version="0.1.4", + version="0.2.1", author="Picovoice", author_email="hello@picovoice.ai", - description="Orca Text-to-Speech Engine.", + description="Orca Streaming Text-to-Speech Engine", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/Picovoice/orca", @@ -66,6 +66,6 @@ "Programming Language :: Python :: 3", "Topic :: Multimedia :: Sound/Audio :: Speech", ], - python_requires='>=3.7', - keywords="Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", + python_requires='>=3.8', + keywords="Streaming Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", ) diff --git a/binding/python/test_orca.py b/binding/python/test_orca.py index c7f0fef5..a5f8f784 100644 --- a/binding/python/test_orca.py +++ b/binding/python/test_orca.py @@ -13,21 +13,21 @@ import os import sys import unittest -from typing import List - -import editdistance -import pvleopard +from typing import List, Sequence from _orca import Orca, OrcaError, OrcaInvalidArgumentError from _util import default_library_path, default_model_path -from test_util import get_model_paths, get_test_data +from test_util import get_model_paths, get_test_data, read_wav_file -test_sentences, wer_threshold = get_test_data() +test_data = get_test_data() class OrcaTestCase(unittest.TestCase): + EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER = "female" + access_key: str orcas: List[Orca] + model_paths: List[str] @classmethod def setUpClass(cls): @@ -37,12 +37,44 @@ def setUpClass(cls): model_path=model_path, library_path=default_library_path('../..')) for model_path in get_model_paths()] + cls.model_paths = get_model_paths() @classmethod def tearDownClass(cls): for orca in cls.orcas: orca.delete() + def _test_audio(self, pcm: Sequence[int], ground_truth: Sequence[int]) -> None: + pcm = pcm[:len(ground_truth)] # compensate for discrepancies due to wav header + self.assertEqual(len(pcm), len(ground_truth)) + for i in range(len(pcm)): + self.assertAlmostEqual(pcm[i], ground_truth[i], delta=500) + + def _test_equal_timestamp(self, timestamp: float, timestamp_truth: float) -> None: + self.assertAlmostEqual(timestamp, timestamp_truth, places=2) + + def _test_phoneme_equal(self, phoneme: Orca.PhonemeAlignment, phoneme_truth: Orca.PhonemeAlignment) -> None: + self.assertEqual(phoneme.phoneme, phoneme_truth.phoneme) + self._test_equal_timestamp(phoneme.start_sec, phoneme_truth.start_sec) + self._test_equal_timestamp(phoneme.end_sec, phoneme_truth.end_sec) + + def _test_word_equal(self, word: Orca.WordAlignment, word_truth: Orca.WordAlignment) -> None: + self.assertEqual(word.word, word_truth.word) + self._test_equal_timestamp(word.start_sec, word_truth.start_sec) + 
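        # Timestamps are compared via assertAlmostEqual to two decimal places,
        # tolerating small cross-platform drift rather than requiring exact equality.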
self._test_equal_timestamp(word.end_sec, word_truth.end_sec) + + self.assertEqual(len(word.phonemes), len(word_truth.phonemes)) + for phoneme, phoneme_truth in zip(word.phonemes, word_truth.phonemes): + self._test_phoneme_equal(phoneme, phoneme_truth) + + @staticmethod + def _get_pcm(model_path: str, audio_data_folder: str, synthesis_type: str = "single") -> Sequence[int]: + test_wav_folder = os.path.join(os.path.dirname(__file__), "../../", audio_data_folder) + model_name = os.path.basename(model_path) + test_wav_path = \ + os.path.join(f"{test_wav_folder}", model_name.replace(".pv", f"_{synthesis_type}.wav")) + return read_wav_file(test_wav_path) + def test_valid_characters(self) -> None: for orca in self.orcas: characters = orca.valid_characters @@ -59,53 +91,89 @@ def test_sample_rate(self) -> None: self.assertGreater(orca.sample_rate, 0) def test_synthesize(self) -> None: - leopard = None - try: - leopard = pvleopard.create(access_key=self.access_key) - except NotImplementedError as e: - pass - - for orca in self.orcas: - pcm = orca.synthesize(test_sentences.text) + for i, orca in enumerate(self.orcas): + pcm, alignment = orca.synthesize(test_data.text, random_state=test_data.random_state) self.assertGreater(len(pcm), 0) - if leopard is None: - continue + ground_truth = self._get_pcm( + model_path=self.model_paths[i], + audio_data_folder=test_data.audio_data_folder, + synthesis_type="single") - ground_truth = test_sentences.text_no_punctuation.split() - predicted, _ = leopard.process(pcm) + self._test_audio(pcm=pcm, ground_truth=ground_truth) - wer = editdistance.eval(predicted.split(), ground_truth) / len(ground_truth) + def test_synthesize_alignment_exact(self) -> None: + orca = [ + orca for i, orca in enumerate(self.orcas) if + self.EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER in self.model_paths[i]].pop() + pcm, alignments = orca.synthesize(test_data.text_alignment, random_state=test_data.random_state) + self.assertGreater(len(pcm), 0) - if wer > wer_threshold: - print("Ground truth transcript: `%s`" % " ".join(ground_truth)) - print("Predicted transcript from synthesized audio: `%s`" % predicted) - print("=> WER: %.2f" % wer) - self.assertTrue(wer <= wer_threshold) + self.assertTrue(len(alignments) == len(test_data.alignments)) + for word, word_truth in zip(alignments, test_data.alignments): + self._test_word_equal(word, word_truth) + + def test_synthesize_alignment(self) -> None: + for i, orca in enumerate(self.orcas): + if self.EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER in self.model_paths[i]: + continue + + pcm, alignments = orca.synthesize(test_data.text_alignment, random_state=test_data.random_state) + self.assertGreater(len(pcm), 0) + + previous_word_end_sec = 0 + previous_phoneme_end_sec = 0 + for word in alignments: + self.assertTrue(word.start_sec == previous_word_end_sec) + self.assertTrue(word.end_sec > word.start_sec) + previous_word_end_sec = word.end_sec + + for phoneme in word.phonemes: + self.assertTrue(phoneme.start_sec == previous_phoneme_end_sec) + self.assertTrue(phoneme.start_sec >= word.start_sec) + self.assertTrue(phoneme.end_sec <= word.end_sec) + self.assertTrue(phoneme.end_sec > phoneme.start_sec) + previous_phoneme_end_sec = phoneme.end_sec + + def test_streaming_synthesis(self) -> None: + for i, orca in enumerate(self.orcas): + stream = orca.stream_open(random_state=test_data.random_state) + pcm = [] + for c in test_data.text: + pcm_chunk = stream.synthesize(c) + if pcm_chunk is not None: + pcm.extend(pcm_chunk) + pcm_chunk = stream.flush() + if pcm_chunk is not 
None: + pcm.extend(pcm_chunk) + stream.close() + + ground_truth = self._get_pcm( + model_path=self.model_paths[i], + audio_data_folder=test_data.audio_data_folder, + synthesis_type="stream") + + self._test_audio(pcm=pcm, ground_truth=ground_truth) def test_synthesize_custom_pron(self) -> None: for orca in self.orcas: - pcm_custom = orca.synthesize(test_sentences.text_custom_pronunciation) - self.assertGreater(len(pcm_custom), 0) + pcm, _ = orca.synthesize(test_data.text_custom_pronunciation) + self.assertGreater(len(pcm), 0) def test_synthesize_speech_rate(self) -> None: for orca in self.orcas: - pcm_fast = orca.synthesize(test_sentences.text, speech_rate=1.3) - pcm_slow = orca.synthesize(test_sentences.text, speech_rate=0.7) + pcm_fast, _ = orca.synthesize(test_data.text, speech_rate=1.3) + pcm_slow, _ = orca.synthesize(test_data.text, speech_rate=0.7) self.assertLess(len(pcm_fast), len(pcm_slow)) - try: - _ = orca.synthesize(test_sentences.text, speech_rate=9999) - except OrcaError: - pass - else: - self.fail("Expected OrcaError") + with self.assertRaises(OrcaError): + _ = orca.synthesize(test_data.text, speech_rate=9999) def test_synthesize_to_file(self) -> None: for orca in self.orcas: output_path = os.path.join(os.path.dirname(__file__), "output.wav") - orca.synthesize_to_file(test_sentences.text, output_path=output_path) + orca.synthesize_to_file(test_data.text, output_path=output_path) self.assertTrue(os.path.isfile(output_path)) os.remove(output_path) @@ -117,7 +185,7 @@ def test_version(self) -> None: def test_invalid_input(self) -> None: for orca in self.orcas: - for sentence in test_sentences.text_invalid: + for sentence in test_data.text_invalid: with self.assertRaises(OrcaInvalidArgumentError): orca.synthesize(sentence) @@ -159,7 +227,7 @@ def test_process_message_stack(self): orca._handle = None try: - res = orca.synthesize(test_sentences.text) + res = orca.synthesize(test_data.text) self.assertEqual(len(res), 0) except OrcaError as e: self.assertGreater(len(e.message_stack), 0) diff --git a/binding/python/test_orca_perf.py b/binding/python/test_orca_perf.py index f997efad..e5a3838b 100644 --- a/binding/python/test_orca_perf.py +++ b/binding/python/test_orca_perf.py @@ -19,13 +19,13 @@ from _util import default_library_path from test_util import get_model_paths, get_test_data -test_sentences, _ = get_test_data() +test_data = get_test_data() class OrcaPerformanceTestCase(unittest.TestCase): access_key: str num_test_iterations: int - proc_performance_threshold_sec: float + proc_performance_threshold_rtf: float def test_performance_proc(self) -> None: for model_path in get_model_paths(): @@ -34,29 +34,31 @@ def test_performance_proc(self) -> None: library_path=default_library_path('../..'), model_path=model_path) - perf_results = list() + num_audio_seconds = 0 + num_proc_seconds = 0 for i in range(self.num_test_iterations): start = perf_counter() - _ = orca.synthesize(test_sentences.text) + pcm, _ = orca.synthesize(test_data.text) if i > 0: - perf_results.append(perf_counter() - start) + num_audio_seconds += len(pcm) / orca.sample_rate + num_proc_seconds += perf_counter() - start orca.delete() - avg_perf = sum(perf_results) / self.num_test_iterations - print("Average proc performance [model=%s]: %s seconds" % (os.path.basename(model_path), avg_perf)) - self.assertLess(avg_perf, self.proc_performance_threshold_sec) + real_time_factor = num_audio_seconds / num_proc_seconds + print("Average proc performance[model=%s]: RTF = %s " % (os.path.basename(model_path), real_time_factor)) + 
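        # real_time_factor is seconds of audio generated per second of processing
        # time; higher means faster synthesis, so assert a lower bound.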
self.assertGreater(real_time_factor, self.proc_performance_threshold_rtf) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--access-key', required=True) parser.add_argument('--num-test-iterations', type=int, required=True) - parser.add_argument('--proc-performance-threshold-sec', type=float, required=True) + parser.add_argument('--proc-performance-threshold-rtf', type=float, required=True) args = parser.parse_args() OrcaPerformanceTestCase.access_key = args.access_key OrcaPerformanceTestCase.num_test_iterations = args.num_test_iterations - OrcaPerformanceTestCase.proc_performance_threshold_sec = args.proc_performance_threshold_sec + OrcaPerformanceTestCase.proc_performance_threshold_rtf = args.proc_performance_threshold_rtf unittest.main(argv=sys.argv[:1]) diff --git a/binding/python/test_util.py b/binding/python/test_util.py index 9dd06468..210ccc10 100644 --- a/binding/python/test_util.py +++ b/binding/python/test_util.py @@ -11,25 +11,31 @@ import json import os +import struct +import wave from dataclasses import dataclass -from typing import Sequence, Tuple +from typing import List, Sequence -from typing import List +from _orca import Orca @dataclass -class TestSentences: +class TestData: text: str text_no_punctuation: str text_custom_pronunciation: str + text_alignment: str text_invalid: Sequence[str] + alignments: Sequence[Orca.WordAlignment] + random_state: int + audio_data_folder: str -def get_test_data() -> Tuple[TestSentences, float]: - data_file_path = os.path.join(os.path.dirname(__file__), "../../resources/.test/test_data.json") - with open(data_file_path, encoding="utf8") as data_file: - test_data = json.loads(data_file.read()) - return TestSentences(**test_data["test_sentences"]), test_data["wer_threshold"] +def read_wav_file(path: str) -> Sequence[int]: + with wave.open(path, 'rb') as f: + buffer = f.readframes(f.getnframes()) + # minus 4 because of the header + return struct.unpack(f"{f.getnframes() - 4}h", buffer) def get_model_paths() -> List[str]: @@ -37,7 +43,39 @@ def get_model_paths() -> List[str]: return [os.path.join(model_folder, model_name) for model_name in os.listdir(model_folder)] +def get_test_data() -> TestData: + data_file_path = os.path.join(os.path.dirname(__file__), "../../resources/.test/test_data.json") + with open(data_file_path, encoding="utf8") as data_file: + test_data = json.loads(data_file.read()) + + alignments = [] + for word_data in test_data["alignments"]: + phonemes = [] + for phoneme_data in word_data["phonemes"]: + phoneme = Orca.PhonemeAlignment( + phoneme=phoneme_data["phoneme"], + start_sec=phoneme_data["start_sec"], + end_sec=phoneme_data["end_sec"]) + phonemes.append(phoneme) + + word = Orca.WordAlignment( + word=word_data["word"], + start_sec=word_data["start_sec"], + end_sec=word_data["end_sec"], + phonemes=phonemes) + alignments.append(word) + + test_data = TestData( + alignments=alignments, + random_state=test_data["random_state"], + audio_data_folder=test_data["audio_data_folder"], + **test_data["test_sentences"]) + + return test_data + + __all__ = [ "get_test_data", "get_model_paths", + "read_wav_file", ] diff --git a/binding/web/.gitignore b/binding/web/.gitignore index cf2f85e9..4c610367 100644 --- a/binding/web/.gitignore +++ b/binding/web/.gitignore @@ -1,7 +1,6 @@ node_modules dist lib/pv_orca*.wasm -cypress/fixtures/.test/* +cypress/fixtures/resources/* test/orca_params*.js test/orca_params*.pv -test/leopard_params.pv diff --git a/binding/web/README.md b/binding/web/README.md index 
4d002fcb..ad97cc5f 100644 --- a/binding/web/README.md +++ b/binding/web/README.md @@ -1,14 +1,16 @@ # Orca Binding for Web -## Orca Text-to-Speech Engine +## Orca Streaming Text-to-Speech Engine Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency +voice assistants. Orca is: - Private; All voice processing runs locally. - Cross-Platform: - Linux (x86_64), macOS (x86_64, arm64), and Windows (x86_64) + - Android and iOS - Chrome, Safari, Firefox, and Edge - Raspberry Pi (3, 4, 5) and NVIDIA Jetson Nano @@ -122,6 +124,13 @@ const orca = await OrcaWorker.create( ); ``` +### Streaming vs. Single Synthesis + +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, the complete text needs to be known in advance and is synthesized in a single call to +the Orca engine. + ### Custom Pronunciations Orca allows the embedding of custom pronunciations in the text via the syntax: `{word|pronunciation}`. The pronunciation @@ -131,33 +140,96 @@ is expressed in [ARPAbet](https://en.wikipedia.org/wiki/ARPABET) phonemes, for e - "{read|R IY D} this as {read|R EH D}, please." - "I {live|L IH V} in {Sevilla|S EH V IY Y AH}. We have great {live|L AY V} sports!" -### Synthesize Speech +### Orca Properties + +To obtain the complete set of valid characters, call `.validCharacters`. To retrieve the maximum number of +characters allowed, call `.maxCharacterLimit`. The sample rate of the generated `Int16Array` is `.sampleRate`. -The `synthesize` function will send the text to the engine and return the speech audio as an `Int16Array`. +### Usage + +#### Streaming Synthesis + +To use streaming synthesis, call `streamOpen` to create an `OrcaStream` object. ```typescript -const speechPcm = await orca.synthesize("${TEXT}"); +const orcaStream = await orca.streamOpen(); +``` + +Then, call `synthesize` on `orcaStream` to generate speech from a stream of text: + +```typescript +function* textStream(): IterableIterator { + ... // yield text chunks e.g. from an LLM response +} + +for (const textChunk of textStream()) { + const pcm = await orcaStream.synthesize(textChunk); + if (pcm !== null) { + // handle pcm + } +} +``` + +The `OrcaStream` object buffers input text until there is enough to generate audio. If there is not enough text to generate +audio, `null` is returned. + +When done, call `flush` to synthesize any remaining text, and `close` to delete the `orcaStream` object. + +```typescript +const flushedPcm = orcaStream.flush(); +if (flushedPcm !== null) { + // handle pcm +} + +orcaStream.close(); +``` + +#### Single Synthesis + +To use single synthesis, simply call `synthesize` directly on the `Orca` instance. The `synthesize` function will send +the text to the engine and return the speech audio data as an `Int16Array` as well as +the [alignments metadata](#alignments-metadata). + +```typescript +const { pcm, alignments } = await orca.synthesize("${TEXT}"); ``` ### Speech Control -Orca allows for an additional argument to be provided to the `synthesize` method to control the synthesized speech: +Orca allows for additional arguments to control the synthesized speech. 
+These can be provided to `streamOpen` or one of the single mode `synthesize` methods: - `speechRate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher value produces speech that is faster, and a lower value produces speech that is slower. The default value is `1.0`. ```typescript const synthesizeParams = { - speechRate: 1.3 + speechRate: 1.3, }; -const speechPcm = await orca.synthesize("${TEXT}", synthesizeParams); +// Streaming synthesis +const OrcaStream = await orca.streamOpen(synthesizeParams); + +// Single synthesis +const result = await orca.synthesize("${TEXT}", synthesizeParams); + ``` -### Orca Properties +### Alignments Metadata -To obtain the complete set of valid characters, call `.validCharacters`. To retrieve the maximum number of -characters allowed, call `.maxCharacterLimit`. The sample rate of Orca is `.sampleRate`. +Along with the raw PCM or saved audio file, Orca returns metadata for the synthesized audio in single synthesis mode. +The `OrcaAlignment` object has the following properties: + +- **Word:** String representation of the word. +- **Start Time:** Indicates when the word started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the word ended in the synthesized audio. Value is in seconds. +- **Phonemes:** An array of `OrcaPhoneme` objects. + +The `OrcaPhoneme` object has the following properties: + +- **Phoneme:** String representation of the phoneme. +- **Start Time:** Indicates when the phoneme started in the synthesized audio. Value is in seconds. +- **End Time:** Indicates when the phoneme ended in the synthesized audio. Value is in seconds. ### Clean Up diff --git a/binding/web/cypress.config.ts b/binding/web/cypress.config.ts index 95dea01a..ff2969e8 100644 --- a/binding/web/cypress.config.ts +++ b/binding/web/cypress.config.ts @@ -6,8 +6,8 @@ export default defineConfig({ 'PROC_PERFORMANCE_THRESHOLD_SEC': 10, }, e2e: { + supportFile: 'cypress/support/index.ts', defaultCommandTimeout: 30000, - supportFile: false, specPattern: 'test/*.test.{js,jsx,ts,tsx}', video: false, screenshotOnRunFailure: false, diff --git a/binding/web/cypress/support/commands.ts b/binding/web/cypress/support/commands.ts new file mode 100644 index 00000000..2c2e7297 --- /dev/null +++ b/binding/web/cypress/support/commands.ts @@ -0,0 +1,9 @@ + +const WAV_HEADER_SIZE = 44; + +Cypress.Commands.add("getFramesFromFile", (path: string) => { + cy.fixture(path, 'base64').then(Cypress.Blob.base64StringToBlob).then(async blob => { + const data = new Int16Array(await blob.arrayBuffer()); + return data.slice(WAV_HEADER_SIZE / Int16Array.BYTES_PER_ELEMENT); + }); +}); diff --git a/binding/web/cypress/support/index.ts b/binding/web/cypress/support/index.ts new file mode 100644 index 00000000..a4db0a5e --- /dev/null +++ b/binding/web/cypress/support/index.ts @@ -0,0 +1,9 @@ +import './commands'; + +declare global { + namespace Cypress { + interface Chainable { + getFramesFromFile(path: string): Chainable; + } + } +} diff --git a/binding/web/package.json b/binding/web/package.json index 2595abfe..cd740bab 100644 --- a/binding/web/package.json +++ b/binding/web/package.json @@ -3,7 +3,7 @@ "description": "Orca Text-to-Speech engine for web browsers (via WebAssembly)", "author": "Picovoice Inc", "license": "Apache-2.0", - "version": "0.1.1", + "version": "0.2.0", "keywords": [ "orca", "web", diff --git a/binding/web/scripts/setup_test.js b/binding/web/scripts/setup_test.js index a9aee392..a68d22d6 100644 --- 
a/binding/web/scripts/setup_test.js +++ b/binding/web/scripts/setup_test.js @@ -4,7 +4,7 @@ const { join } = require('path'); console.log('Copying the orca & leopard models...'); const testDirectory = join(__dirname, '..', 'test'); -const fixturesDirectory = join(__dirname, '..', 'cypress', 'fixtures'); +const fixturesDirectory = join(__dirname, '..', 'cypress', 'fixtures', 'resources'); const paramsSourceDirectory = join( __dirname, @@ -21,16 +21,7 @@ const sourceDirectory = join( '..', '..', 'resources', -); - -const testingModelFilesSourceDirectory = join( - __dirname, - '..', - '..', - '..', - 'resources', '.test', - 'models', ); try { @@ -40,12 +31,12 @@ try { fs.copyFileSync(join(paramsSourceDirectory, file), join(testDirectory, file)); }); - fs.readdirSync(testingModelFilesSourceDirectory).forEach(file => { - fs.copyFileSync(join(testingModelFilesSourceDirectory, file), join(testDirectory, file)); - }); + fs.mkdirSync(join(fixturesDirectory, '.test', 'wav'), { recursive: true }); + fs.copyFileSync(join(sourceDirectory, 'test_data.json'), join(fixturesDirectory, '.test', 'test_data.json')); - fs.mkdirSync(join(fixturesDirectory, '.test'), { recursive: true }); - fs.copyFileSync(join(sourceDirectory, '.test', 'test_data.json'), join(fixturesDirectory, '.test', 'test_data.json')); + fs.readdirSync(join(sourceDirectory, 'wav')).forEach(file => { + fs.copyFileSync(join(sourceDirectory, 'wav', file), join(fixturesDirectory, '.test', 'wav', file)); + }); } catch (error) { console.error(error); } diff --git a/binding/web/src/index.ts b/binding/web/src/index.ts index 728a928c..dcd7a8f0 100644 --- a/binding/web/src/index.ts +++ b/binding/web/src/index.ts @@ -1,8 +1,12 @@ -import { Orca } from './orca'; -import { OrcaWorker } from './orca_worker'; +import { Orca, OrcaStream } from './orca'; +import { OrcaWorker, OrcaStreamWorker } from './orca_worker'; import { OrcaModel, + OrcaSynthesizeParams, + OrcaPhoneme, + OrcaAlignment, + OrcaSynthesizeResult, OrcaWorkerInitRequest, OrcaWorkerSynthesizeRequest, OrcaWorkerReleaseRequest, @@ -26,8 +30,15 @@ OrcaWorker.setWasmSimd(orcaWasmSimd); export { Orca, + OrcaStream, + OrcaErrors, OrcaModel, + OrcaSynthesizeParams, + OrcaPhoneme, + OrcaAlignment, + OrcaSynthesizeResult, OrcaWorker, + OrcaStreamWorker, OrcaWorkerInitRequest, OrcaWorkerSynthesizeRequest, OrcaWorkerReleaseRequest, @@ -37,5 +48,4 @@ export { OrcaWorkerReleaseResponse, OrcaWorkerFailureResponse, OrcaWorkerResponse, - OrcaErrors, }; diff --git a/binding/web/src/orca.ts b/binding/web/src/orca.ts index 40e2282b..07cf52ab 100644 --- a/binding/web/src/orca.ts +++ b/binding/web/src/orca.ts @@ -13,6 +13,8 @@ import { Mutex } from 'async-mutex'; +import { simd } from 'wasm-feature-detect'; + import { aligned_alloc_type, arrayBufferToStringAtIndex, @@ -23,9 +25,15 @@ import { PvError, } from '@picovoice/web-utils'; -import { simd } from 'wasm-feature-detect'; - -import { OrcaModel, PvStatus, SynthesizeParams } from './types'; +import { + OrcaAlignment, + OrcaModel, + OrcaPhoneme, + OrcaStreamSynthesizeResult, + OrcaSynthesizeParams, + OrcaSynthesizeResult, + PvStatus, +} from './types'; import * as OrcaErrors from './orca_errors'; import { pvStatusToException } from './orca_errors'; @@ -38,109 +46,379 @@ type pv_orca_delete_type = (object: number) => Promise; type pv_orca_valid_characters_type = (object: number, numCharacters: number, validCharacters: number) => Promise; type pv_orca_valid_characters_delete_type = (validCharacters: number) => Promise; type pv_orca_sample_rate_type = (object: 
number, sampleRate: number) => Promise; -type pv_orca_max_character_limit_type = () => Promise; +type pv_orca_max_character_limit_type = (object: number, maxCharacterLimit: number) => Promise; type pv_orca_synthesize_params_init_type = (object: number) => Promise; type pv_orca_synthesize_params_delete_type = (object: number) => Promise; type pv_orca_synthesize_params_set_speech_rate_type = (object: number, speechRate: number) => Promise; -type pv_orca_synthesize_type = (object: number, text: number, synthesizeParams: number, numSamples: number, pcm: number) => Promise; -type pv_orca_delete_pcm_type = (object: number) => Promise; +type pv_orca_synthesize_params_set_random_state_type = (object: number, randomState: bigint) => Promise; +type pv_orca_synthesize_type = (object: number, text: number, synthesizeParams: number, numSamples: number, pcm: number, numAlignments: number, alignments: number) => Promise; +type pv_orca_pcm_delete_type = (object: number) => Promise; +type pv_orca_word_alignments_delete_type = (numAlignments: number, alignments: number) => Promise; +type pv_orca_stream_open_type = (object: number, synthesizeParams: number, stream: number) => Promise; +type pv_orca_stream_synthesize_type = (object: number, text: number, numSamples: number, pcm: number) => Promise; +type pv_orca_stream_flush_type = (object: number, numSamples: number, pcm: number) => Promise; +type pv_orca_stream_close_type = (object: number) => Promise; type pv_orca_version_type = () => Promise; -type pv_status_to_string_type = (status: number) => Promise type pv_set_sdk_type = (sdk: number) => Promise; type pv_get_error_stack_type = (messageStack: number, messageStackDepth: number) => Promise; type pv_free_error_stack_type = (messageStack: number) => Promise; -/** - * JavaScript/WebAssembly Binding for Orca - */ - type OrcaWasmOutput = { - alignedAlloc: aligned_alloc_type; - memory: WebAssembly.Memory; - pvFree: pv_free_type; - version: string; sampleRate: number; - maxCharacterLimit: number; validCharacters: string[]; + maxCharacterLimit: number; - objectAddress: number; - inputBufferAddress: number; - synthesizeParamsAddressAddress: number; - speechRateAddress: number; + memory: WebAssembly.Memory; + alignedAlloc: aligned_alloc_type; + pvFree: pv_free_type; + pvGetErrorStack: pv_get_error_stack_type; + pvFreeErrorStack: pv_free_error_stack_type; messageStackAddressAddressAddress: number; messageStackDepthAddress: number; + objectAddress: number; pvOrcaDelete: pv_orca_delete_type; pvOrcaSynthesize: pv_orca_synthesize_type; pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init_type; pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete_type; pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate_type - pvOrcaDeletePcm: pv_orca_delete_pcm_type; - pvStatusToString: pv_status_to_string_type; - pvGetErrorStack: pv_get_error_stack_type; - pvFreeErrorStack: pv_free_error_stack_type; + pvOrcaSynthesizeParamsSetRandomState: pv_orca_synthesize_params_set_random_state_type + pvOrcaPcmDelete: pv_orca_pcm_delete_type; + pvOrcaWordAlignmentsDelete: pv_orca_word_alignments_delete_type; + + streamPcmAddressAddress: number; + pvOrcaStreamOpen: pv_orca_stream_open_type; + pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type; + pvOrcaStreamFlush: pv_orca_stream_flush_type; + pvOrcaStreamClose: pv_orca_stream_close_type; }; -const PV_STATUS_SUCCESS = 10000; - -export class Orca { - private readonly _pvOrcaDelete: pv_orca_delete_type; - private readonly _pvOrcaSynthesize: 
pv_orca_synthesize_type; - private readonly _pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init_type; - private readonly _pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete_type; - private readonly _pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate_type; - private readonly _pvOrcaDeletePcm: pv_orca_delete_pcm_type; +/** + * OrcaStream object that converts a stream of text to a stream of audio. + */ +class Stream { + private _wasmMemory: WebAssembly.Memory; + private readonly _alignedAlloc: CallableFunction; + private readonly _pvFree: pv_free_type; private readonly _pvGetErrorStack: pv_get_error_stack_type; private readonly _pvFreeErrorStack: pv_free_error_stack_type; + private readonly _messageStackAddressAddressAddress: number; + private readonly _messageStackDepthAddress: number; - private _wasmMemory: WebAssembly.Memory | undefined; + private readonly _functionMutex: Mutex; + private readonly _streamPcmAddressAddress: number; + private readonly _pvOrcaPcmDelete: pv_orca_pcm_delete_type; + private readonly _pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type; + private readonly _pvOrcaStreamFlush: pv_orca_stream_flush_type; + private readonly _pvOrcaStreamClose: pv_orca_stream_close_type; + private readonly _streamAddress: number; + private readonly _getMessageStack: any; + + constructor( + wasmMemory: WebAssembly.Memory, + alignedAlloc: CallableFunction, + pvFree: pv_free_type, + pvGetErrorStack: pv_get_error_stack_type, + pvFreeErrorStack: pv_free_error_stack_type, + messageStackAddressAddressAddress: number, + messageStackDepthAddress: number, + functionMutex: Mutex, + streamPcmAddressAddress: number, + pvOrcaPcmDelete: pv_orca_pcm_delete_type, + pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type, + pvOrcaStreamFlush: pv_orca_stream_flush_type, + pvOrcaStreamClose: pv_orca_stream_close_type, + streamAddress: number, + getMessageStack: any, + ) { + this._wasmMemory = wasmMemory; + this._alignedAlloc = alignedAlloc; + this._pvFree = pvFree; + this._pvGetErrorStack = pvGetErrorStack; + this._pvFreeErrorStack = pvFreeErrorStack; + this._messageStackAddressAddressAddress = messageStackAddressAddressAddress; + this._messageStackDepthAddress = messageStackDepthAddress; + this._functionMutex = functionMutex; + this._streamPcmAddressAddress = streamPcmAddressAddress; + this._pvOrcaPcmDelete = pvOrcaPcmDelete; + this._pvOrcaStreamSynthesize = pvOrcaStreamSynthesize; + this._pvOrcaStreamFlush = pvOrcaStreamFlush; + this._pvOrcaStreamClose = pvOrcaStreamClose; + this._streamAddress = streamAddress; + this._getMessageStack = getMessageStack; + } - private readonly _pvFree: pv_free_type; - private readonly _synthesizeMutex: Mutex; + /** + * Adds a chunk of text to the Stream object and generates audio if enough text has been added. + * This function is expected to be called multiple times with consecutive chunks of text from a text stream. + * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + * buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + * for the remaining text that has not yet been synthesized. + * + * @param text A chunk of text from a text input stream, comprised of valid characters. + * Valid characters can be retrieved by calling `validCharacters`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * They need to be added in a single call to this function. 
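Because a `{word|pronunciation}` tag must reach this function in one piece, callers that re-chunk text (e.g. token by token from an LLM) should keep each tag intact. A hypothetical helper sketching one way to do that, assuming an open `orcaStream`; `safeChunks` is illustrative and not part of the binding:

```typescript
// Split text for streaming while keeping {word|pronunciation} tags whole.
// Each match is either a complete {...} tag or a run of text without braces.
function safeChunks(text: string): string[] {
  return text.match(/\{[^}]*\}|[^{}]+/g) ?? [];
}

for (const chunk of safeChunks('I {live|L IH V} in {Sevilla|S EH V IY Y AH}.')) {
  const pcm = await orcaStream.synthesize(chunk);
  if (pcm !== null) {
    // handle pcm
  }
}
```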
+ * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + */ + public async synthesize(text: string): Promise { + if (typeof text !== 'string') { + throw new OrcaErrors.OrcaInvalidArgumentError( + 'The argument \'text\' must be provided as a string', + ); + } - private readonly _objectAddress: number; - private readonly _alignedAlloc: CallableFunction; - private readonly _inputBufferAddress: number; - private readonly _messageStackAddressAddressAddress: number; - private readonly _messageStackDepthAddress: number; + return new Promise((resolve, reject) => { + this._functionMutex + .runExclusive(async () => { + if (this._wasmMemory === undefined) { + throw new OrcaErrors.OrcaInvalidStateError( + 'Attempted to call Orca stream synthesize after release.', + ); + } + + const memoryBufferText = new Uint8Array(this._wasmMemory.buffer); + const encodedText = new TextEncoder().encode(text); + const textAddress = await this._alignedAlloc( + Uint8Array.BYTES_PER_ELEMENT, + (encodedText.length + 1) * Uint8Array.BYTES_PER_ELEMENT, + ); + if (textAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError( + 'malloc failed: Cannot allocate memory', + ); + } + memoryBufferText.set(encodedText, textAddress); + memoryBufferText[textAddress + encodedText.length] = 0; + + const numSamplesAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (numSamplesAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const streamSynthesizeStatus = await this._pvOrcaStreamSynthesize( + this._streamAddress, + textAddress, + numSamplesAddress, + this._streamPcmAddressAddress, + ); + await this._pvFree(textAddress); + + const memoryBufferView = new DataView(this._wasmMemory.buffer); + const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + + if (streamSynthesizeStatus !== PvStatus.SUCCESS) { + const messageStack = await this._getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + throw pvStatusToException(streamSynthesizeStatus, 'Stream synthesize failed', messageStack); + } + + const pcmAddress = memoryBufferView.getInt32( + this._streamPcmAddressAddress, + true, + ); + + const numSamples = memoryBufferView.getInt32( + numSamplesAddress, + true, + ); + await this._pvFree(numSamplesAddress); + + const outputMemoryBuffer = new Int16Array(this._wasmMemory.buffer); + const pcm = outputMemoryBuffer.slice( + pcmAddress / Int16Array.BYTES_PER_ELEMENT, + (pcmAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + ); + await this._pvOrcaPcmDelete(pcmAddress); + + return pcm.length > 0 ? pcm : null; + }) + .then((result: OrcaStreamSynthesizeResult) => { + resolve(result); + }) + .catch(async (error: any) => { + reject(error); + }); + }); + } + + /** + * Generates audio for all the buffered text that was added to the OrcaStream object + * via `OrcaStream.synthesize()`. + * + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. 
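Since individual calls may return `null`, a caller that needs one contiguous buffer can collect the non-`null` chunks and join them afterwards; a minimal sketch:

```typescript
// Concatenate the PCM chunks produced by a stream into a single Int16Array.
function concatPcm(chunks: Int16Array[]): Int16Array {
  const total = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
  const out = new Int16Array(total);
  let offset = 0;
  for (const chunk of chunks) {
    out.set(chunk, offset);
    offset += chunk.length;
  }
  return out;
}
```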
+ */ + public async flush(): Promise { + return new Promise((resolve, reject) => { + this._functionMutex + .runExclusive(async () => { + if (this._wasmMemory === undefined) { + throw new OrcaErrors.OrcaInvalidStateError('Attempted to call OrcaStream flush after release.'); + } + + const numSamplesAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (numSamplesAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const pcmAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (pcmAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const streamFlushStatus = await this._pvOrcaStreamFlush( + this._streamAddress, + numSamplesAddress, + pcmAddressAddress, + ); + + const memoryBufferView = new DataView(this._wasmMemory.buffer); + const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + + if (streamFlushStatus !== PvStatus.SUCCESS) { + const messageStack = await this._getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(streamFlushStatus, 'Flush failed', messageStack); + } + + const pcmAddress = memoryBufferView.getInt32( + pcmAddressAddress, + true, + ); + await this._pvFree(pcmAddressAddress); + + const numSamples = memoryBufferView.getInt32( + numSamplesAddress, + true, + ); + await this._pvFree(numSamplesAddress); + + const outputMemoryBuffer = new Int16Array(this._wasmMemory.buffer); + const pcm = outputMemoryBuffer.slice( + pcmAddress / Int16Array.BYTES_PER_ELEMENT, + (pcmAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + ); + await this._pvOrcaPcmDelete(pcmAddress); + + return pcm.length > 0 ? pcm : null; + }) + .then((result: OrcaStreamSynthesizeResult) => { + resolve(result); + }) + .catch(async (error: any) => { + reject(error); + }); + }); + } + + /** + * Releases the resources acquired by the OrcaStream object. 
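A typical stream lifecycle flushes the buffered text and then closes the stream even if synthesis throws; a minimal sketch of that pattern, assuming an `orca` instance is in scope:

```typescript
const stream = await orca.streamOpen();
try {
  for (const chunk of ['Hello, ', 'world!']) {
    const pcm = await stream.synthesize(chunk);
    if (pcm !== null) {
      // handle pcm
    }
  }
  const remaining = await stream.flush();
  if (remaining !== null) {
    // handle the final audio chunk
  }
} finally {
  // Always release the native resources held by the stream.
  await stream.close();
}
```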
+ */ + public async close(): Promise { + await this._pvOrcaStreamClose(this._streamAddress); + } +} + +export type OrcaStream = Stream + +/** + * JavaScript/WebAssembly Binding for Orca + */ +export class Orca { private static _version: string; private static _sampleRate: number; - private static _maxCharacterLimit: number; private static _validCharacters: string[]; + private static _maxCharacterLimit: number; + + private _wasmMemory?: WebAssembly.Memory; + private readonly _alignedAlloc: CallableFunction; + private readonly _pvFree: pv_free_type; + private readonly _pvGetErrorStack: pv_get_error_stack_type; + private readonly _pvFreeErrorStack: pv_free_error_stack_type; + private readonly _messageStackAddressAddressAddress: number; + private readonly _messageStackDepthAddress: number; + + private readonly _objectAddress: number; + private readonly _pvOrcaDelete: pv_orca_delete_type; + private readonly _pvOrcaSynthesize: pv_orca_synthesize_type; + private readonly _pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init_type; + private readonly _pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete_type; + private readonly _pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate_type; + private readonly _pvOrcaSynthesizeParamsSetRandomState: pv_orca_synthesize_params_set_random_state_type; + private readonly _pvOrcaPcmDelete: pv_orca_pcm_delete_type; + private readonly _pvOrcaWordAlignmentsDelete: pv_orca_word_alignments_delete_type; + + private readonly _streamPcmAddressAddress: number; + private readonly _pvOrcaStreamOpen: pv_orca_stream_open_type; + private readonly _pvOrcaStreamSynthesize: pv_orca_stream_synthesize_type; + private readonly _pvOrcaStreamFlush: pv_orca_stream_flush_type; + private readonly _pvOrcaStreamClose: pv_orca_stream_close_type; + private readonly _functionMutex: Mutex; + private static _wasm: string; private static _wasmSimd: string; private static _sdk: string = 'web'; private static _orcaMutex = new Mutex(); - private constructor( - handleWasm: OrcaWasmOutput, - ) { + private constructor(handleWasm: OrcaWasmOutput) { Orca._version = handleWasm.version; Orca._sampleRate = handleWasm.sampleRate; - Orca._maxCharacterLimit = handleWasm.maxCharacterLimit; Orca._validCharacters = handleWasm.validCharacters; + Orca._maxCharacterLimit = handleWasm.maxCharacterLimit; + this._wasmMemory = handleWasm.memory; + this._alignedAlloc = handleWasm.alignedAlloc; + this._pvFree = handleWasm.pvFree; + this._pvGetErrorStack = handleWasm.pvGetErrorStack; + this._pvFreeErrorStack = handleWasm.pvFreeErrorStack; + this._messageStackAddressAddressAddress = handleWasm.messageStackAddressAddressAddress; + this._messageStackDepthAddress = handleWasm.messageStackDepthAddress; + + this._objectAddress = handleWasm.objectAddress; this._pvOrcaDelete = handleWasm.pvOrcaDelete; this._pvOrcaSynthesize = handleWasm.pvOrcaSynthesize; this._pvOrcaSynthesizeParamsInit = handleWasm.pvOrcaSynthesizeParamsInit; this._pvOrcaSynthesizeParamsDelete = handleWasm.pvOrcaSynthesizeParamsDelete; this._pvOrcaSynthesizeParamsSetSpeechRate = handleWasm.pvOrcaSynthesizeParamsSetSpeechRate; - this._pvOrcaDeletePcm = handleWasm.pvOrcaDeletePcm; - this._pvGetErrorStack = handleWasm.pvGetErrorStack; - this._pvFreeErrorStack = handleWasm.pvFreeErrorStack; + this._pvOrcaSynthesizeParamsSetRandomState = handleWasm.pvOrcaSynthesizeParamsSetRandomState; + this._pvOrcaPcmDelete = handleWasm.pvOrcaPcmDelete; + this._pvOrcaWordAlignmentsDelete = handleWasm.pvOrcaWordAlignmentsDelete; - 
this._alignedAlloc = handleWasm.alignedAlloc; - this._wasmMemory = handleWasm.memory; - this._pvFree = handleWasm.pvFree; - this._objectAddress = handleWasm.objectAddress; - this._inputBufferAddress = handleWasm.inputBufferAddress; - this._messageStackAddressAddressAddress = handleWasm.messageStackAddressAddressAddress; - this._messageStackDepthAddress = handleWasm.messageStackDepthAddress; + this._streamPcmAddressAddress = handleWasm.streamPcmAddressAddress; + this._pvOrcaStreamOpen = handleWasm.pvOrcaStreamOpen; + this._pvOrcaStreamSynthesize = handleWasm.pvOrcaStreamSynthesize; + this._pvOrcaStreamFlush = handleWasm.pvOrcaStreamFlush; + this._pvOrcaStreamClose = handleWasm.pvOrcaStreamClose; - this._synthesizeMutex = new Mutex(); + this._functionMutex = new Mutex(); } /** @@ -158,17 +436,17 @@ export class Orca { } /** - * Get maximum character limit. + * Get valid characters. */ - get maxCharacterLimit(): number { - return Orca._maxCharacterLimit; + get validCharacters(): string[] { + return Orca._validCharacters; } /** - * Get valid characters. + * Get maximum character limit. */ - get validCharacters(): string[] { - return Orca._validCharacters; + get maxCharacterLimit(): number { + return Orca._maxCharacterLimit; } /** @@ -218,13 +496,10 @@ export class Orca { const customWritePath = (model.customWritePath) ? model.customWritePath : 'orca_model'; const modelPath = await loadModel({ ...model, customWritePath }); - return Orca._init( - accessKey, - modelPath, - ); + return Orca._init(accessKey, modelPath); } - public static async _init( + public static _init( accessKey: string, modelPath: string, ): Promise { @@ -236,7 +511,7 @@ export class Orca { Orca._orcaMutex .runExclusive(async () => { const isSimd = await simd(); - const wasmOutput = await Orca.initWasm(accessKey.trim(), (isSimd) ? this._wasmSimd : this._wasm, modelPath); + const wasmOutput = await Orca.initWasm(accessKey.trim(), modelPath, (isSimd) ? this._wasmSimd : this._wasm); return new Orca(wasmOutput); }) .then((result: Orca) => { @@ -248,35 +523,50 @@ export class Orca { }); } + /** * Generates audio from text. The returned audio contains the speech representation of the text. - * - * @param text Generates audio from text. The returned audio contains the speech representation of the text. * The maximum number of characters per call to `.synthesize()` is `.maxCharacterLimit`. * Allowed characters are lower-case and upper-case letters and punctuation marks that can be retrieved with `.validCharacters`. * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. * The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". + * + * @param text A string of text. * @param synthesizeParams Optional configuration arguments. * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. + * + * @return A result object containing the generated audio as a sequence of 16-bit linearly-encoded integers + * and a sequence of OrcaAlignment objects representing the word alignments. 
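The returned alignments make word- and phoneme-level timing directly available, e.g. for captioning. A short sketch that logs the time span of each word, using the `OrcaAlignment` and `OrcaPhoneme` shapes defined in this change:

```typescript
const { pcm, alignments } = await orca.synthesize('Hello, world!');
console.log(`synthesized ${pcm.length} samples`);

for (const { word, startSec, endSec, phonemes } of alignments) {
  console.log(`${word}: ${startSec.toFixed(2)}s -> ${endSec.toFixed(2)}s`);
  for (const { phoneme, startSec: pStart, endSec: pEnd } of phonemes) {
    console.log(`  ${phoneme}: ${pStart.toFixed(2)}s -> ${pEnd.toFixed(2)}s`);
  }
}
```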
*/ - public async synthesize(text: string, synthesizeParams: SynthesizeParams = {}): Promise { + public async synthesize( + text: string, + synthesizeParams: OrcaSynthesizeParams = { + speechRate: 1.0, + randomState: null, + }, + ): Promise { if (typeof text !== 'string') { - throw new OrcaErrors.OrcaInvalidArgumentError('The argument \'text\' must be provided as a string'); + throw new OrcaErrors.OrcaInvalidArgumentError( + `The argument 'text' must be provided as a string`, + ); } - const { - speechRate = 1.0, - } = synthesizeParams; + if (text.trim().length > Orca._maxCharacterLimit) { + throw new OrcaErrors.OrcaInvalidArgumentError(` + 'text' length must be smaller than ${Orca._maxCharacterLimit} + `); + } - return new Promise((resolve, reject) => { - this._synthesizeMutex + return new Promise((resolve, reject) => { + this._functionMutex .runExclusive(async () => { if (this._wasmMemory === undefined) { - throw new OrcaErrors.OrcaInvalidStateError('Attempted to call Orca synthesize after release.'); + throw new OrcaErrors.OrcaInvalidStateError( + 'Attempted to call Orca synthesize after release.', + ); } - const memoryBufferView = new DataView(this._wasmMemory.buffer); - const memoryBufferText = new Uint8Array(this._wasmMemory.buffer); const encodedText = new TextEncoder().encode(text); const textAddress = await this._alignedAlloc( @@ -299,9 +589,11 @@ export class Orca { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } + const memoryBufferView = new DataView(this._wasmMemory.buffer); const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + const initStatus = await this._pvOrcaSynthesizeParamsInit(synthesizeParamsAddressAddress); - if (initStatus !== PV_STATUS_SUCCESS) { + if (initStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( this._pvGetErrorStack, this._pvFreeErrorStack, @@ -311,23 +603,47 @@ export class Orca { memoryBufferUint8, ); - throw pvStatusToException(initStatus, 'Synthesizing failed', messageStack); + throw pvStatusToException(initStatus, 'Synthesize failed', messageStack); } const synthesizeParamsAddress = memoryBufferView.getInt32(synthesizeParamsAddressAddress, true); await this._pvFree(synthesizeParamsAddressAddress); - const setSpeechRateStatus = await this._pvOrcaSynthesizeParamsSetSpeechRate(synthesizeParamsAddress, speechRate); - if (setSpeechRateStatus !== PV_STATUS_SUCCESS) { - const messageStack = await Orca.getMessageStack( - this._pvGetErrorStack, - this._pvFreeErrorStack, - this._messageStackAddressAddressAddress, - this._messageStackDepthAddress, - memoryBufferView, - memoryBufferUint8, + + if (synthesizeParams.speechRate !== null && synthesizeParams.speechRate !== undefined) { + const setSpeechRateStatus = await this._pvOrcaSynthesizeParamsSetSpeechRate( + synthesizeParamsAddress, + synthesizeParams.speechRate, ); + if (setSpeechRateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(setSpeechRateStatus, 'Synthesize failed', messageStack); + } + } - throw pvStatusToException(setSpeechRateStatus, 'Synthesizing failed', messageStack); + if (synthesizeParams.randomState !== null && synthesizeParams.randomState !== undefined) { + const setRandomStateStatus = await this._pvOrcaSynthesizeParamsSetRandomState( + synthesizeParamsAddress, + 
BigInt(synthesizeParams.randomState), + ); + if (setRandomStateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + throw pvStatusToException(setRandomStateStatus, 'Synthesize failed', messageStack); + } } const numSamplesAddress = await this._alignedAlloc( @@ -338,11 +654,27 @@ export class Orca { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } - const speechAddressAddress = await this._alignedAlloc( + const pcmAddressAddress = await this._alignedAlloc( Int32Array.BYTES_PER_ELEMENT, Int32Array.BYTES_PER_ELEMENT, ); - if (speechAddressAddress === 0) { + if (pcmAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const numAlignmentsAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (numAlignmentsAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const alignmentsAddressAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (alignmentsAddressAddressAddress === 0) { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } @@ -351,12 +683,14 @@ export class Orca { textAddress, synthesizeParamsAddress, numSamplesAddress, - speechAddressAddress, + pcmAddressAddress, + numAlignmentsAddress, + alignmentsAddressAddressAddress, ); await this._pvFree(textAddress); await this._pvOrcaSynthesizeParamsDelete(synthesizeParamsAddress); - if (synthesizeStatus !== PV_STATUS_SUCCESS) { + if (synthesizeStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( this._pvGetErrorStack, this._pvFreeErrorStack, @@ -366,14 +700,14 @@ export class Orca { memoryBufferUint8, ); - throw pvStatusToException(synthesizeStatus, 'Synthesizing failed', messageStack); + throw pvStatusToException(synthesizeStatus, 'Synthesize failed', messageStack); } - const speechAddress = memoryBufferView.getInt32( - speechAddressAddress, + const pcmAddress = memoryBufferView.getInt32( + pcmAddressAddress, true, ); - await this._pvFree(speechAddressAddress); + await this._pvFree(pcmAddressAddress); const numSamples = memoryBufferView.getInt32( numSamplesAddress, @@ -382,14 +716,196 @@ export class Orca { await this._pvFree(numSamplesAddress); const outputMemoryBuffer = new Int16Array(this._wasmMemory.buffer); - const speech = outputMemoryBuffer.slice( - speechAddress / Int16Array.BYTES_PER_ELEMENT, - (speechAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + const pcm = outputMemoryBuffer.slice( + pcmAddress / Int16Array.BYTES_PER_ELEMENT, + (pcmAddress / Int16Array.BYTES_PER_ELEMENT) + numSamples, + ); + await this._pvOrcaPcmDelete(pcmAddress); + + const numAlignments = memoryBufferView.getInt32(numAlignmentsAddress, true); + const alignmentsAddressAddress = memoryBufferView.getInt32(alignmentsAddressAddressAddress, true); + + let ptr = memoryBufferView.getInt32(alignmentsAddressAddress, true); + const alignments: OrcaAlignment[] = []; + for (let i = 1; i <= numAlignments; i++) { + const wordAddress = memoryBufferView.getInt32(ptr, true); + const word = arrayBufferToStringAtIndex( + memoryBufferUint8, + wordAddress, + ); + ptr += Uint32Array.BYTES_PER_ELEMENT; + const startSec = 
memoryBufferView.getFloat32(ptr, true); + ptr += Float32Array.BYTES_PER_ELEMENT; + const endSec = memoryBufferView.getFloat32(ptr, true); + ptr += Float32Array.BYTES_PER_ELEMENT; + const numPhonemes = memoryBufferView.getInt32(ptr, true); + ptr += Uint32Array.BYTES_PER_ELEMENT; + const phonemesAddress = memoryBufferView.getInt32(ptr, true); + ptr = memoryBufferView.getInt32(alignmentsAddressAddress + (i * Uint32Array.BYTES_PER_ELEMENT), true); + + let phonemesPtr = memoryBufferView.getInt32(phonemesAddress, true); + const phonemes: OrcaPhoneme[] = []; + for (let j = 1; j <= numPhonemes; j++) { + const phonemeAddress = memoryBufferView.getInt32(phonemesPtr, true); + const phoneme = arrayBufferToStringAtIndex( + memoryBufferUint8, + phonemeAddress, + ); + phonemesPtr += Uint32Array.BYTES_PER_ELEMENT; + const pStartSec = memoryBufferView.getFloat32(phonemesPtr, true); + phonemesPtr += Float32Array.BYTES_PER_ELEMENT; + const pEndSec = memoryBufferView.getFloat32(phonemesPtr, true); + phonemesPtr = memoryBufferView.getInt32(phonemesAddress + (j * Uint32Array.BYTES_PER_ELEMENT), true); + phonemes.push({ phoneme, startSec: pStartSec, endSec: pEndSec }); + } + alignments.push({ word, startSec, endSec, phonemes }); + } + await this._pvFree(numAlignmentsAddress); + await this._pvFree(alignmentsAddressAddressAddress); + await this._pvOrcaWordAlignmentsDelete(numAlignments, alignmentsAddressAddress); + + return { pcm, alignments }; + }) + .then((result: OrcaSynthesizeResult) => { + resolve(result); + }) + .catch(async (error: any) => { + reject(error); + }); + }); + } + + /** + * Opens a stream for streaming text synthesis. + * + * @param synthesizeParams Optional configuration arguments. + * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. + * + * @returns An instance of OrcaStream. 
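`randomState` pins the seed used during synthesis, which is useful when reproducible audio is needed across runs. A sketch using the parameters documented above; the expectation of sample-for-sample identical output is an assumption based on the seed's purpose:

```typescript
const params = { speechRate: 1.0, randomState: 42 };

const streamA = await orca.streamOpen(params);
const a = await streamA.synthesize('Same text, same seed.');
await streamA.close();

const streamB = await orca.streamOpen(params);
const b = await streamB.synthesize('Same text, same seed.');
await streamB.close();

// With a fixed randomState, `a` and `b` are expected to contain the same samples.
```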
+ */ + public async streamOpen( + synthesizeParams: OrcaSynthesizeParams = { + speechRate: 1.0, + randomState: null, + }, + ): Promise { + return new Promise((resolve, reject) => { + this._functionMutex + .runExclusive(async () => { + if (this._wasmMemory === undefined) { + throw new OrcaErrors.OrcaInvalidStateError('Attempted to call Orca stream open after release.'); + } + + const synthesizeParamsAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (synthesizeParamsAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const memoryBufferView = new DataView(this._wasmMemory.buffer); + const memoryBufferUint8 = new Uint8Array(this._wasmMemory.buffer); + + const initStatus = await this._pvOrcaSynthesizeParamsInit(synthesizeParamsAddressAddress); + if (initStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(initStatus, 'Stream open failed', messageStack); + } + + const synthesizeParamsAddress = memoryBufferView.getInt32(synthesizeParamsAddressAddress, true); + await this._pvFree(synthesizeParamsAddressAddress); + + if (synthesizeParams.speechRate !== null && synthesizeParams.speechRate !== undefined) { + const setSpeechRateStatus = await this._pvOrcaSynthesizeParamsSetSpeechRate(synthesizeParamsAddress, synthesizeParams.speechRate); + if (setSpeechRateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(setSpeechRateStatus, 'Stream open failed', messageStack); + } + } + + if (synthesizeParams.randomState !== null && synthesizeParams.randomState !== undefined) { + const setRandomStateStatus = await this._pvOrcaSynthesizeParamsSetRandomState(synthesizeParamsAddress, BigInt(synthesizeParams.randomState)); + if (setRandomStateStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(setRandomStateStatus, 'Stream open failed', messageStack); + } + } + + const streamAddressAddress = await this._alignedAlloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (streamAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + + const streamOpenStatus = await this._pvOrcaStreamOpen( + this._objectAddress, + synthesizeParamsAddress, + streamAddressAddress, + ); + await this._pvOrcaSynthesizeParamsDelete(synthesizeParamsAddress); + + if (streamOpenStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(streamOpenStatus, 'Stream open failed', messageStack); + } + const streamAddress = memoryBufferView.getInt32(streamAddressAddress, true); + await this._pvFree(streamAddressAddress); + + return 
new Stream( + this._wasmMemory, + this._alignedAlloc, + this._pvFree, + this._pvGetErrorStack, + this._pvFreeErrorStack, + this._messageStackAddressAddressAddress, + this._messageStackDepthAddress, + this._functionMutex, + this._streamPcmAddressAddress, + this._pvOrcaPcmDelete, + this._pvOrcaStreamSynthesize, + this._pvOrcaStreamFlush, + this._pvOrcaStreamClose, + streamAddress, + Orca.getMessageStack, ); - await this._pvOrcaDeletePcm(speechAddress); - return speech; }) - .then((result: Int16Array) => { + .then(result => { resolve(result); }) .catch(async (error: any) => { @@ -405,30 +921,16 @@ export class Orca { await this._pvOrcaDelete(this._objectAddress); await this._pvFree(this._messageStackAddressAddressAddress); await this._pvFree(this._messageStackDepthAddress); - await this._pvFree(this._inputBufferAddress); + await this._pvFree(this._streamPcmAddressAddress); delete this._wasmMemory; this._wasmMemory = undefined; } - async onmessage(e: MessageEvent): Promise { - switch (e.data.command) { - case 'synthesize': - await this.synthesize(e.data.text, e.data.speechRate); - break; - default: - // eslint-disable-next-line no-console - console.warn(`Unrecognized command: ${e.data.command}`); - } - } - - private static async initWasm(accessKey: string, wasmBase64: string, modelPath: string): Promise { + private static async initWasm(accessKey: string, modelPath: string, wasmBase64: string): Promise { // A WebAssembly page has a constant size of 64KiB. -> 1MiB ~= 16 pages const memory = new WebAssembly.Memory({ initial: 7500 }); - const memoryBufferUint8 = new Uint8Array(memory.buffer); - const pvError = new PvError(); - const exports = await buildWasm(memory, wasmBase64, pvError); const aligned_alloc = exports.aligned_alloc as aligned_alloc_type; @@ -442,10 +944,15 @@ export class Orca { const pv_orca_synthesize_params_init = exports.pv_orca_synthesize_params_init as pv_orca_synthesize_params_init_type; const pv_orca_synthesize_params_delete = exports.pv_orca_synthesize_params_delete as pv_orca_synthesize_params_delete_type; const pv_orca_synthesize_params_set_speech_rate = exports.pv_orca_synthesize_params_set_speech_rate as pv_orca_synthesize_params_set_speech_rate_type; + const pv_orca_synthesize_params_set_random_state = exports.pv_orca_synthesize_params_set_random_state as pv_orca_synthesize_params_set_random_state_type; const pv_orca_synthesize = exports.pv_orca_synthesize as pv_orca_synthesize_type; - const pv_orca_delete_pcm = exports.pv_orca_delete_pcm as pv_orca_delete_pcm_type; + const pv_orca_pcm_delete = exports.pv_orca_pcm_delete as pv_orca_pcm_delete_type; + const pv_orca_word_alignments_delete = exports.pv_orca_word_alignments_delete as pv_orca_word_alignments_delete_type; + const pv_orca_stream_open = exports.pv_orca_stream_open as pv_orca_stream_open_type; + const pv_orca_stream_synthesize = exports.pv_orca_stream_synthesize as pv_orca_stream_synthesize_type; + const pv_orca_stream_flush = exports.pv_orca_stream_flush as pv_orca_stream_flush_type; + const pv_orca_stream_close = exports.pv_orca_stream_close as pv_orca_stream_close_type; const pv_orca_version = exports.pv_orca_version as pv_orca_version_type; - const pv_status_to_string = exports.pv_status_to_string_type as pv_status_to_string_type; const pv_set_sdk = exports.pv_set_sdk as pv_set_sdk_type; const pv_get_error_stack = exports.pv_get_error_stack as pv_get_error_stack_type; const pv_free_error_stack = exports.pv_free_error_stack as pv_free_error_stack_type; @@ -518,7 +1025,7 @@ export class Orca { 
objectAddressAddress); await pv_free(accessKeyAddress); await pv_free(modelPathAddress); - if (initStatus !== PV_STATUS_SUCCESS) { + if (initStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( pv_get_error_stack, pv_free_error_stack, @@ -542,7 +1049,7 @@ export class Orca { throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); } const sampleRateStatus = await pv_orca_sample_rate(objectAddress, sampleRateAddress); - if (sampleRateStatus !== PV_STATUS_SUCCESS) { + if (sampleRateStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( pv_get_error_stack, pv_free_error_stack, @@ -558,6 +1065,30 @@ export class Orca { const sampleRate = memoryBufferView.getInt32(sampleRateAddress, true); await pv_free(sampleRateAddress); + const maxCharacterLimitAddress = await aligned_alloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (maxCharacterLimitAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + const maxCharacterLimitStatus = await pv_orca_max_character_limit(objectAddress, maxCharacterLimitAddress); + if (maxCharacterLimitStatus !== PvStatus.SUCCESS) { + const messageStack = await Orca.getMessageStack( + pv_get_error_stack, + pv_free_error_stack, + messageStackAddressAddressAddress, + messageStackDepthAddress, + memoryBufferView, + memoryBufferUint8, + ); + + throw pvStatusToException(maxCharacterLimitStatus, 'Get max character limit failed', messageStack, pvError); + } + + const maxCharacterLimit = memoryBufferView.getInt32(maxCharacterLimitAddress, true); + await pv_free(maxCharacterLimitAddress); + const numCharactersAddress = await aligned_alloc( Int32Array.BYTES_PER_ELEMENT, Int32Array.BYTES_PER_ELEMENT, @@ -579,7 +1110,7 @@ export class Orca { numCharactersAddress, validCharactersAddressAddressAddress, ); - if (validCharactersStatus !== PV_STATUS_SUCCESS) { + if (validCharactersStatus !== PvStatus.SUCCESS) { const messageStack = await Orca.getMessageStack( pv_get_error_stack, pv_free_error_stack, @@ -610,14 +1141,20 @@ export class Orca { await pv_free(validCharactersAddressAddressAddress); await pv_orca_valid_characters_delete(validCharactersAddressAddress); - const maxCharacterLimit = await pv_orca_max_character_limit(); - const versionAddress = await pv_orca_version(); const version = arrayBufferToStringAtIndex( memoryBufferUint8, versionAddress, ); + const streamPcmAddressAddress = await aligned_alloc( + Int32Array.BYTES_PER_ELEMENT, + Int32Array.BYTES_PER_ELEMENT, + ); + if (streamPcmAddressAddress === 0) { + throw new OrcaErrors.OrcaOutOfMemoryError('malloc failed: Cannot allocate memory'); + } + return { memory: memory, pvFree: pv_free, @@ -628,6 +1165,7 @@ export class Orca { sampleRate: sampleRate, maxCharacterLimit: maxCharacterLimit, validCharacters: validCharacters, + streamPcmAddressAddress: streamPcmAddressAddress, messageStackAddressAddressAddress: messageStackAddressAddressAddress, messageStackDepthAddress: messageStackDepthAddress, @@ -635,9 +1173,14 @@ export class Orca { pvOrcaSynthesizeParamsInit: pv_orca_synthesize_params_init, pvOrcaSynthesizeParamsDelete: pv_orca_synthesize_params_delete, pvOrcaSynthesizeParamsSetSpeechRate: pv_orca_synthesize_params_set_speech_rate, + pvOrcaSynthesizeParamsSetRandomState: pv_orca_synthesize_params_set_random_state, pvOrcaSynthesize: pv_orca_synthesize, - pvStatusToString: pv_status_to_string, - pvOrcaDeletePcm: pv_orca_delete_pcm, + pvOrcaPcmDelete: pv_orca_pcm_delete, + 
pvOrcaWordAlignmentsDelete: pv_orca_word_alignments_delete, + pvOrcaStreamOpen: pv_orca_stream_open, + pvOrcaStreamSynthesize: pv_orca_stream_synthesize, + pvOrcaStreamFlush: pv_orca_stream_flush, + pvOrcaStreamClose: pv_orca_stream_close, pvGetErrorStack: pv_get_error_stack, pvFreeErrorStack: pv_free_error_stack, }; @@ -667,7 +1210,7 @@ export class Orca { messageStack.push(message); } - pv_free_error_stack(messageStackAddressAddress); + await pv_free_error_stack(messageStackAddressAddress); return messageStack; } diff --git a/binding/web/src/orca_worker.ts b/binding/web/src/orca_worker.ts index 743d70e5..8f951a2d 100644 --- a/binding/web/src/orca_worker.ts +++ b/binding/web/src/orca_worker.ts @@ -13,16 +13,171 @@ import PvWorker from 'web-worker:./orca_worker_handler.ts'; import { OrcaModel, + OrcaSynthesizeParams, + OrcaSynthesizeResult, + OrcaStreamSynthesizeResult, OrcaWorkerInitResponse, OrcaWorkerSynthesizeResponse, OrcaWorkerReleaseResponse, + OrcaWorkerStreamOpenResponse, + OrcaWorkerStreamSynthesizeResponse, + OrcaWorkerStreamFlushResponse, + OrcaWorkerStreamCloseResponse, PvStatus, - SynthesizeParams, } from './types'; import { loadModel } from '@picovoice/web-utils'; import { pvStatusToException } from './orca_errors'; +class StreamWorker { + readonly _worker: Worker; + + constructor(orcaWorker: Worker) { + this._worker = orcaWorker; + } + + /** + * Adds a chunk of text to the Stream object in a worker and generates audio if enough text has been added. + * This function is expected to be called multiple times with consecutive chunks of text from a text stream. + * The incoming text is buffered as it arrives until there is enough context to convert a chunk of the + * buffered text into audio. The caller needs to use `OrcaStream.flush()` to generate the audio chunk + * for the remaining text that has not yet been synthesized. + * + * @param text A chunk of text from a text input stream, comprised of valid characters. + * Valid characters can be retrieved by calling `validCharacters`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * They need to be added in a single call to this function. + * The pronunciation is expressed in ARPAbet format, e.g.: `I {liv|L IH V} in {Sevilla|S EH V IY Y AH}`. + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. + */ + public synthesize(text: string): Promise { + const returnPromise: Promise = new Promise( + (resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(event.data.result); + break; + case 'failed': + case 'error': + // eslint-disable-next-line no-case-declarations + reject(pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + )); + break; + default: + reject(pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + )); + } + }; + }, + ); + + this._worker.postMessage( + { + command: 'streamSynthesize', + text: text, + }, + ); + + return returnPromise; + } + + /** + * Generates audio for all the buffered text that was added to the OrcaStream object + * via `OrcaStream.synthesize()`. + * + * @return The generated audio as a sequence of 16-bit linearly-encoded integers, `null` if no + * audio chunk has been produced. 
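The worker variant mirrors the main-thread stream API, so synthesis can run off the UI thread. A minimal sketch; `accessKey` and the `orca_params.pv` public path are placeholders for illustration:

```typescript
const orcaWorker = await OrcaWorker.create(
  accessKey,
  { publicPath: 'orca_params.pv', forceWrite: true },
);

const workerStream = await orcaWorker.streamOpen();

const pcm = await workerStream.synthesize('Synthesized off the main thread.');
if (pcm !== null) {
  // handle pcm without blocking the UI
}

const remaining = await workerStream.flush();
if (remaining !== null) {
  // handle the final audio chunk
}

await workerStream.close();
orcaWorker.terminate();
```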
+ */ + public flush(): Promise { + const returnPromise: Promise = new Promise( + (resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(event.data.result); + break; + case 'failed': + case 'error': + reject(pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + )); + break; + default: + reject(pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + )); + } + }; + }, + ); + + this._worker.postMessage({ + command: 'streamFlush', + }); + + return returnPromise; + } + + /** + * Releases the resources acquired by the OrcaStream object. + */ + public close(): Promise { + const returnPromise: Promise = new Promise((resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(); + break; + case 'failed': + case 'error': + reject( + pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + ), + ); + break; + default: + reject( + pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + ), + ); + } + }; + }); + + this._worker.postMessage({ + command: 'streamClose', + }); + + return returnPromise; + } +} + +export type OrcaStreamWorker = StreamWorker + export class OrcaWorker { private readonly _worker: Worker; private readonly _version: string; @@ -186,22 +341,25 @@ export class OrcaWorker { } /** - * Synthesizes speech in a worker. - * The speech result will be supplied with the callback provided when initializing the worker either - * by 'fromBase64' or 'fromPublicDirectory'. - * Can also send a message directly using 'this.worker.postMessage({command: "synthesize", text: "..."})'. + * Generates audio from text in a worker. The returned audio contains the speech representation of the text. + * The maximum number of characters per call to `.synthesize()` is `.maxCharacterLimit`. + * Allowed characters are lower-case and upper-case letters and punctuation marks that can be retrieved with `.validCharacters`. + * Custom pronunciations can be embedded in the text via the syntax `{word|pronunciation}`. + * The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". * - * @param text A string of text. + * @param text A string of text with properties described above. * @param synthesizeParams Optional configuration arguments. * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. * - * @return An Int16Array. + * @return A result object containing the generated audio as a sequence of 16-bit linearly-encoded integers + * and a sequence of OrcaAlignment objects representing the word alignments. */ - public async synthesize( + public synthesize( text: string, - synthesizeParams: SynthesizeParams = {}, - ): Promise { - const returnPromise: Promise = new Promise( + synthesizeParams: OrcaSynthesizeParams = {}, + ): Promise { + const returnPromise: Promise = new Promise( (resolve, reject) => { this._worker.onmessage = ( event: MessageEvent, @@ -283,6 +441,57 @@ export class OrcaWorker { return returnPromise; } + + /** + * Opens a new OrcaStream object in a worker. + * + * @param synthesizeParams Optional configuration arguments. 
+ * @param synthesizeParams.speechRate Configure the rate of speech of the synthesized speech. + * @param synthesizeParams.randomState Configure the random seed for the synthesized speech. + */ + public streamOpen(synthesizeParams: OrcaSynthesizeParams = {}): Promise { + const returnPromise: Promise = new Promise( + (resolve, reject) => { + this._worker.onmessage = ( + event: MessageEvent, + ): void => { + switch (event.data.command) { + case 'ok': + resolve(new StreamWorker(this._worker)); + break; + case 'failed': + case 'error': + reject( + pvStatusToException( + event.data.status, + event.data.shortMessage, + event.data.messageStack, + ), + ); + break; + default: + reject( + pvStatusToException( + PvStatus.RUNTIME_ERROR, + // @ts-ignore + `Unrecognized command: ${event.data.command}`, + ), + ); + } + }; + }, + ); + + this._worker.postMessage( + { + command: 'streamOpen', + synthesizeParams: synthesizeParams, + }, + ); + + return returnPromise; + } + /** * Terminates the active worker. Stops all requests being handled by worker. */ diff --git a/binding/web/src/orca_worker_handler.ts b/binding/web/src/orca_worker_handler.ts index 398d78eb..96c0c66d 100644 --- a/binding/web/src/orca_worker_handler.ts +++ b/binding/web/src/orca_worker_handler.ts @@ -17,6 +17,7 @@ import { OrcaWorkerRequest, PvStatus } from './types'; import { OrcaError } from './orca_errors'; let orca: Orca | null = null; +let orcaStream: any = null; /** * Orca worker handler. @@ -90,7 +91,7 @@ self.onmessage = async function( } else { self.postMessage({ command: 'error', - status: PvStatus.INVALID_STATE, + status: PvStatus.RUNTIME_ERROR, shortMessage: 'Orca synthesize error', }); } @@ -106,6 +107,121 @@ self.onmessage = async function( command: 'ok', }); break; + case 'streamOpen': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + try { + orcaStream = await orca.streamOpen(event.data.synthesizeParams); + self.postMessage({ + command: 'ok', + }); + } catch (e: any) { + if (e instanceof OrcaError) { + self.postMessage({ + command: 'error', + status: e.status, + shortMessage: e.shortMessage, + messageStack: e.messageStack, + }); + } else { + self.postMessage({ + command: 'error', + status: PvStatus.RUNTIME_ERROR, + shortMessage: 'Orca stream open error', + }); + } + } + break; + case 'streamSynthesize': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + if (orcaStream === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca stream not initialized', + }); + return; + } + try { + self.postMessage({ + command: 'ok', + result: await orcaStream.synthesize(event.data.text), + }); + } catch (e: any) { + if (e instanceof OrcaError) { + self.postMessage({ + command: 'error', + status: e.status, + shortMessage: e.shortMessage, + messageStack: e.messageStack, + }); + } else { + self.postMessage({ + command: 'error', + status: PvStatus.RUNTIME_ERROR, + shortMessage: 'Orca synthesize error', + }); + } + } + break; + case 'streamFlush': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + if (orcaStream === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca stream not initialized', + }); + return; + } + 
self.postMessage({ + command: 'ok', + result: await orcaStream.flush(), + }); + break; + case 'streamClose': + if (orca === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca not initialized', + }); + return; + } + if (orcaStream === null) { + self.postMessage({ + command: 'error', + status: PvStatus.INVALID_STATE, + shortMessage: 'Orca stream not initialized', + }); + return; + } + await orcaStream.close(); + orcaStream = null; + self.postMessage({ + command: 'ok', + }); + break; default: self.postMessage({ command: 'failed', diff --git a/binding/web/src/types.ts b/binding/web/src/types.ts index 7c8aed4a..ccee0060 100644 --- a/binding/web/src/types.ts +++ b/binding/web/src/types.ts @@ -27,15 +27,36 @@ export enum PvStatus { ACTIVATION_REFUSED, } -export type SynthesizeParams = { - speechRate?: number -} - /** * OrcaModel types */ export type OrcaModel = PvModel; +export type OrcaSynthesizeParams = { + speechRate?: number; + randomState?: number | null; +} + +export type OrcaPhoneme = { + phoneme: string; + startSec: number; + endSec: number; +} + +export type OrcaAlignment = { + word: string; + startSec: number; + endSec: number; + phonemes: OrcaPhoneme[]; +} + +export type OrcaSynthesizeResult = { + pcm: Int16Array; + alignments: OrcaAlignment[]; +} + +export type OrcaStreamSynthesizeResult = Int16Array | null + export type OrcaWorkerInitRequest = { command: 'init'; accessKey: string; @@ -48,17 +69,39 @@ export type OrcaWorkerInitRequest = { export type OrcaWorkerSynthesizeRequest = { command: 'synthesize'; text: string; - synthesizeParams?: SynthesizeParams; + synthesizeParams: OrcaSynthesizeParams; }; export type OrcaWorkerReleaseRequest = { command: 'release'; }; +export type OrcaWorkerStreamOpenRequest = { + command: 'streamOpen'; + synthesizeParams: OrcaSynthesizeParams; +} + +export type OrcaWorkerStreamSynthesizeRequest = { + command: 'streamSynthesize'; + text: string; +}; + +export type OrcaWorkerStreamFlushRequest = { + command: 'streamFlush'; +}; + +export type OrcaWorkerStreamCloseRequest = { + command: 'streamClose'; +}; + export type OrcaWorkerRequest = | OrcaWorkerInitRequest | OrcaWorkerSynthesizeRequest - | OrcaWorkerReleaseRequest; + | OrcaWorkerReleaseRequest + | OrcaWorkerStreamOpenRequest + | OrcaWorkerStreamSynthesizeRequest + | OrcaWorkerStreamFlushRequest + | OrcaWorkerStreamCloseRequest; export type OrcaWorkerFailureResponse = { command: 'failed' | 'error'; @@ -82,7 +125,7 @@ export type OrcaWorkerSynthesizeResponse = | OrcaWorkerFailureResponse | { command: 'ok'; - result: Int16Array; + result: OrcaSynthesizeResult; }; export type OrcaWorkerReleaseResponse = @@ -91,7 +134,37 @@ export type OrcaWorkerReleaseResponse = command: 'ok'; }; +export type OrcaWorkerStreamOpenResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; + result: any; +}; + +export type OrcaWorkerStreamSynthesizeResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; + result: OrcaStreamSynthesizeResult; +}; + +export type OrcaWorkerStreamFlushResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; + result: OrcaStreamSynthesizeResult; +}; + +export type OrcaWorkerStreamCloseResponse = + | OrcaWorkerFailureResponse + | { + command: 'ok'; +}; + export type OrcaWorkerResponse = | OrcaWorkerInitResponse | OrcaWorkerSynthesizeResponse - | OrcaWorkerReleaseResponse; + | OrcaWorkerReleaseResponse + | OrcaWorkerStreamOpenResponse + | OrcaWorkerStreamSynthesizeResponse + | OrcaWorkerStreamFlushResponse; diff --git 
a/binding/web/test/orca.test.ts b/binding/web/test/orca.test.ts index 6c3af96c..d531733e 100644 --- a/binding/web/test/orca.test.ts +++ b/binding/web/test/orca.test.ts @@ -1,54 +1,42 @@ -import { LeopardWorker } from '@picovoice/leopard-web'; import { Orca, OrcaWorker } from '../'; import { OrcaError } from '../dist/types/orca_errors'; +import { PvModel } from '@picovoice/web-utils'; // @ts-ignore import orcaParamsMale from './orca_params_male'; + // @ts-ignore import orcaParamsFemale from './orca_params_female'; -import { PvModel } from '@picovoice/web-utils'; -import testData from '../cypress/fixtures/.test/test_data.json'; +/* eslint camelcase: 0 */ + +import testData from '../cypress/fixtures/resources/.test/test_data.json'; const ACCESS_KEY = Cypress.env('ACCESS_KEY'); const EXPECTED_MAX_CHARACTER_LIMIT = 2000; const EXPECTED_SAMPLE_RATE = 22050; const EXPECTED_VALID_CHARACTERS = [ - '.', ':', ',', '"', '?', '!', 'a', 'b', - 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', - 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', - 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', - 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', - 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', - 'Y', 'Z', '\'', '{', '}', '|', ' ', '-', + '.', ':', ',', '"', '?', '!', 'a', + 'b', 'c', 'd', 'e', 'f', 'g', 'h', + 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', 'A', 'B', 'C', + 'D', 'E', 'F', 'G', 'H', 'I', 'J', + 'K', 'L', 'M', 'N', 'O', 'P', 'Q', + 'R', 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', '\'', '{', '}', '|', ' ', + '-', '1', '2', '3', '4', '5', '6', + '7', '8', '9', '0', '@', '%', '&', ]; -const levenshteinDistance = (words1: string[], words2: string[]) => { - const res = Array.from(Array(words1.length + 1), () => new Array(words2.length + 1)); - for (let i = 0; i <= words1.length; i++) { - res[i][0] = i; - } - for (let j = 0; j <= words2.length; j++) { - res[0][j] = j; - } - for (let i = 1; i <= words1.length; i++) { - for (let j = 1; j <= words2.length; j++) { - res[i][j] = Math.min( - res[i - 1][j] + 1, - res[i][j - 1] + 1, - res[i - 1][j - 1] + (words1[i - 1].toUpperCase() === words2[j - 1].toUpperCase() ? 0 : 1), - ); - } - } - return res[words1.length][words2.length]; -}; +const EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER = 'female'; -const wordErrorRate = (reference: string, hypothesis: string, useCER = false): number => { - const splitter = (useCER) ? 
'' : ' '; - const ed = levenshteinDistance(reference.split(splitter), hypothesis.split(splitter)); - return ed / reference.length; +const compareArrays = (arr1: Int16Array, arr2: Int16Array, step: number) => { + expect(arr1.length).eq(arr2.length); + for (let i = 0; i < arr1.length - step; i += step) { + expect(arr1[i]).closeTo(arr2[i], 1); + } }; const runInitTest = async ( @@ -65,19 +53,18 @@ const runInitTest = async ( expectFailure = false, } = params; - let orca = null; + let orca: Orca | OrcaWorker | null = null; let isFailed = false; try { orca = await instance.create(accessKey, model); - - expect(typeof orca.version).to.eq('string'); - expect(orca.version.length).to.be.greaterThan(0); - expect(orca.maxCharacterLimit).to.eq(EXPECTED_MAX_CHARACTER_LIMIT); - expect(orca.sampleRate).to.eq(EXPECTED_SAMPLE_RATE); - expect(orca.validCharacters.length).to.eq(EXPECTED_VALID_CHARACTERS.length); + expect(typeof orca.version).eq('string'); + expect(orca.version.length).gt(0); + expect(orca.maxCharacterLimit).eq(EXPECTED_MAX_CHARACTER_LIMIT); + expect(orca.sampleRate).eq(EXPECTED_SAMPLE_RATE); + expect(orca.validCharacters.length).eq(EXPECTED_VALID_CHARACTERS.length); orca.validCharacters.forEach((symbol: string, i: number) => { - expect(symbol).to.eq(EXPECTED_VALID_CHARACTERS[i]); + expect(symbol).eq(EXPECTED_VALID_CHARACTERS[i]); }); } catch (e) { if (expectFailure) { @@ -85,73 +72,12 @@ const runInitTest = async ( } else { expect(e).to.be.undefined; } - } finally { - if (orca !== null) { - if (orca instanceof OrcaWorker) { - orca.terminate(); - } else { - await orca.release(); - } - } } - if (expectFailure) { - expect(isFailed).to.be.true; - } else { - expect(isFailed).to.be.false; - } -}; - -const runProcTest = async ( - instance: typeof Orca | typeof OrcaWorker, - text: string, - speechRate: number, - params: { - accessKey?: string; - model?: PvModel; - isTestWER?: boolean; - expectFailure?: boolean; - } = {}, -) => { - const { - accessKey = ACCESS_KEY, - model = { publicPath: '/test/orca_params_male.pv', forceWrite: true }, - isTestWER = true, - expectFailure = false, - } = params; - - const checkWER = async (pcm: Int16Array) => { - const leopard = await LeopardWorker.create( - accessKey, - { publicPath: '/test/leopard_params.pv', forceWrite: true }, - ); - - const { transcript } = await leopard.process(pcm); - const wer = wordErrorRate(transcript, testData.test_sentences.text_no_punctuation); - expect(wer).lt(testData.wer_threshold); - leopard.terminate(); - }; - - let isFailed = false; - const orca = await instance.create(accessKey, model); - - try { - const speech = await orca.synthesize(text, { speechRate }); - if (isTestWER) { - await checkWER(speech); - } else if (!expectFailure) { - expect(speech.length).gt(0); - } - } catch (e) { - isFailed = true; - } finally { - if (orca !== null) { - if (orca instanceof OrcaWorker) { - orca.terminate(); - } else if (orca instanceof Orca) { - await orca.release(); - } - } + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); } if (expectFailure) { @@ -163,32 +89,26 @@ const runProcTest = async ( describe('Orca Binding', function() { for (const instance of [Orca, OrcaWorker]) { - const instanceString = instance === OrcaWorker ? 'worker' : 'main'; + const instanceString = instance === Orca ? 
'main' : 'worker'; - it(`should be able to handle invalid public path (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { publicPath: 'invalid', forceWrite: true }, - expectFailure: true, - }); + it(`should be able to handle invalid public path (${instanceString})`, async () => { + await runInitTest(instance, { + model: { publicPath: 'invalid', forceWrite: true }, + expectFailure: true, }); }); - it(`should be able to handle invalid base64 (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { base64: 'invalid', forceWrite: true }, - expectFailure: true, - }); + it(`should be able to handle invalid base64 (${instanceString})`, async () => { + await runInitTest(instance, { + model: { base64: 'invalid', forceWrite: true }, + expectFailure: true, }); }); - it(`should be able to handle invalid access key (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - accessKey: 'invalid', - expectFailure: true, - }); + it(`should be able to handle invalid access key (${instanceString})`, async () => { + await runInitTest(instance, { + accessKey: 'invalid', + expectFailure: true, }); }); @@ -196,105 +116,93 @@ describe('Orca Binding', function() { const publicPath = modelFileSuffix === 'male' ? `/test/orca_params_male.pv` : `/test/orca_params_female.pv`; const base64Path = modelFileSuffix === 'male' ? orcaParamsMale : orcaParamsFemale; - it(`should return process and flush error message stack`, async () => { - const orca = await Orca.create( - ACCESS_KEY, - { publicPath: publicPath, forceWrite: true }, - ); - - // @ts-ignore - const objectAddress = orca._objectAddress; - - // @ts-ignore - orca._objectAddress = 0; - - const errors: OrcaError[] = []; - try { - await orca.synthesize('test'); - } catch (e) { - errors.push(e); - } - - // @ts-ignore - orca._objectAddress = objectAddress; - await orca.release(); - - expect(errors.length).to.be.gte(0); - - for (let i = 0; i < errors.length; i++) { - expect((errors[i] as OrcaError).messageStack.length).to.be.gt(0); - expect((errors[i] as OrcaError).messageStack.length).to.be.lte(8); - } - }); - - it(`should return correct error message stack [${modelFileSuffix}] (${instanceString})`, async () => { - let messageStack = []; - try { - const orca = await instance.create('invalidAccessKey', { - publicPath, - forceWrite: true, - }); - expect(orca).to.be.undefined; - } catch (e: any) { - messageStack = e.messageStack; - } - - expect(messageStack.length).to.be.gt(0); - expect(messageStack.length).to.be.lte(8); - - try { - const orca = await instance.create('invalidAccessKey', { - publicPath, - forceWrite: true, - }); - expect(orca).to.be.undefined; - } catch (e: any) { - expect(messageStack.length).to.be.eq(e.messageStack.length); - } + it(`should be able to init with public path [${modelFileSuffix}] (${instanceString})`, async () => { + await runInitTest(instance, { + model: { publicPath, forceWrite: true }, + }); }); - it(`should be able to init with public path [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { - publicPath, - forceWrite: true, - }, - }); + it(`should be able to init with base64 [${modelFileSuffix}] (${instanceString})`, async () => { + await runInitTest(instance, { + model: { base64: base64Path, forceWrite: true }, }); }); - it(`should be able to init with base64 [${modelFileSuffix}] (${instanceString})`, () => { - 
cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { base64: base64Path, forceWrite: true }, - }); + it(`should be able to handle UTF-8 public path [${modelFileSuffix}] (${instanceString})`, async () => { + await runInitTest(instance, { + model: { publicPath, forceWrite: true, customWritePath: '테스트' }, }); }); - it(`should be able to handle UTF-8 public path [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { - await runInitTest(instance, { - model: { - publicPath, - forceWrite: true, - customWritePath: '테스트', + it(`should be able to process text streaming [${modelFileSuffix}] (${instanceString})`, () => { + try { + cy.getFramesFromFile(`${testData.audio_data_folder}orca_params_${modelFileSuffix}_stream.wav`).then( + async (rawPcm: Int16Array) => { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + try { + const orcaStream = await orca.streamOpen({ randomState: testData.random_state }); + + const streamPcm: number[] = []; + for (const c of testData.test_sentences.text.split('')) { + const pcm = await orcaStream.synthesize(c); + if (pcm !== null) { + streamPcm.push(...pcm); + } + } + + const endPcm = await orcaStream.flush(); + if (endPcm !== null) { + streamPcm.push(...endPcm); + } + + compareArrays(new Int16Array(streamPcm), rawPcm, 500); + await orcaStream.close(); + } catch (e) { + expect(e).to.be.undefined; + } + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } }, - }); - }); + ); + } catch (e) { + expect(e).to.be.undefined; + } }); - it(`should be able to handle different speech rates [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { + if (modelFileSuffix === EXACT_ALIGNMENT_TEST_MODEL_IDENTIFIER) { + it(`should be able to process alignment exact [${modelFileSuffix}] (${instanceString})`, async () => { try { const orca = await instance.create( ACCESS_KEY, { publicPath, forceWrite: true }, ); - const speechSlow = await orca.synthesize(testData.test_sentences.text, { speechRate: 0.7 }); - const speechFast = await orca.synthesize(testData.test_sentences.text, { speechRate: 1.3 }); - expect(speechSlow.length).gt(speechFast.length); + const { + pcm, + alignments, + } = await orca.synthesize(testData.test_sentences.text_alignment, { randomState: testData.random_state }); + expect(pcm.length).gt(0); + expect(alignments.length).eq(testData.alignments.length); + + alignments.forEach((w, i) => { + const { word, start_sec, end_sec, phonemes } = testData.alignments[i]; + expect(w.word).eq(word); + expect(w.startSec).closeTo(start_sec, 0.01); + expect(w.endSec).closeTo(end_sec, 0.01); + w.phonemes.forEach((p, j) => { + expect(p.phoneme).eq(phonemes[j].phoneme); + expect(p.startSec).closeTo(phonemes[j].start_sec, 0.01); + expect(p.endSec).closeTo(phonemes[j].end_sec, 0.01); + }); + }); if (orca instanceof OrcaWorker) { orca.terminate(); @@ -305,19 +213,33 @@ describe('Orca Binding', function() { expect(e).to.be.undefined; } }); - }); - - it(`should be able to handle max num characters [${modelFileSuffix}] (${instanceString})`, () => { - cy.wrap(null).then(async () => { + } else { + it(`should be able to process alignment [${modelFileSuffix}] (${instanceString})`, async () => { try { const orca = await instance.create( ACCESS_KEY, { publicPath, forceWrite: true }, ); - const maxNumChars = orca.maxCharacterLimit; - const speech = await orca.synthesize('a'.repeat(maxNumChars)); - 
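For orientation while reading the test refactor: outside of Cypress, the `OrcaStream` flow exercised by the streaming test above boils down to the sketch below. The import path and model path are placeholders, and only the API surface introduced in this diff (`streamOpen`, `synthesize`, `flush`, `close`) is assumed.

```typescript
import { Orca } from '@picovoice/orca-web'; // import path assumed

async function synthesizeStreaming(accessKey: string, text: string): Promise<Int16Array> {
  const orca = await Orca.create(accessKey, { publicPath: '/orca_params_female.pv' });
  const stream = await orca.streamOpen({ randomState: 42 }); // fixed seed => reproducible PCM

  const samples: number[] = [];
  for (const c of text) {
    // `synthesize` returns PCM once enough text has accumulated, `null` otherwise.
    const chunk = await stream.synthesize(c);
    if (chunk !== null) {
      samples.push(...chunk);
    }
  }

  // `flush` renders whatever text is still buffered in the stream.
  const tail = await stream.flush();
  if (tail !== null) {
    samples.push(...tail);
  }

  await stream.close();
  await orca.release();
  return new Int16Array(samples);
}
```

With `OrcaWorker` the flow is identical, except that `terminate()` takes the place of `release()`, as in the tests in this file.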
expect(speech.length).gt(0); + const { + pcm, + alignments, + } = await orca.synthesize(testData.test_sentences.text_alignment, { randomState: testData.random_state }); + expect(pcm.length).gt(0); + expect(alignments.length).eq(testData.alignments.length); + + let prevWordEndSec = 0; + let prevPhonemeEndSec = 0; + alignments.forEach(w => { + expect(w.startSec).closeTo(prevWordEndSec, 0.001); + expect(w.endSec).gt(w.startSec); + prevWordEndSec = w.endSec; + w.phonemes.forEach(p => { + expect(p.startSec).closeTo(prevPhonemeEndSec, 0.001); + expect(p.endSec).gt(p.startSec); + prevPhonemeEndSec = p.endSec; + }); + }); if (orca instanceof OrcaWorker) { orca.terminate(); @@ -328,19 +250,28 @@ describe('Orca Binding', function() { expect(e).to.be.undefined; } }); - }); + } - it(`should be able to process - punctuation [${modelFileSuffix}] (${instanceString})`, async () => { + it(`should be able to process text [${modelFileSuffix}] (${instanceString})`, () => { try { - await runProcTest( - instance, - testData.test_sentences.text, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, + cy.getFramesFromFile(`${testData.audio_data_folder}orca_params_${modelFileSuffix}_single.wav`).then( + async rawPcm => { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + const { pcm } = await orca.synthesize( + testData.test_sentences.text, + { speechRate: 1, randomState: testData.random_state }, + ); + compareArrays(pcm, rawPcm, 500); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } }, ); } catch (e) { @@ -350,62 +281,163 @@ describe('Orca Binding', function() { it(`should be able to process - no punctuation [${modelFileSuffix}] (${instanceString})`, async () => { try { - await runProcTest( - instance, - testData.test_sentences.text_no_punctuation, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, - }, + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, ); + + const { pcm } = await orca.synthesize(testData.test_sentences.text_no_punctuation); + expect(pcm.length).gt(0); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } } catch (e) { expect(e).to.be.undefined; } }); - it(`should be able to process - custom punctuation [${modelFileSuffix}] (${instanceString})`, async () => { + it(`should be able to process custom punctuation [${modelFileSuffix}] (${instanceString})`, async () => { try { - await runProcTest( - instance, - testData.test_sentences.text_custom_pronunciation, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, - isTestWER: false, - }, + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, ); + + const { pcm } = await orca.synthesize(testData.test_sentences.text_custom_pronunciation); + expect(pcm.length).gt(0); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } } catch (e) { expect(e).to.be.undefined; } }); - for (const failureCase of testData.test_sentences.text_invalid) { - it(`should handle invalid text (${failureCase}) [${modelFileSuffix}] (${instanceString})`, async () => { + it(`should be able to handle different speech rates [${modelFileSuffix}] (${instanceString})`, async () => { + try { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + const { pcm: pcmSlow } = await 
orca.synthesize(testData.test_sentences.text, { speechRate: 0.7 }); + const { pcm: pcmFast } = await orca.synthesize(testData.test_sentences.text, { speechRate: 1.3 }); + expect(pcmSlow.length).gt(pcmFast.length); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } + } catch (e) { + expect(e).to.be.undefined; + } + }); + + it(`should be able to handle max num characters [${modelFileSuffix}] (${instanceString})`, async () => { + try { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + const maxNumChars = orca.maxCharacterLimit; + const { pcm } = await orca.synthesize('a'.repeat(maxNumChars)); + expect(pcm.length).gt(0); + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } + } catch (e) { + expect(e).to.be.undefined; + } + }); + + it(`should handle invalid input [${modelFileSuffix}] (${instanceString})`, async () => { + const orca = await instance.create( + ACCESS_KEY, + { publicPath, forceWrite: true }, + ); + + for (const failureCase of testData.test_sentences.text_invalid) { try { - await runProcTest( - instance, - failureCase, - 1.0, - { - model: { - publicPath, - forceWrite: true, - }, - isTestWER: false, - expectFailure: true, - }, - ); + await orca.synthesize(failureCase); } catch (e) { - expect(e).to.be.undefined; + expect(e).not.to.be.undefined; } - }); - } + } + + if (orca instanceof OrcaWorker) { + orca.terminate(); + } else if (orca instanceof Orca) { + await orca.release(); + } + }); + + it(`should return process and flush error message stack [${modelFileSuffix}] (${instanceString})`, async () => { + const orca = await Orca.create( + ACCESS_KEY, + { publicPath: publicPath, forceWrite: true }, + ); + + // @ts-ignore + const objectAddress = orca._objectAddress; + + // @ts-ignore + orca._objectAddress = 0; + + const errors: OrcaError[] = []; + try { + await orca.synthesize('test'); + } catch (e: any) { + errors.push(e); + } + + // @ts-ignore + orca._objectAddress = objectAddress; + await orca.release(); + + expect(errors.length).to.be.gte(0); + + for (let i = 0; i < errors.length; i++) { + expect((errors[i] as OrcaError).messageStack.length).to.be.gt(0); + expect((errors[i] as OrcaError).messageStack.length).to.be.lte(8); + } + }); + + it(`should return correct error message stack [${modelFileSuffix}] (${instanceString})`, async () => { + let messageStack = []; + try { + const orca = await instance.create('invalidAccessKey', { + publicPath, + forceWrite: true, + }); + expect(orca).to.be.undefined; + } catch (e: any) { + messageStack = e.messageStack; + } + + expect(messageStack.length).to.be.gt(0); + expect(messageStack.length).to.be.lte(8); + + try { + const orca = await instance.create('invalidAccessKey', { + publicPath, + forceWrite: true, + }); + expect(orca).to.be.undefined; + } catch (e: any) { + expect(messageStack.length).to.be.eq(e.messageStack.length); + } + }); } } }); diff --git a/binding/web/test/orca_perf.test.ts b/binding/web/test/orca_perf.test.ts index 172f26d3..67419b7d 100644 --- a/binding/web/test/orca_perf.test.ts +++ b/binding/web/test/orca_perf.test.ts @@ -1,5 +1,5 @@ import { Orca, OrcaWorker } from '../'; -import testData from '../cypress/fixtures/.test/test_data.json'; +import testData from '../cypress/fixtures/resources/.test/test_data.json'; const ACCESS_KEY = Cypress.env('ACCESS_KEY'); const NUM_TEST_ITERATIONS = Number(Cypress.env('NUM_TEST_ITERATIONS')); diff --git 
a/demo/android/OrcaDemo/README.md b/demo/android/OrcaDemo/README.md index 73144c72..6b45dd58 100644 --- a/demo/android/OrcaDemo/README.md +++ b/demo/android/OrcaDemo/README.md @@ -2,13 +2,15 @@ ## AccessKey -Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca SDKs. +Orca requires a valid Picovoice `AccessKey` at initialization. `AccessKey` acts as your credentials when using Orca +SDKs. You can get your `AccessKey` for free. Make sure to keep your `AccessKey` secret. Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get your `AccessKey`. ## Setup -Replace `"${YOUR_ACCESS_KEY_HERE}"` inside [MainActivity.java](orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java) +Replace `"${YOUR_ACCESS_KEY_HERE}"` +inside [MainActivity.java](orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java) with your AccessKey obtained from [Picovoice Console](https://console.picovoice.ai/). 1. Open the project in Android Studio @@ -16,7 +18,7 @@ with your AccessKey obtained from [Picovoice Console](https://console.picovoice. ## Usage -1. Type a phrase that you'd like to synthesize into the textbox at the top. -2. Press the `Synthesize` button to hear the synthesized speech. -3. Press `Stop` if you wish to stop the playback before it completes on its own. +1. Choose between Streaming Synthesis and Single Synthesis using the switch at the top. +2. Type a phrase that you'd like to synthesize into the textbox. +3. Press the `Synthesize` button to hear the synthesized speech. diff --git a/demo/android/OrcaDemo/orca-demo-app/build.gradle b/demo/android/OrcaDemo/orca-demo-app/build.gradle index 2cdae428..8f8353a7 100644 --- a/demo/android/OrcaDemo/orca-demo-app/build.gradle +++ b/demo/android/OrcaDemo/orca-demo-app/build.gradle @@ -7,8 +7,8 @@ android { applicationId "ai.picovoice.orcademo" minSdkVersion 21 targetSdkVersion defaultTargetSdkVersion - versionCode 1 - versionName "1.0" + versionCode 2 + versionName "2.0" testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" } @@ -30,11 +30,10 @@ android { } dependencies { - implementation 'androidx.appcompat:appcompat:1.6.1' implementation 'com.google.android.material:material:1.8.0' implementation 'androidx.constraintlayout:constraintlayout:2.1.4' - implementation 'ai.picovoice:orca-android:0.1.0' + implementation 'ai.picovoice:orca-android:0.2.0' } tasks.register('copyParams', Copy) { @@ -43,4 +42,4 @@ tasks.register('copyParams', Copy) { into("${rootDir}/orca-demo-app/src/main/assets") } -preBuild.dependsOn(copyParams) \ No newline at end of file +preBuild.dependsOn(copyParams) diff --git a/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java b/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java index cb5e4b7f..3f9653c5 100644 --- a/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java +++ b/demo/android/OrcaDemo/orca-demo-app/src/main/java/ai/picovoice/orcademo/MainActivity.java @@ -20,19 +20,28 @@ import android.os.Looper; import android.text.Editable; import android.text.TextWatcher; +import android.text.method.ScrollingMovementMethod; import android.view.View; -import android.widget.Button; import android.widget.EditText; import android.widget.ProgressBar; import android.widget.TextView; import android.widget.ToggleButton; +import android.media.AudioFormat; +import android.media.AudioManager; +import android.media.AudioTrack; + 
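+// The android.media imports above back the AudioTrack that plays streamed PCM as it is
+// produced; the concurrency utilities imported below coordinate the demo's text-streaming,
+// synthesis, and playback threads.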
import androidx.appcompat.app.AppCompatActivity; +import androidx.appcompat.widget.SwitchCompat; +import java.util.ArrayList; import java.util.HashSet; import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -50,9 +59,12 @@ public class MainActivity extends AppCompatActivity { private static final String ACCESS_KEY = "${YOUR_ACCESS_KEY_HERE}"; private static final String MODEL_FILE = "orca_params_female.pv"; + private static final int STREAMING_NUM_AUDIO_WAIT_CHUNKS = 1; private final Handler mainHandler = new Handler(Looper.getMainLooper()); private final ExecutorService executor = Executors.newSingleThreadExecutor(); + private final ExecutorService executorStreamingSynthesis = Executors.newSingleThreadExecutor(); + private final ExecutorService executorStreamingAudio = Executors.newSingleThreadExecutor(); private String synthesizedFilePath; private MediaPlayer synthesizedPlayer; @@ -63,10 +75,15 @@ public class MainActivity extends AppCompatActivity { private Orca orca; + private Orca.OrcaStream orcaStream = null; + TextView errorText; TextView infoTextView; + TextView streamTextView; + TextView streamSecsTextView; TextView numCharsTextView; EditText synthesizeEditText; + SwitchCompat streamSwitch; ToggleButton synthesizeButton; ProgressBar synthesizeProgress; @@ -77,10 +94,14 @@ protected void onCreate(Bundle savedInstanceState) { setContentView(R.layout.orca_demo); errorText = findViewById(R.id.errorTextView); infoTextView = findViewById(R.id.infoTextView); + streamTextView = findViewById(R.id.streamTextView); + streamSecsTextView = findViewById(R.id.streamSecsTextView); numCharsTextView = findViewById(R.id.numCharsTextView); synthesizeEditText = findViewById(R.id.synthesizeEditText); + streamSwitch = findViewById(R.id.streamSwitch); synthesizeButton = findViewById(R.id.synthesizeButton); synthesizeProgress = findViewById(R.id.synthesizeProgress); + streamTextView.setMovementMethod(new ScrollingMovementMethod()); try { orca = new Orca.Builder() @@ -113,9 +134,9 @@ public void afterTextChanged(Editable s) { public void onTextChanged(CharSequence s, int start, int before, int count) { runOnUiThread(() -> numCharsTextView.setText(String.format( - "%d/%d", - s.toString().length(), - orca.getMaxCharacterLimit())) + "%d/%d", + s.toString().length(), + orca.getMaxCharacterLimit())) ); validateText(s.toString()); } @@ -139,37 +160,65 @@ private void setUIState(UIState state) { case EDIT: infoTextView.setVisibility(View.INVISIBLE); synthesizeButton.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(true); synthesizeButton.setEnabled(true); + synthesizeButton.setChecked(false); synthesizeEditText.setEnabled(true); + synthesizeEditText.setVisibility(View.VISIBLE); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); break; case PLAYBACK: infoTextView.setVisibility(View.VISIBLE); synthesizeButton.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(true); synthesizeEditText.setEnabled(false); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); + break; + case STREAMING_PLAYBACK: + infoTextView.setText("Streaming..."); + 
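+                // Streaming mode: lock the input controls and show the live stream
+                // text and the seconds-of-audio counter in place of the editor.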
infoTextView.setVisibility(View.VISIBLE); + synthesizeButton.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(false); + synthesizeButton.setEnabled(false); + synthesizeEditText.setEnabled(false); + synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.VISIBLE); + streamSecsTextView.setVisibility(View.VISIBLE); + synthesizeEditText.setVisibility(View.INVISIBLE); break; case BUSY: infoTextView.setVisibility(View.VISIBLE); synthesizeButton.setVisibility(View.INVISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(false); synthesizeEditText.setEnabled(false); synthesizeProgress.setVisibility(View.VISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); break; case ERROR: infoTextView.setVisibility(View.VISIBLE); errorText.setVisibility(View.INVISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(false); synthesizeEditText.setEnabled(true); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); break; case FATAL_ERROR: infoTextView.setVisibility(View.INVISIBLE); errorText.setVisibility(View.VISIBLE); + streamSwitch.setEnabled(false); synthesizeButton.setEnabled(false); synthesizeEditText.setEnabled(false); synthesizeProgress.setVisibility(View.INVISIBLE); + streamTextView.setVisibility(View.INVISIBLE); + streamSecsTextView.setVisibility(View.INVISIBLE); break; default: break; @@ -225,13 +274,23 @@ private void validateText(String text) { } sb.append(c); } - runOnUiThread(() -> { - setUIState(UIState.ERROR); - infoTextView.setText(String.format( - "Invalid characters in text: [%s]", - sb - )); - }); + if (orcaStream == null) { + runOnUiThread(() -> { + setUIState(UIState.ERROR); + infoTextView.setText(String.format( + "Invalid characters in text: [%s]", + sb + )); + }); + } else { + runOnUiThread(() -> { + infoTextView.setVisibility(View.VISIBLE); + infoTextView.setText(String.format( + "Invalid characters in text will be ignored: [%s]", + sb + )); + }); + } } } } else { @@ -322,6 +381,34 @@ private void stopPlayback() { runOnUiThread(() -> setUIState(UIState.EDIT)); } + public void onStreamSwitchClick(View view) { + if (orca == null) { + displayError("Orca is not initialized"); + streamSwitch.setChecked(false); + return; + } + + try { + if (orcaStream == null) { + orcaStream = orca.streamOpen(new OrcaSynthesizeParams.Builder().build()); + runOnUiThread(() -> { + synthesizeEditText.setText(""); + streamSecsTextView.setText(""); + streamSecsTextView.setVisibility(View.VISIBLE); + }); + } else { + orcaStream.close(); + orcaStream = null; + runOnUiThread(() -> { + synthesizeEditText.setText(""); + streamSecsTextView.setVisibility(View.INVISIBLE); + }); + } + } catch (OrcaException e) { + onOrcaException(e); + } + } + public void onSynthesizeClick(View view) { if (orca == null) { displayError("Orca is not initialized"); @@ -329,21 +416,150 @@ public void onSynthesizeClick(View view) { return; } - if (synthesizeButton.isChecked()) { - String text = synthesizeEditText.getText().toString(); - if (!previousText.equals(text)) { - runSynthesis(text); + String text = synthesizeEditText.getText().toString(); + if (orcaStream == null) { + if (synthesizeButton.isChecked()) { + if (!previousText.equals(text)) { + runSynthesis(text); + } else { + startPlayback(); + } } else { - startPlayback(); + stopPlayback(); } } else { - stopPlayback(); + runStreamSynthesis(text); } } + private void 
runStreamSynthesis(final String text) { + setUIState(UIState.STREAMING_PLAYBACK); + + AtomicBoolean isStreamingText = new AtomicBoolean(false); + ArrayList textStream = new ArrayList<>(); + + AtomicBoolean isQueueingStreamingPcm = new AtomicBoolean(false); + ConcurrentLinkedQueue pcmQueue = new ConcurrentLinkedQueue<>(); + CountDownLatch streamingSynthesisLatch = new CountDownLatch(1); + CountDownLatch streamingAudioLatch = new CountDownLatch(1); + + executor.submit(() -> { + isStreamingText.set(true); + streamingSynthesisLatch.countDown(); + + String[] words = text.split(" "); + for (String word : words) { + word += " "; + String finalWord = word; + mainHandler.post(() -> { + textStream.add(finalWord); + streamTextView.append(finalWord); + }); + try { + Thread.sleep(100); + } catch (InterruptedException ignored) { } + } + + isStreamingText.set(false); + }); + + executorStreamingSynthesis.submit(() -> { + try { + mainHandler.post(() -> { + streamTextView.setText(""); + streamSecsTextView.setText("Seconds of audio synthesized: 0.000s"); + synthesizeButton.setEnabled(false); + }); + + int numIterations = 0; + boolean isPcmPlayStarted = false; + float secs = 0; + isQueueingStreamingPcm.set(true); + + streamingSynthesisLatch.await(); + while (isStreamingText.get() || !textStream.isEmpty()) { + if (!textStream.isEmpty()) { + String word = textStream.remove(0); + try { + short[] pcm = orcaStream.synthesize(word); + if (pcm != null && pcm.length > 0) { + pcmQueue.add(pcm); + secs += (float) pcm.length / orca.getSampleRate(); + float finalSecs = secs; + mainHandler.post(() -> streamSecsTextView.setText(String.format("Seconds of audio synthesized: %.3fs", finalSecs))); + if (numIterations == STREAMING_NUM_AUDIO_WAIT_CHUNKS) { + streamingAudioLatch.countDown(); + isPcmPlayStarted = true; + } + numIterations++; + } + } catch (OrcaException e) { + mainHandler.post(() -> onOrcaException(e)); + } + } + } + + try { + short[] flushedPcm = orcaStream.flush(); + if (flushedPcm != null && flushedPcm.length > 0) { + pcmQueue.add(flushedPcm); + secs += (float) flushedPcm.length / orca.getSampleRate(); + float finalSecs = secs; + mainHandler.post(() -> streamSecsTextView.setText(String.format("Seconds of audio synthesized: %.3fs", finalSecs))); + } + + if (!isPcmPlayStarted) { + streamingAudioLatch.countDown(); + } + } catch (OrcaException e) { + mainHandler.post(() -> onOrcaException(e)); + } + + isQueueingStreamingPcm.set(false); + } catch (Exception e) { + mainHandler.post(() -> displayError(e.toString())); + } + }); + + executorStreamingAudio.submit(() -> { + try { + AudioTrack audioTrack = new AudioTrack( + AudioManager.STREAM_MUSIC, + orca.getSampleRate(), + AudioFormat.CHANNEL_OUT_MONO, + AudioFormat.ENCODING_PCM_16BIT, + AudioTrack.getMinBufferSize( + orca.getSampleRate(), + AudioFormat.CHANNEL_OUT_MONO, + AudioFormat.ENCODING_PCM_16BIT), + AudioTrack.MODE_STREAM); + + audioTrack.play(); + + streamingAudioLatch.await(); + while(isQueueingStreamingPcm.get() || !pcmQueue.isEmpty()) { + if (!pcmQueue.isEmpty()) { + short[] pcm = pcmQueue.poll(); + if (pcm != null && pcm.length > 0) { + audioTrack.write(pcm, 0, pcm.length); + } + } + } + + audioTrack.stop(); + audioTrack.release(); + + mainHandler.post(() -> setUIState(UIState.EDIT)); + } catch (Exception e) { + mainHandler.post(() -> displayError(e.toString())); + } + }); + } + private enum UIState { EDIT, PLAYBACK, + STREAMING_PLAYBACK, BUSY, ERROR, FATAL_ERROR diff --git a/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml 
b/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml
index 912b82b9..f541b179 100644
--- a/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml
+++ b/demo/android/OrcaDemo/orca-demo-app/src/main/res/layout/orca_demo.xml
@@ -1,5 +1,6 @@
[layout hunk not recoverable: the XML element markup was stripped during extraction; based on MainActivity.java above, the updated layout adds the streamSwitch, streamTextView, and streamSecsTextView views alongside the existing controls]
diff --git a/demo/c/CMakeLists.txt b/demo/c/CMakeLists.txt
index 12e37908..3efdb691 100644
--- a/demo/c/CMakeLists.txt
+++ b/demo/c/CMakeLists.txt
@@ -1,13 +1,18 @@
 cmake_minimum_required(VERSION 3.13)
 
-project(orca_demo)
+project(orca_demo_c)
 
 set(CMAKE_C_STANDARD 99)
 set(CMAKE_BUILD_TYPE Release)
 
+set(COMMON_LIBS dl)
+
 include_directories("${PROJECT_SOURCE_DIR}/../../include")
 
 add_executable(orca_demo orca_demo.c)
 
+add_executable(orca_demo_streaming orca_demo_streaming.c)
+target_include_directories(orca_demo_streaming PRIVATE dr_libs)
+
 if (NOT WIN32)
-    target_link_libraries(orca_demo dl)
+    target_link_libraries(orca_demo ${COMMON_LIBS})
+    target_link_libraries(orca_demo_streaming ${COMMON_LIBS})
 endif()
diff --git a/demo/c/README.md b/demo/c/README.md
index 93e42357..715acb9a 100644
--- a/demo/c/README.md
+++ b/demo/c/README.md
@@ -16,33 +16,63 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you
 
 - The demo requires [CMake](https://cmake.org/) version 3.4 or higher.
 - **For Windows Only**: [MinGW](https://www.mingw-w64.org/) is required to build the demo.
 
-# Speech Synthesis Demo
+# Speech Synthesis Demos
+
+Orca supports two modes of operation: streaming and single synthesis.
+In the streaming synthesis mode, Orca processes an incoming text stream in real time and generates audio in parallel.
+This is demonstrated in the Orca streaming demo.
+In the single synthesis mode, the text is synthesized in a single call to the Orca engine.
 
 **Note**: the following commands are run from the root of the repo.
 
-## Build
+## Streaming Synthesis Demo
+
+### Build
 
 Use CMake to build the Orca demo target:
 
 ```console
-cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo
+cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo_streaming
 ```
 
-## Usage
+### Usage
 
 Running the executable without any command-line arguments prints the usage info to the console:
 
 ```console
-Usage: orca_demo [-l LIBRARY_PATH -m MODEL_PATH -a ACCESS_KEY -t TEXT -o OUTPUT_PATH]
+Usage: orca_demo_streaming [-l LIBRARY_PATH -m MODEL_PATH -a ACCESS_KEY -t TEXT -o OUTPUT_PATH]
+```
+
+To run the Orca streaming demo:
+
+```console
+./demo/c/build/orca_demo_streaming -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH}
+```
+
+Replace `${LIBRARY_PATH}` with the path to the appropriate library available under [lib](../../lib), `${MODEL_PATH}` with
+a path to any of the model files available under [lib/common](../../lib/common), `${ACCESS_KEY}` with the AccessKey
+obtained from [Picovoice Console](https://console.picovoice.ai/), `${TEXT}` with the text to be synthesized,
+and `${OUTPUT_PATH}` with a path to an output audio file.
+The audio will be stored as a single-channel 16-bit PCM `.wav` file.
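For example, on Linux x86_64 a concrete invocation might look as follows (the library and model paths below are illustrative; substitute the ones matching your platform and checkout):

```console
./demo/c/build/orca_demo_streaming \
  -l lib/linux/x86_64/libpv_orca.so \
  -m lib/common/orca_params_female.pv \
  -a ${ACCESS_KEY} \
  -t "Streaming synthesis begins producing audio before the full text has arrived." \
  -o orca_stream.wav
```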
+ +## Single Synthesis Demo + +### Build + +Use CMake to build the Orca demo target: + +```console +cmake -S demo/c/ -B demo/c/build && cmake --build demo/c/build --target orca_demo ``` +### Usage + To run the Orca demo: ```console ./demo/c/build/orca_demo -l ${LIBRARY_PATH} -m ${MODEL_PATH} -a ${ACCESS_KEY} -t ${TEXT} -o ${OUTPUT_PATH} ``` -Replace `${LIBRARY_PATH}` with path to appropriate library available under [lib](../../lib), `${MODEL_PATH}` with +Replace `${LIBRARY_PATH}` with the path to appropriate library available under [lib](../../lib), `${MODEL_PATH}` with a path to any of the model files available under [lib/common](../../lib/common), `${ACCESS_KEY}` with AccessKey obtained from [Picovoice Console](https://console.picovoice.ai/), `${TEXT}` with the text to be synthesized, and `${WAV_OUTPUT_PATH}` with a path to a output audio file. diff --git a/demo/c/dr_libs b/demo/c/dr_libs new file mode 160000 index 00000000..da35f9d6 --- /dev/null +++ b/demo/c/dr_libs @@ -0,0 +1 @@ +Subproject commit da35f9d6c7374a95353fd1df1d394d44ab66cf01 diff --git a/demo/c/orca_demo.c b/demo/c/orca_demo.c index 93aa1279..35a0c62c 100644 --- a/demo/c/orca_demo.c +++ b/demo/c/orca_demo.c @@ -140,7 +140,7 @@ int picovoice_main(int argc, char **argv) { void *orca_library = open_dl(library_path); if (!orca_library) { - fprintf(stderr, "Failed to open library at '%s'.\n", library_path); + fprintf(stderr, "Failed to open library at `%s`.\n", library_path); exit(EXIT_FAILURE); } @@ -164,20 +164,6 @@ int picovoice_main(int argc, char **argv) { exit(EXIT_FAILURE); } - pv_status_t (*pv_orca_valid_characters_func)(pv_orca_t *, int32_t *, const char *const **) = - load_symbol(orca_library, "pv_orca_valid_characters"); - if (!pv_orca_valid_characters_func) { - print_dl_error("Failed to load 'pv_orca_valid_characters'"); - exit(EXIT_FAILURE); - } - - pv_status_t (*pv_orca_sample_rate_func)(pv_orca_t *, int32_t *) = - load_symbol(orca_library, "pv_orca_sample_rate"); - if (!pv_orca_sample_rate_func) { - print_dl_error("Failed to load 'pv_orca_sample_rate'"); - exit(EXIT_FAILURE); - } - pv_status_t (*pv_orca_synthesize_params_init_func)(pv_orca_synthesize_params_t **) = load_symbol(orca_library, "pv_orca_synthesize_params_init"); if (!pv_orca_synthesize_params_init_func) { @@ -192,23 +178,29 @@ int picovoice_main(int argc, char **argv) { exit(EXIT_FAILURE); } - pv_status_t (*pv_orca_synthesize_params_set_speech_rate_func)(pv_orca_synthesize_params_t *, float) = - load_symbol(orca_library, "pv_orca_synthesize_params_set_speech_rate"); - if (!pv_orca_synthesize_params_set_speech_rate_func) { - print_dl_error("Failed to load 'pv_orca_synthesize_params_set_speech_rate'"); - exit(EXIT_FAILURE); - } - - pv_status_t (*pv_orca_synthesize_to_file_func)(pv_orca_t *, const char *, const pv_orca_synthesize_params_t *, const char *) = + pv_status_t (*pv_orca_synthesize_to_file_func)( + pv_orca_t *, + const char *, + const pv_orca_synthesize_params_t *, + const char *, + int32_t *num_alignments, + pv_orca_word_alignment_t ***alignments) = load_symbol(orca_library, "pv_orca_synthesize_to_file"); if (!pv_orca_synthesize_to_file_func) { print_dl_error("Failed to load 'pv_orca_synthesize_to_file'"); exit(EXIT_FAILURE); } - void (*pv_orca_delete_pcm_func)(int16_t *) = load_symbol(orca_library, "pv_orca_delete_pcm"); - if (!pv_orca_delete_pcm_func) { - print_dl_error("Failed to load 'pv_orca_delete_pcm'"); + pv_status_t (*pv_orca_word_alignments_delete_func)(int32_t, pv_orca_word_alignment_t **) = + load_symbol(orca_library, 
"pv_orca_word_alignments_delete"); + if (!pv_orca_word_alignments_delete_func) { + print_dl_error("Failed to load 'pv_orca_word_alignments_delete'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_pcm_delete_func)(int16_t *) = load_symbol(orca_library, "pv_orca_pcm_delete"); + if (!pv_orca_pcm_delete_func) { + print_dl_error("Failed to load 'pv_orca_pcm_delete'"); exit(EXIT_FAILURE); } @@ -242,10 +234,10 @@ int picovoice_main(int argc, char **argv) { pv_orca_t *orca = NULL; pv_status_t orca_status = pv_orca_init_func(access_key, model_path, &orca); if (orca_status != PV_STATUS_SUCCESS) { - fprintf(stderr, "Failed to create an instance of Orca with '%s'", pv_status_to_string_func(orca_status)); + fprintf(stderr, "Failed to create an instance of Orca with `%s`", pv_status_to_string_func(orca_status)); error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth); if (error_status != PV_STATUS_SUCCESS) { - fprintf(stderr, ".\nUnable to get Orca error state with '%s'.\n", pv_status_to_string_func(error_status)); + fprintf(stderr, ".\nUnable to get Orca error state with `%s`.\n", pv_status_to_string_func(error_status)); exit(EXIT_FAILURE); } @@ -260,8 +252,9 @@ int picovoice_main(int argc, char **argv) { struct timeval after; gettimeofday(&after, NULL); - double init_sec = ((double) (after.tv_sec - before.tv_sec) + - ((double) (after.tv_usec - before.tv_usec)) * 1e-6); + double init_sec = + ((double) (after.tv_sec - before.tv_sec) + + ((double) (after.tv_usec - before.tv_usec)) * 1e-6); fprintf(stdout, "Initialized Orca in %.1f sec\n", init_sec); pv_orca_synthesize_params_t *synthesize_params = NULL; @@ -269,13 +262,13 @@ int picovoice_main(int argc, char **argv) { if (synthesize_params_status != PV_STATUS_SUCCESS) { fprintf( stderr, - "Failed to create an instance of Orca synthesize params with '%s'", + "Failed to create an instance of Orca synthesize params with `%s`", pv_status_to_string_func(synthesize_params_status)); error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth); if (error_status != PV_STATUS_SUCCESS) { fprintf( stderr, - ".\nUnable to get Orca synthesize params error state with '%s'.\n", + ".\nUnable to get Orca synthesize params error state with `%s`.\n", pv_status_to_string_func(error_status)); exit(EXIT_FAILURE); } @@ -291,23 +284,27 @@ int picovoice_main(int argc, char **argv) { double proc_sec = 0.; gettimeofday(&before, NULL); - fprintf(stdout, "Synthesizing text '%s' ...\n", text); + fprintf(stdout, "\nSynthesizing text `%s`\n", text); + int32_t num_alignments = 0; + pv_orca_word_alignment_t **alignments = NULL; pv_status_t synthesize_status = pv_orca_synthesize_to_file_func( orca, text, synthesize_params, - output_path); + output_path, + &num_alignments, + &alignments); if (synthesize_status != PV_STATUS_SUCCESS) { fprintf( stderr, - "Failed to synthesize text with '%s'", + "Failed to synthesize text with `%s`", pv_status_to_string_func(synthesize_params_status)); error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth); if (error_status != PV_STATUS_SUCCESS) { fprintf( stderr, - ".\nUnable to get Orca synthesize error state with '%s'.\n", + ".\nUnable to get Orca synthesize error state with `%s`.\n", pv_status_to_string_func(error_status)); exit(EXIT_FAILURE); } @@ -322,12 +319,45 @@ int picovoice_main(int argc, char **argv) { gettimeofday(&after, NULL); - proc_sec += ((double) (after.tv_sec - before.tv_sec) + - ((double) (after.tv_usec - before.tv_usec)) * 1e-6); + proc_sec += + ((double) (after.tv_sec - 
before.tv_sec) + + ((double) (after.tv_usec - before.tv_usec)) * 1e-6); + + if (num_alignments > 0) { + fprintf(stdout, "\nWord alignments"); + if (num_alignments > 3) { + fprintf(stdout, " (only showing first 3):\n"); + } else { + fprintf(stdout, ":\n"); + } + int32_t num_alignments_shown = num_alignments > 3 ? 3 : num_alignments; + for (int32_t i = 0; i < num_alignments_shown; i++) { + fprintf( + stdout, + "word=\"%s\", start_sec=%.2f, end_sec=%.2f\n", + alignments[i]->word, + alignments[i]->start_sec, + alignments[i]->end_sec); + for (int32_t j = 0; j < alignments[i]->num_phonemes; j++) { + fprintf( + stdout, + "\tphoneme=\"%s\", start_sec=%.2f, end_sec=%.2f\n", + alignments[i]->phonemes[j]->phoneme, + alignments[i]->phonemes[j]->start_sec, + alignments[i]->phonemes[j]->end_sec); + } + } + } - fprintf(stdout, "Synthesized text in %.1f sec\n", proc_sec); + fprintf(stdout, "\nSynthesized text in %.2f sec\n", proc_sec); fprintf(stdout, "Saved audio to `%s`\n", output_path); + pv_status_t delete_status = pv_orca_word_alignments_delete_func(num_alignments, alignments); + if (delete_status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Failed to delete word alignments with `%s`.\n", pv_status_to_string_func(delete_status)); + exit(EXIT_FAILURE); + } + pv_orca_synthesize_params_delete_func(synthesize_params); pv_orca_delete_func(orca); close_dl(orca_library); diff --git a/demo/c/orca_demo_streaming.c b/demo/c/orca_demo_streaming.c new file mode 100644 index 00000000..238f3918 --- /dev/null +++ b/demo/c/orca_demo_streaming.c @@ -0,0 +1,623 @@ +/* +Copyright 2024 Picovoice Inc. + +You may not use this file except in compliance with the license. A copy of +the license is located in the "LICENSE" file accompanying this source. + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations under +the License. 
+*/ + +#include +#include +#include +#include +#include + +#if !(defined(_WIN32) || defined(_WIN64)) + +#include + +#else + +#include + +#define UTF8_COMPOSITION_FLAG (0) +#define NULL_TERMINATED (-1) + +#endif + +#define DR_WAV_IMPLEMENTATION + +#include "dr_wav.h" + +#include "pv_orca.h" + +#define MAX_NUM_CHUNKS (500) +#define MAX_NUM_BYTES_PER_CHARACTER (5) + +static void *open_dl(const char *dl_path) { + +#if defined(_WIN32) || defined(_WIN64) + + return LoadLibrary(dl_path); + +#else + + return dlopen(dl_path, RTLD_NOW); + +#endif +} + +static void *load_symbol(void *handle, const char *symbol) { + +#if defined(_WIN32) || defined(_WIN64) + + return GetProcAddress((HMODULE) handle, symbol); + +#else + + return dlsym(handle, symbol); + +#endif +} + +static void close_dl(void *handle) { + +#if defined(_WIN32) || defined(_WIN64) + + FreeLibrary((HMODULE) handle); + +#else + + dlclose(handle); + +#endif +} + +static void print_dl_error(const char *message) { + +#if defined(_WIN32) || defined(_WIN64) + + fprintf(stderr, "%s with code `%lu`.\n", message, GetLastError()); + +#else + + fprintf(stderr, "%s with `%s`.\n", message, dlerror()); + +#endif +} + +static struct option long_options[] = { + {"access_key", required_argument, NULL, 'a'}, + {"library_path", required_argument, NULL, 'l'}, + {"model_path", required_argument, NULL, 'm'}, + {"text", required_argument, NULL, 't'}, + {"output_path", required_argument, NULL, 'o'}, +}; + +static pv_status_t num_bytes_character(unsigned char c, int32_t *num_bytes) { + *num_bytes = 0; + + int32_t nb; + if ((c & 0x80) == 0x00) { + nb = 1; + } else if ((c & 0xE0) == 0xC0) { + nb = 2; + } else if ((c & 0xF0) == 0xE0) { + nb = 3; + } else if ((c & 0xF8) == 0xF0) { + nb = 4; + } else { + return PV_STATUS_INVALID_ARGUMENT; + } + + *num_bytes = nb; + + return PV_STATUS_SUCCESS; +} + +static double get_time() { + struct timeval tv; + gettimeofday(&tv, NULL); + return (double) tv.tv_sec + ((double) tv.tv_usec * 1e-6); +} + +static void print_usage(const char *program_name) { + fprintf( + stdout, + "Usage: %s [-l LIBRARY_PATH -m MODEL_PATH -a ACCESS_KEY -t TEXT -o OUTPUT_PATH]\n", + program_name); +} + +typedef struct pcm_chunk pcm_chunk_t; + +struct pcm_chunk { + int32_t num_samples; + int16_t *pcm; + pcm_chunk_t *next; +}; + +static pv_status_t pcm_chunk_init( + int32_t num_samples, + int16_t *pcm, + pcm_chunk_t **chunk) { + *chunk = NULL; + + pcm_chunk_t *c = calloc(1, sizeof(pcm_chunk_t)); + if (!c) { + return PV_STATUS_OUT_OF_MEMORY; + } + + c->pcm = pcm; + c->num_samples = num_samples; + c->next = NULL; + + *chunk = c; + + return PV_STATUS_SUCCESS; +} + +static pv_status_t pcm_chunk_delete(pcm_chunk_t *chunk) { + if (chunk) { + free(chunk->pcm); + free(chunk); + } + return PV_STATUS_SUCCESS; +} + +void print_error_message(char **message_stack, int32_t message_stack_depth) { + for (int32_t i = 0; i < message_stack_depth; i++) { + fprintf(stderr, " [%d] %s\n", i, message_stack[i]); + } +} + +void handle_error( + char **message_stack, + int32_t message_stack_depth, + pv_status_t (*pv_get_error_stack_func)(char ***, int32_t *), + void (*pv_free_error_stack_func)(char **), + const char *(*pv_status_to_string_func)(pv_status_t)) { + pv_status_t error_status = pv_get_error_stack_func(&message_stack, &message_stack_depth); + + if (error_status != PV_STATUS_SUCCESS) { + fprintf(stderr, ".\nUnable to get Orca error state with '%s'\n", pv_status_to_string_func(error_status)); + exit(EXIT_FAILURE); + } + + if (message_stack_depth > 0) { + fprintf(stderr, ":\n"); + 
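+        // Print each frame of the error stack returned by pv_get_error_stack().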
for (int32_t i = 0; i < message_stack_depth; i++) { + fprintf(stderr, " [%d] %s\n", i, message_stack[i]); + } + } + + pv_free_error_stack_func(message_stack); +} + +int32_t picovoice_main(int32_t argc, char **argv) { + const char *library_path = NULL; + const char *model_path = NULL; + const char *access_key = NULL; + const char *text = NULL; + const char *output_path = NULL; + + int32_t c; + while ((c = getopt_long(argc, argv, "l:m:a:t:o:", long_options, NULL)) != -1) { + switch (c) { + case 'l': + library_path = optarg; + break; + case 'm': + model_path = optarg; + break; + case 'a': + access_key = optarg; + break; + case 't': + text = optarg; + break; + case 'o': + output_path = optarg; + break; + default: + exit(EXIT_FAILURE); + } + } + + if (!library_path || !model_path || !access_key || !text || !output_path) { + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + void *orca_library = open_dl(library_path); + if (!orca_library) { + fprintf(stderr, "Failed to open library at `%s`.\n", library_path); + exit(EXIT_FAILURE); + } + + const char *(*pv_status_to_string_func)(pv_status_t) = + load_symbol(orca_library, "pv_status_to_string"); + if (!pv_status_to_string_func) { + print_dl_error("Failed to load 'pv_status_to_string'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_init_func)(const char *, const char *, pv_orca_t **) = + load_symbol(orca_library, "pv_orca_init"); + if (!pv_orca_init_func) { + print_dl_error("Failed to load 'pv_orca_init'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_delete_func)(pv_orca_t *) = load_symbol(orca_library, "pv_orca_delete"); + if (!pv_orca_delete_func) { + print_dl_error("Failed to load 'pv_orca_delete'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_sample_rate_func)(pv_orca_t *, int32_t *) = + load_symbol(orca_library, "pv_orca_sample_rate"); + if (!pv_orca_sample_rate_func) { + print_dl_error("Failed to load 'pv_orca_sample_rate'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_synthesize_params_init_func)(pv_orca_synthesize_params_t **) = + load_symbol(orca_library, "pv_orca_synthesize_params_init"); + if (!pv_orca_synthesize_params_init_func) { + print_dl_error("Failed to load 'pv_orca_synthesize_params_init'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_synthesize_params_delete_func)(pv_orca_synthesize_params_t *) = + load_symbol(orca_library, "pv_orca_synthesize_params_delete"); + if (!pv_orca_synthesize_params_delete_func) { + print_dl_error("Failed to load 'pv_orca_synthesize_params_delete'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_pcm_delete_func)(int16_t *) = load_symbol(orca_library, "pv_orca_pcm_delete"); + if (!pv_orca_pcm_delete_func) { + print_dl_error("Failed to load 'pv_orca_pcm_delete'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_stream_open_func)( + pv_orca_t *, + const pv_orca_synthesize_params_t *, + pv_orca_stream_t **) = load_symbol(orca_library, "pv_orca_stream_open"); + if (!pv_orca_stream_open_func) { + print_dl_error("Failed to load 'pv_orca_stream_open'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_stream_synthesize_func)( + pv_orca_stream_t *, + const char *, + int32_t *, + int16_t **) = load_symbol(orca_library, "pv_orca_stream_synthesize"); + if (!pv_orca_stream_synthesize_func) { + print_dl_error("Failed to load 'pv_orca_stream_synthesize'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_orca_stream_flush_func)( + pv_orca_stream_t *, + int32_t *, + int16_t **) = load_symbol(orca_library, "pv_orca_stream_flush"); + if (!pv_orca_stream_flush_func) { + 
print_dl_error("Failed to load 'pv_orca_stream_flush'"); + exit(EXIT_FAILURE); + } + + void (*pv_orca_stream_close_func)(pv_orca_stream_t *) = load_symbol(orca_library, "pv_orca_stream_close"); + if (!pv_orca_stream_close_func) { + print_dl_error("Failed to load 'pv_orca_stream_close'"); + exit(EXIT_FAILURE); + } + + const char *(*pv_orca_version_func)() = load_symbol(orca_library, "pv_orca_version"); + if (!pv_orca_version_func) { + print_dl_error("Failed to load 'pv_orca_version'"); + exit(EXIT_FAILURE); + } + + pv_status_t (*pv_get_error_stack_func)(char ***, int32_t *) = load_symbol(orca_library, "pv_get_error_stack"); + if (!pv_get_error_stack_func) { + print_dl_error("Failed to load 'pv_get_error_stack'"); + exit(EXIT_FAILURE); + } + + void (*pv_free_error_stack_func)(char **) = load_symbol(orca_library, "pv_free_error_stack"); + if (!pv_free_error_stack_func) { + print_dl_error("Failed to load 'pv_free_error_stack'"); + exit(EXIT_FAILURE); + } + + char **message_stack = NULL; + int32_t message_stack_depth = 0; + + fprintf(stdout, "Orca version: %s\n\n", pv_orca_version_func()); + + double time_before_init = get_time(); + + pv_orca_t *orca = NULL; + pv_status_t orca_status = pv_orca_init_func(access_key, model_path, &orca); + if (orca_status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Failed to create an instance of Orca with `%s`", pv_status_to_string_func(orca_status)); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + double init_sec = get_time() - time_before_init; + fprintf(stdout, "Initialized Orca in %.1f sec\n", init_sec); + + int32_t sample_rate = 0; + pv_status_t status = pv_orca_sample_rate_func(orca, &sample_rate); + if (status != PV_STATUS_SUCCESS) { + fprintf(stderr, "Failed to get Orca sample rate with `%s`", pv_status_to_string_func(status)); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + drwav_data_format format; + format.container = drwav_container_riff; + format.format = DR_WAVE_FORMAT_PCM; + format.channels = 1; + format.sampleRate = sample_rate; + format.bitsPerSample = 16; + + drwav output_file; + +#if defined(_WIN32) || defined(_WIN64) + + int output_path_wchars_num = MultiByteToWideChar(CP_UTF8, UTF8_COMPOSITION_FLAG, output_path, NULL_TERMINATED, NULL, 0); + wchar_t output_path_w[output_path_wchars_num]; + MultiByteToWideChar(CP_UTF8, UTF8_COMPOSITION_FLAG, output_path, NULL_TERMINATED, output_path_w, output_path_wchars_num); + unsigned int drwav_init_file_status = drwav_init_file_write_w(&output_file, output_path_w, &format, NULL); + +#else + + unsigned int drwav_init_file_status = drwav_init_file_write(&output_file, output_path, &format, NULL); + +#endif + + if (!drwav_init_file_status) { + fprintf(stderr, "Failed to open the output wav file at '%s'.", output_path); + exit(EXIT_FAILURE); + } + + pv_orca_synthesize_params_t *synthesize_params = NULL; + pv_status_t synthesize_params_status = pv_orca_synthesize_params_init_func(&synthesize_params); + if (synthesize_params_status != PV_STATUS_SUCCESS) { + fprintf( + stderr, + "Failed to create an instance of Orca synthesize params with `%s`", + pv_status_to_string_func(synthesize_params_status)); + handle_error( + message_stack, + message_stack_depth, + pv_get_error_stack_func, + pv_free_error_stack_func, + pv_status_to_string_func); + exit(EXIT_FAILURE); + } + + 
+    fprintf(stdout, "\nSynthesizing text `%s`\n", text);
+
+    int32_t num_samples_chunks[MAX_NUM_CHUNKS] = {0};
+    double start_chunks[MAX_NUM_CHUNKS] = {0};
+    start_chunks[0] = get_time();
+    double end_chunks[MAX_NUM_CHUNKS] = {0};
+    int32_t num_chunks = 0;
+
+    pcm_chunk_t *pcm_chunk_prev = NULL;
+    pcm_chunk_t *pcm_chunk_head = NULL;
+
+    pv_orca_stream_t *orca_stream = NULL;
+    pv_status_t stream_open_status = pv_orca_stream_open_func(orca, synthesize_params, &orca_stream);
+    if (stream_open_status != PV_STATUS_SUCCESS) {
+        fprintf(stderr, "Error opening stream");
+        handle_error(
+                message_stack,
+                message_stack_depth,
+                pv_get_error_stack_func,
+                pv_free_error_stack_func,
+                pv_status_to_string_func);
+        exit(EXIT_FAILURE);
+    }
+
+    char character[MAX_NUM_BYTES_PER_CHARACTER] = {0};
+    for (int32_t i = 0; i < (int32_t) strlen(text); i++) {
+        if (num_chunks > (MAX_NUM_CHUNKS - 1)) {
+            fprintf(stderr, "Trying to synthesize too many chunks. Only `%d` chunks are supported.\n", MAX_NUM_CHUNKS);
+            exit(EXIT_FAILURE);
+        }
+
+        int32_t num_bytes = 0;
+        status = num_bytes_character((unsigned char) text[i], &num_bytes);
+        if (status != PV_STATUS_SUCCESS) {
+            fprintf(stderr, "Error getting number of bytes for character: `%c`\n", text[i]);
+            exit(EXIT_FAILURE);
+        }
+
+        for (int32_t j = 0; j < num_bytes; j++) {
+            character[j] = text[i + j];
+        }
+        character[num_bytes] = '\0';
+
+        // Skip the continuation bytes of a multi-byte UTF-8 character so the
+        // next iteration starts at the following character's lead byte.
+        i += num_bytes - 1;
+
+        int32_t num_samples_chunk = 0;
+        int16_t *pcm_chunk = NULL;
+        status = pv_orca_stream_synthesize_func(orca_stream, character, &num_samples_chunk, &pcm_chunk);
+        if (status != PV_STATUS_SUCCESS) {
+            fprintf(stderr, "Error adding token: `%s`", character);
+            handle_error(
+                    message_stack,
+                    message_stack_depth,
+                    pv_get_error_stack_func,
+                    pv_free_error_stack_func,
+                    pv_status_to_string_func);
+            exit(EXIT_FAILURE);
+        }
+
+        if (num_samples_chunk > 0) {
+            if (pcm_chunk_prev == NULL) {
+                pcm_chunk_init(num_samples_chunk, pcm_chunk, &pcm_chunk_prev);
+                pcm_chunk_head = pcm_chunk_prev;
+            } else {
+                pcm_chunk_init(num_samples_chunk, pcm_chunk, &(pcm_chunk_prev->next));
+                pcm_chunk_prev = pcm_chunk_prev->next;
+            }
+
+            double timestamp = get_time();
+            num_samples_chunks[num_chunks] = num_samples_chunk;
+            end_chunks[num_chunks++] = timestamp;
+            // Guard against writing one slot past the end of `start_chunks`.
+            if (num_chunks < MAX_NUM_CHUNKS) {
+                start_chunks[num_chunks] = timestamp;
+            }
+        }
+    }
+
+    int32_t num_samples_chunk = 0;
+    int16_t *pcm_chunk = NULL;
+    status = pv_orca_stream_flush_func(orca_stream, &num_samples_chunk, &pcm_chunk);
+    if (status != PV_STATUS_SUCCESS) {
+        fprintf(stderr, "Error flushing Orca stream");
+        handle_error(
+                message_stack,
+                message_stack_depth,
+                pv_get_error_stack_func,
+                pv_free_error_stack_func,
+                pv_status_to_string_func);
+        exit(EXIT_FAILURE);
+    }
+
+    if (num_samples_chunk > 0) {
+        if (pcm_chunk_prev == NULL) {
+            pcm_chunk_init(num_samples_chunk, pcm_chunk, &pcm_chunk_prev);
+            pcm_chunk_head = pcm_chunk_prev;
+        } else {
+            pcm_chunk_init(num_samples_chunk, pcm_chunk, &(pcm_chunk_prev->next));
+        }
+
+        double timestamp = get_time();
+        num_samples_chunks[num_chunks] = num_samples_chunk;
+        end_chunks[num_chunks++] = timestamp;
+        if (num_chunks < MAX_NUM_CHUNKS) {
+            start_chunks[num_chunks] = timestamp;
+        }
+    }
+
+    pv_orca_stream_close_func(orca_stream);
+    pv_orca_synthesize_params_delete_func(synthesize_params);
+    pv_orca_delete_func(orca);
+
+    int32_t num_samples = 0;
+    pcm_chunk_t *pcm_chunk_iter = pcm_chunk_head;
+    while (pcm_chunk_iter != NULL) {
+        num_samples += pcm_chunk_iter->num_samples;
+        pcm_chunk_iter = pcm_chunk_iter->next;
+    }
+
+    int16_t *pcm = malloc(num_samples * sizeof(int16_t));
+    if (!pcm) {
+        fprintf(stderr, "Failed to allocate memory for the output audio.\n");
+        exit(EXIT_FAILURE);
+    }
+    int32_t offset = 0;
+    pcm_chunk_iter = pcm_chunk_head;
+    while (pcm_chunk_iter != NULL) {
+        memcpy(&pcm[offset], pcm_chunk_iter->pcm, pcm_chunk_iter->num_samples * sizeof(int16_t));
+        offset += pcm_chunk_iter->num_samples;
+        pcm_chunk_iter = pcm_chunk_iter->next;
+    }
+
+    pcm_chunk_iter = pcm_chunk_head;
+    while (pcm_chunk_iter != NULL) {
+        pcm_chunk_t *tmp = pcm_chunk_iter;
+        pcm_chunk_iter = pcm_chunk_iter->next;
+        pcm_chunk_delete(tmp);
+    }
+
+    if ((int32_t) drwav_write_pcm_frames(&output_file, num_samples, pcm) != num_samples) {
+        fprintf(stderr, "Failed to write to output file.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    drwav_uninit(&output_file);
+    free(pcm);
+
+    fprintf(
+            stdout,
+            "\nGenerated %d audio chunk%s in %.2f seconds.\n",
+            num_chunks, num_chunks == 1 ? "" : "s",
+            end_chunks[num_chunks - 1] - start_chunks[0]);
+
+    for (int32_t i = 0; i < num_chunks; i++) {
+        float num_seconds = (float) num_samples_chunks[i] / (float) sample_rate;
+        double process_time = end_chunks[i] - start_chunks[i];
+        fprintf(
+                stdout,
+                "Audio chunk #%d: length: %.2f s, processing time: %.2f s\n",
+                i,
+                num_seconds,
+                process_time);
+    }
+
+    fprintf(stdout, "\nSaved final audio to `%s`\n", output_path);
+
+    close_dl(orca_library);
+
+    return EXIT_SUCCESS;
+}
+
+int32_t main(int argc, char *argv[]) {
+
+#if defined(_WIN32) || defined(_WIN64)
+
+#define UTF8_COMPOSITION_FLAG (0)
+#define NULL_TERMINATED (-1)
+
+    LPWSTR *wargv = CommandLineToArgvW(GetCommandLineW(), &argc);
+    if (wargv == NULL) {
+        fprintf(stderr, "CommandLineToArgvW failed\n");
+        exit(EXIT_FAILURE);
+    }
+
+    char *utf8_argv[argc];
+
+    for (int32_t i = 0; i < argc; ++i) {
+        // WideCharToMultiByte:
+        // https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
+        int arg_chars_num =
+                WideCharToMultiByte(CP_UTF8, UTF8_COMPOSITION_FLAG, wargv[i], NULL_TERMINATED, NULL, 0, NULL, NULL);
+        utf8_argv[i] = (char *) malloc(arg_chars_num * sizeof(char));
+        if (!utf8_argv[i]) {
+            fprintf(stderr, "Failed to allocate memory for converting args\n");
+            exit(EXIT_FAILURE);
+        }
+        WideCharToMultiByte(CP_UTF8, UTF8_COMPOSITION_FLAG, wargv[i], NULL_TERMINATED, utf8_argv[i], arg_chars_num, NULL, NULL);
+    }
+
+    LocalFree(wargv);
+    argv = utf8_argv;
+
+#endif
+
+    int result = picovoice_main(argc, argv);
+
+#if defined(_WIN32) || defined(_WIN64)
+
+    for (int i = 0; i < argc; ++i) {
+        free(utf8_argv[i]);
+    }
+
+#endif
+
+    return result;
+}
diff --git a/demo/c/test/test_orca_c.py b/demo/c/test/test_orca_c.py
index 045a951e..c88728eb 100644
--- a/demo/c/test/test_orca_c.py
+++ b/demo/c/test/test_orca_c.py
@@ -10,21 +10,29 @@
 #
 
 import os.path
+import platform as pltf
 import subprocess
 import sys
 import unittest
 
 from test_util import get_model_paths, get_test_data
 
-test_sentences = get_test_data()
+test_data = get_test_data()
 
 
 class OrcaCTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls._access_key = sys.argv[1]
-        cls._platform = sys.argv[2]
-        cls._arch = "" if len(sys.argv) != 4 else sys.argv[3]
+        platform = sys.argv[2]
+        if platform == "mac":
+            if pltf.machine() == "x86_64":
+                cls._arch = "x86_64"
+            elif pltf.machine() == "arm64":
+                cls._arch = "arm64"
+        else:
+            cls._arch = "" if len(sys.argv) != 4 else sys.argv[3]
+        cls._platform = platform
         cls._root_dir = os.path.join(os.path.dirname(__file__), "../../..")
 
     @staticmethod
@@ -52,18 +60,43 @@ def run_orca(self, model_path: str) -> None:
             "-a", self._access_key,
             "-l", self._get_library_file(),
             "-m", model_path,
-            "-t", test_sentences.text,
+            "-t", test_data.text,
             "-o", output_path,
         ]
 
         process = subprocess.Popen(args, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
         stdout, stderr = process.communicate()
 
-        self.assertEqual(process.poll(), 0)
+        poll_result = process.poll()
+        if poll_result != 0:
+            print(stdout.decode('utf-8'))
+            print(stderr.decode('utf-8'))
+            raise RuntimeError("Error running demo. See details above")
+
+        self.assertEqual(poll_result, 0)
         self.assertEqual(stderr.decode('utf-8'), '')
         self.assertTrue("Saved audio" in stdout.decode('utf-8'))
 
         os.remove(output_path)
 
+    def run_orca_streaming(self, model_path: str) -> None:
+        output_path = os.path.join(os.path.dirname(__file__), "output.wav")
+        args = [
+            os.path.join(os.path.dirname(__file__), "../build/orca_demo_streaming"),
+            "-a", self._access_key,
+            "-l", self._get_library_file(),
+            "-m", model_path,
+            "-t", test_data.text,
+            "-o", output_path,
+        ]
+
+        process = subprocess.Popen(args, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+
+        self.assertEqual(process.poll(), 0)
+        self.assertEqual(stderr.decode('utf-8'), '')
+        self.assertTrue("Saved final audio" in stdout.decode('utf-8'))
+        os.remove(output_path)
+
     def test_orca(self) -> None:
         for model_path in get_model_paths():
             self.run_orca(model_path=model_path)
diff --git a/demo/c/test/test_util.py b/demo/c/test/test_util.py
index 430729e8..21acd5de 100644
--- a/demo/c/test/test_util.py
+++ b/demo/c/test/test_util.py
@@ -29,6 +29,7 @@ def get_test_data() -> TestSentences:
     with open(data_file_path, encoding="utf8") as data_file:
         json_test_data = data_file.read()
     test_data = json.loads(json_test_data)['test_sentences']
+    test_data.pop("text_alignment")
 
     return TestSentences(**test_data)
 
diff --git a/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj b/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj
index cf096214..935a9715 100644
--- a/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj
+++ b/demo/ios/OrcaDemo/OrcaDemo.xcodeproj/project.pbxproj
@@ -13,7 +13,9 @@
 		02A1195F268D3FD600A2AC99 /* ViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 02A1195E268D3FD600A2AC99 /* ViewModel.swift */; };
 		1E001B682B76FFE700D8E72D /* AudioPlayer.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1E001B672B76FFE700D8E72D /* AudioPlayer.swift */; };
 		1E001B6A2B7D451200D8E72D /* orca_params_female.pv in Resources */ = {isa = PBXBuildFile; fileRef = 1E001B692B7D451200D8E72D /* orca_params_female.pv */; };
-		4A00B7EF2D4C1FA9D1C474E1 /* libPods-OrcaDemo.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 7C8FD0A21A7AA22B5EB1EB2D /* libPods-OrcaDemo.a */; };
+		B218600C461D96EA568B6D6C /* libPods-OrcaDemo.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A9E91B80C84BF594FCF1FCBD /* libPods-OrcaDemo.a */; };
+		E125E1892BE99DCA008B6D56 /* AtomicBool.swift in Sources */ = {isa = PBXBuildFile; fileRef = E125E1882BE99DCA008B6D56 /* AtomicBool.swift */; };
+		E1C5A45F2BE587A2002C0C40 /* AudioPlayerStream.swift in Sources */ = {isa = PBXBuildFile; fileRef = E1C5A45E2BE587A2002C0C40 /* AudioPlayerStream.swift */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXFileReference section */
@@ -25,12 +27,11 @@
 		02A1195E268D3FD600A2AC99 /* ViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewModel.swift; sourceTree = "<group>"; };
 		1E001B672B76FFE700D8E72D /* AudioPlayer.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioPlayer.swift; sourceTree = "<group>"; };
 		1E001B692B7D451200D8E72D /* orca_params_female.pv */ = {isa = PBXFileReference; lastKnownFileType = file; name = orca_params_female.pv; path = ../../../../lib/common/orca_params_female.pv; sourceTree = "<group>"; };
-		25220F02E797CC78BF7E6619 /* libPods-OrcaDemoUITests.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaDemoUITests.a"; sourceTree = BUILT_PRODUCTS_DIR; };
-		544345CBBDA09211F4620F3E /* Pods-OrcaDemo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.release.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.release.xcconfig"; sourceTree = "<group>"; };
-		72F8162D9843C0A1C546BE64 /* Pods-OrcaDemoUITests.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemoUITests.debug.xcconfig"; path = "Target Support Files/Pods-OrcaDemoUITests/Pods-OrcaDemoUITests.debug.xcconfig"; sourceTree = "<group>"; };
-		7C8FD0A21A7AA22B5EB1EB2D /* libPods-OrcaDemo.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaDemo.a"; sourceTree = BUILT_PRODUCTS_DIR; };
-		8DC160B174C3E4AE3F56942D /* Pods-OrcaDemo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.debug.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.debug.xcconfig"; sourceTree = "<group>"; };
-		D2D9DCA10D9D1AB213098AEF /* Pods-OrcaDemoUITests.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemoUITests.release.xcconfig"; path = "Target Support Files/Pods-OrcaDemoUITests/Pods-OrcaDemoUITests.release.xcconfig"; sourceTree = "<group>"; };
+		2C3AE1B63A5DD37711F6DD7E /* Pods-OrcaDemo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.debug.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.debug.xcconfig"; sourceTree = "<group>"; };
+		97762F0F3B18F16DC68C5D67 /* Pods-OrcaDemo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OrcaDemo.release.xcconfig"; path = "Target Support Files/Pods-OrcaDemo/Pods-OrcaDemo.release.xcconfig"; sourceTree = "<group>"; };
+		A9E91B80C84BF594FCF1FCBD /* libPods-OrcaDemo.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OrcaDemo.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+		E125E1882BE99DCA008B6D56 /* AtomicBool.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AtomicBool.swift; sourceTree = "<group>"; };
+		E1C5A45E2BE587A2002C0C40 /* AudioPlayerStream.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AudioPlayerStream.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -38,7 +39,7 @@
 		isa = PBXFrameworksBuildPhase;
 		buildActionMask = 2147483647;
 		files = (
-			4A00B7EF2D4C1FA9D1C474E1 /* libPods-OrcaDemo.a in Frameworks */,
+			B218600C461D96EA568B6D6C /* libPods-OrcaDemo.a in Frameworks */,
 		);
 		runOnlyForDeploymentPostprocessing = 0;
 	};
@@ -50,8 +51,8 @@
 			children = (
 				02A11947268D39A700A2AC99 /* OrcaDemo */,
 				02A11946268D39A700A2AC99 /* Products */,
-				02A11957268D39D100A2AC99 /* Frameworks */,
 				8DB92FF3DC81AB04D3FF7242 /* Pods */,
+				4374BA75AB06EC0D059377CD /* Frameworks */,
 			);
 			sourceTree = "<group>";
 		};
@@ -73,15 +74,16 @@
 				02A11951268D39AB00A2AC99 /* Info.plist */,
 				02A1195E268D3FD600A2AC99 /* ViewModel.swift */,
 				1E001B672B76FFE700D8E72D /* AudioPlayer.swift */,
+				E1C5A45E2BE587A2002C0C40 /* AudioPlayerStream.swift */,
+				E125E1882BE99DCA008B6D56 /* AtomicBool.swift */,
 			);
 			path = OrcaDemo;
 			sourceTree = "<group>";
 		};
-		02A11957268D39D100A2AC99 /* Frameworks */ = {
+		4374BA75AB06EC0D059377CD /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
-				7C8FD0A21A7AA22B5EB1EB2D /* libPods-OrcaDemo.a */,
-				25220F02E797CC78BF7E6619 /* libPods-OrcaDemoUITests.a */,
+				A9E91B80C84BF594FCF1FCBD /* libPods-OrcaDemo.a */,
 			);
 			name = Frameworks;
 			sourceTree = "<group>";
 		};
@@ -89,10 +91,8 @@
 		8DB92FF3DC81AB04D3FF7242 /* Pods */ = {
 			isa = PBXGroup;
 			children = (
-				8DC160B174C3E4AE3F56942D /* Pods-OrcaDemo.debug.xcconfig */,
-				544345CBBDA09211F4620F3E /* Pods-OrcaDemo.release.xcconfig */,
-				72F8162D9843C0A1C546BE64 /* Pods-OrcaDemoUITests.debug.xcconfig */,
-				D2D9DCA10D9D1AB213098AEF /* Pods-OrcaDemoUITests.release.xcconfig */,
+				2C3AE1B63A5DD37711F6DD7E /* Pods-OrcaDemo.debug.xcconfig */,
+				97762F0F3B18F16DC68C5D67 /* Pods-OrcaDemo.release.xcconfig */,
 			);
 			path = Pods;
 			sourceTree = "<group>";
 		};
@@ -104,11 +104,11 @@
 		isa = PBXNativeTarget;
 		buildConfigurationList = 02A11954268D39AB00A2AC99 /* Build configuration list for PBXNativeTarget "OrcaDemo" */;
 		buildPhases = (
-			9E7C8E83BA330F7017CD5C56 /* [CP] Check Pods Manifest.lock */,
+			E5EA3B129B59D3DF4752D82D /* [CP] Check Pods Manifest.lock */,
 			02A11941268D39A700A2AC99 /* Sources */,
 			02A11942268D39A700A2AC99 /* Frameworks */,
 			02A11943268D39A700A2AC99 /* Resources */,
-			B387D574F16D312B6FFB5B42 /* [CP] Embed Pods Frameworks */,
+			E85A144184F1D605DB772089 /* [CP] Embed Pods Frameworks */,
 		);
 		buildRules = (
 		);
@@ -164,7 +164,7 @@
 /* End PBXResourcesBuildPhase section */
 
 /* Begin PBXShellScriptBuildPhase section */
-		9E7C8E83BA330F7017CD5C56 /* [CP] Check Pods Manifest.lock */ = {
+		E5EA3B129B59D3DF4752D82D /* [CP] Check Pods Manifest.lock */ = {
 			isa = PBXShellScriptBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
@@ -186,7 +186,7 @@
 			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock.
Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; showEnvVarsInLog = 0; }; - B387D574F16D312B6FFB5B42 /* [CP] Embed Pods Frameworks */ = { + E85A144184F1D605DB772089 /* [CP] Embed Pods Frameworks */ = { isa = PBXShellScriptBuildPhase; buildActionMask = 2147483647; files = ( @@ -210,6 +210,8 @@ isa = PBXSourcesBuildPhase; buildActionMask = 2147483647; files = ( + E125E1892BE99DCA008B6D56 /* AtomicBool.swift in Sources */, + E1C5A45F2BE587A2002C0C40 /* AudioPlayerStream.swift in Sources */, 02A1194B268D39A700A2AC99 /* ContentView.swift in Sources */, 02A1195F268D3FD600A2AC99 /* ViewModel.swift in Sources */, 02A11949268D39A700A2AC99 /* OrcaDemoApp.swift in Sources */, @@ -338,13 +340,13 @@ }; 02A11955268D39AB00A2AC99 /* Debug */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 8DC160B174C3E4AE3F56942D /* Pods-OrcaDemo.debug.xcconfig */; + baseConfigurationReference = 2C3AE1B63A5DD37711F6DD7E /* Pods-OrcaDemo.debug.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; DEVELOPMENT_ASSET_PATHS = ""; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = Y6S42VUYBV; ENABLE_PREVIEWS = YES; INFOPLIST_FILE = OrcaDemo/Info.plist; IPHONEOS_DEPLOYMENT_TARGET = 14.0; @@ -352,7 +354,7 @@ "$(inherited)", "@executable_path/Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo; + PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo.albert; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; @@ -361,13 +363,13 @@ }; 02A11956268D39AB00A2AC99 /* Release */ = { isa = XCBuildConfiguration; - baseConfigurationReference = 544345CBBDA09211F4620F3E /* Pods-OrcaDemo.release.xcconfig */; + baseConfigurationReference = 97762F0F3B18F16DC68C5D67 /* Pods-OrcaDemo.release.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; CODE_SIGN_STYLE = Automatic; DEVELOPMENT_ASSET_PATHS = ""; - DEVELOPMENT_TEAM = 65723695GD; + DEVELOPMENT_TEAM = Y6S42VUYBV; ENABLE_PREVIEWS = YES; INFOPLIST_FILE = OrcaDemo/Info.plist; IPHONEOS_DEPLOYMENT_TARGET = 14.0; @@ -375,7 +377,7 @@ "$(inherited)", "@executable_path/Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo; + PRODUCT_BUNDLE_IDENTIFIER = ai.picovoice.OrcaDemo.albert; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_VERSION = 5.0; TARGETED_DEVICE_FAMILY = "1,2"; diff --git a/demo/ios/OrcaDemo/OrcaDemo/AtomicBool.swift b/demo/ios/OrcaDemo/OrcaDemo/AtomicBool.swift new file mode 100644 index 00000000..f20c3e98 --- /dev/null +++ b/demo/ios/OrcaDemo/OrcaDemo/AtomicBool.swift @@ -0,0 +1,22 @@ +import Foundation + +class AtomicBool { + private var value: Bool + private let lock = NSLock() + + init(_ value: Bool = false) { + self.value = value + } + + func set(_ newValue: Bool) { + lock.lock() + value = newValue + lock.unlock() + } + + func get() -> Bool { + lock.lock() + defer { lock.unlock() } + return value + } +} diff --git a/demo/ios/OrcaDemo/OrcaDemo/AudioPlayerStream.swift b/demo/ios/OrcaDemo/OrcaDemo/AudioPlayerStream.swift new file mode 100644 index 00000000..f2860eaf --- /dev/null +++ b/demo/ios/OrcaDemo/OrcaDemo/AudioPlayerStream.swift @@ -0,0 +1,71 @@ +import Foundation +import AVFoundation + +class AudioPlayerStream { + private let engine = AVAudioEngine() + 
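// Streaming-playback helper: incoming Int16 PCM chunks are converted to
+    // Float32 buffers and scheduled on an AVAudioPlayerNode
+    // (playerNode -> mixerNode -> engine.outputNode), so playback can begin
+    // before the full text has been synthesized.
+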
private let playerNode = AVAudioPlayerNode() + private let mixerNode = AVAudioMixerNode() + + private var pcmBuffers = [[Int16]]() + private var isPlaying = false + + init(sampleRate: Double) throws { + let audioSession = AVAudioSession.sharedInstance() + try audioSession.setCategory(.playback, mode: .default) + try audioSession.setActive(true) + + let format = AVAudioFormat( + commonFormat: .pcmFormatFloat32, + sampleRate: sampleRate, + channels: AVAudioChannelCount(1), + interleaved: false) + + engine.attach(mixerNode) + engine.connect(mixerNode, to: engine.outputNode, format: format) + + engine.attach(playerNode) + engine.connect(playerNode, to: mixerNode, format: format) + + try engine.start() + } + + func playStreamPCM(_ pcmData: [Int16], completion: @escaping (Bool) -> Void) { + pcmBuffers.append(pcmData) + if !isPlaying { + playNextPCMBuffer(completion: completion) + } else { + completion(true) + } + } + + private func playNextPCMBuffer(completion: @escaping (Bool) -> Void) { + guard let pcmData = pcmBuffers.first, !pcmData.isEmpty else { + isPlaying = false + completion(false) + return + } + pcmBuffers.removeFirst() + + let audioBuffer = AVAudioPCMBuffer( + pcmFormat: playerNode.outputFormat(forBus: 0), frameCapacity: AVAudioFrameCount(pcmData.count))! + + audioBuffer.frameLength = audioBuffer.frameCapacity + let buf = audioBuffer.floatChannelData![0] + for (index, sample) in pcmData.enumerated() { + buf[index] = Float32(sample) / Float32(Int16.max) + } + + playerNode.scheduleBuffer(audioBuffer) { [weak self] in + self?.playNextPCMBuffer(completion: completion) + } + + playerNode.play() + isPlaying = true + completion(true) + } + + func stopStreamPCM() { + playerNode.stop() + engine.stop() + } +} diff --git a/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift b/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift index 01453210..cb53f8bf 100644 --- a/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift +++ b/demo/ios/OrcaDemo/OrcaDemo/ContentView.swift @@ -19,47 +19,95 @@ struct ContentView: View { let lightGray = Color(red: 247 / 255, green: 247 / 255, blue: 247 / 255, opacity: 1) var body: some View { + let streamingMode = viewModel.state == .STREAM_OPEN || viewModel.state == .STREAM_PLAYING let interactionDisabled = - !viewModel.errorMessage.isEmpty || viewModel.state == UIState.PROCESSING - || viewModel.state == UIState.INIT || text.isEmpty || !viewModel.invalidTextMessage.isEmpty + !viewModel.errorMessage.isEmpty || viewModel.state == .PROCESSING + || viewModel.state == .INIT || (!streamingMode && !viewModel.invalidTextMessage.isEmpty) + let toggleDisabled = interactionDisabled || viewModel.state == .STREAM_PLAYING + let buttonDisabled = toggleDisabled || text.isEmpty + GeometryReader { _ in VStack(spacing: 10) { - GeometryReader { geometry in - VStack { - ScrollView { - ZStack(alignment: .topLeading) { - TextEditor(text: $text) - .transparentScrolling() - .padding() - .frame(minWidth: 0, - maxWidth: .infinity, - minHeight: geometry.size.height, - maxHeight: .infinity) - .font(.title3) - .background(lightGray) - .onChange(of: text) { _ in - text = String(text.prefix(Int(viewModel.maxCharacterLimit))) - viewModel.isValid(text: text) - } + Toggle( + isOn: Binding( + get: { streamingMode }, + set: { _ in viewModel.toggleStreaming() } + ), + label: { Text("Streaming Synthesis") } + ) + .disabled(toggleDisabled) + .onChange(of: streamingMode) { _ in text = "" } + .foregroundColor(Color.black) - if text.count == 0 { - Text("Enter any text to be synthesized") - .padding(25) + if viewModel.state == 
.STREAM_PLAYING {
+                    GeometryReader { geometry in
+                        ScrollView {
+                            Text(viewModel.textStream)
+                                .transparentScrolling()
+                                .padding()
+                                .frame(minWidth: 0,
+                                       maxWidth: .infinity,
+                                       minHeight: geometry.size.height,
+                                       maxHeight: .infinity,
+                                       alignment: .topLeading)
+                                .font(.title3)
+                                .background(lightGray)
+                                .foregroundColor(Color.black)
+                        }
+                    }
+                } else {
+                    GeometryReader { geometry in
+                        VStack {
+                            ScrollView {
+                                ZStack(alignment: .topLeading) {
+                                    TextEditor(text: $text)
+                                        .transparentScrolling()
+                                        .padding()
+                                        .frame(minWidth: 0,
+                                               maxWidth: .infinity,
+                                               minHeight: geometry.size.height,
+                                               maxHeight: .infinity)
                                         .font(.title3)
-                                        .foregroundColor(Color.gray)
+                                        .background(lightGray)
+                                        .foregroundColor(Color.black)
+                                        .onChange(of: text) { newValue in
+                                            let updatedText = String(
+                                                newValue.prefix(Int(exactly: viewModel.maxCharacterLimit)!))
+                                            text = updatedText.replacingOccurrences(of: "’", with: "'")
+                                            viewModel.isValid(text: text)
+                                        }
+                                        .disabled(viewModel.state == .PLAYING)
+
+                                    if text.count == 0 {
+                                        Text("Enter any text to be synthesized")
+                                            .padding(25)
+                                            .font(.title3)
+                                            .foregroundColor(Color.gray)
+                                    }
                                 }
                             }
+                            Text("\(text.count) / \(viewModel.maxCharacterLimit)")
+                                .font(.footnote)
+                                .frame(maxWidth: .infinity, alignment: .trailing)
+                                .foregroundColor(Color.gray)
                         }
-
-                Text("\(text.count) / \(viewModel.maxCharacterLimit)")
-                    .font(.footnote)
-                    .frame(maxWidth: .infinity, alignment: .trailing)
-                    .foregroundColor(Color.gray)
                     }
                 }
-                if viewModel.state == .INIT || viewModel.state == .READY {
+                if streamingMode {
+                    if viewModel.state == .STREAM_OPEN && !viewModel.streamInvalidTextMessage.isEmpty {
+                        Text(viewModel.streamInvalidTextMessage)
+                            .padding()
+                            .font(.body)
+                            .foregroundColor(Color.gray)
+                    } else {
+                        Text(viewModel.streamHelperText)
+                            .padding()
+                            .font(.body)
+                            .foregroundColor(Color.black)
+                    }
+                } else if viewModel.state == .INIT || viewModel.state == .READY {
                     if viewModel.invalidTextMessage.isEmpty {
                         Text("Enter text and press synthesize")
                             .padding()
@@ -100,12 +148,12 @@
                             Text(viewModel.state == .PLAYING ? "Stop" : "Synthesize")
                                 .padding()
                                 .frame(minWidth: 200)
-                                .background(interactionDisabled ? Color.gray : activeBlue)
+                                .background(buttonDisabled ? Color.gray : activeBlue)
                                 .foregroundColor(Color.white)
                                 .font(.largeTitle)
                         }
                     )
-                    .disabled(interactionDisabled)
+                    .disabled(buttonDisabled)
                 }
                 .onReceive(
                     NotificationCenter.default.publisher(
@@ -127,7 +175,6 @@
             .onTapGesture {
                 hideKeyboard()
             }
-
         }
     }
 }
diff --git a/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift b/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift
index 4e0b4480..1b9ac382 100644
--- a/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift
+++ b/demo/ios/OrcaDemo/OrcaDemo/ViewModel.swift
@@ -17,24 +17,34 @@ enum UIState {
     case PROCESSING
     case SYNTHESIZED
     case PLAYING
+    case STREAM_OPEN
+    case STREAM_PLAYING
     case ERROR
 }
 
 class ViewModel: ObservableObject {
     private let ACCESS_KEY = "{YOUR_ACCESS_KEY_HERE}" // Obtained from Picovoice Console (https://console.picovoice.ai)
 
+    private let NUM_AUDIO_WAIT_CHUNKS = 1
+
     private var orca: Orca!
+    private var orcaStream: Orca.OrcaStream!
     private var player: AudioPlayer = AudioPlayer()
+    private var playerStream: AudioPlayerStream!
     private var previousText = ""
     private var subscriptions = Set<AnyCancellable>()
     private let audioFilePath = "temp.wav"
     private var audioFile: URL!
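+    // Streaming-mode state flow: READY <-> STREAM_OPEN via the
+    // "Streaming Synthesis" toggle, STREAM_OPEN -> STREAM_PLAYING while text
+    // is being synthesized and played, then back to STREAM_OPEN once the
+    // player drains.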
- @Published var errorMessage = "" @Published var state = UIState.INIT - @Published var maxCharacterLimit = Orca.maxCharacterLimit + @Published var sampleRate: Int32 = 0 + @Published var maxCharacterLimit: Int32 = 0 + @Published var textStream = "" + @Published var streamHelperText = "" + @Published var errorMessage = "" @Published var invalidTextMessage = "" + @Published var streamInvalidTextMessage = "" init() { initialize() @@ -44,6 +54,8 @@ class ViewModel: ObservableObject { state = UIState.INIT do { try orca = Orca(accessKey: ACCESS_KEY, modelPath: "orca_params_female.pv") + maxCharacterLimit = orca.maxCharacterLimit! + sampleRate = orca.sampleRate! state = UIState.READY let audioDir = try FileManager.default.url( @@ -73,7 +85,32 @@ class ViewModel: ObservableObject { orca.delete() } + public func toggleStreaming() { + if state == UIState.READY || state == UIState.STREAM_OPEN { + if orcaStream == nil { + do { + self.textStream = "" + self.streamHelperText = "Enter text and press synthesize" + orcaStream = try orca.streamOpen() + self.state = UIState.STREAM_OPEN + } catch { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + } else { + orcaStream.close() + orcaStream = nil + self.state = UIState.READY + } + } + } + public func toggleSynthesize(text: String) { + if state == UIState.STREAM_OPEN { + runStreamSynthesis(text: text) + return + } + if state == UIState.PLAYING { toggleSynthesizeOff() } else { @@ -81,6 +118,181 @@ class ViewModel: ObservableObject { } } + private func runStreamSynthesis(text: String) { + self.textStream = "" + self.state = UIState.STREAM_PLAYING + + do { + playerStream = try AudioPlayerStream(sampleRate: Double(self.sampleRate)) + } catch { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + + let textStreamQueue = DispatchQueue(label: "text-stream-queue") + let textStreamQueueConcurrent = DispatchQueue(label: "text-stream-queue-concurrent", attributes: .concurrent) + var textStreamArray = [String]() + let isTextStreamQueueActive = AtomicBool(false) + + func isTextStreamEmpty() -> Bool { + return textStreamQueueConcurrent.sync { + textStreamArray.isEmpty + } + } + + func getFromTextStream() -> String? { + var word: String? + textStreamQueueConcurrent.sync { + if !textStreamArray.isEmpty { + word = textStreamArray.removeFirst() + } + } + return word + } + + func addToTextStream(word: String) { + textStreamQueueConcurrent.async(flags: .barrier) { + textStreamArray.append(word) + } + } + + let pcmStreamQueue = DispatchQueue(label: "pcm-stream-queue") + let pcmStreamQueueConcurrent = DispatchQueue(label: "pcm-stream-queue-concurrent", attributes: .concurrent) + var pcmStreamArray = [[Int16]]() + let isPcmStreamQueueActive = AtomicBool(false) + + func isPcmStreamEmpty() -> Bool { + return pcmStreamQueueConcurrent.sync { + pcmStreamArray.isEmpty + } + } + + func getFromPcmStream() -> [Int16]? { + var pcm: [Int16]? 
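+            // Reads use .sync on the concurrent queue while writers use
+            // .async(flags: .barrier), so the backing array is never mutated
+            // in the middle of a read.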
+ pcmStreamQueueConcurrent.sync { + if !pcmStreamArray.isEmpty { + pcm = pcmStreamArray.removeFirst() + } + } + return pcm + } + + func addToPcmStream(pcm: [Int16]) { + pcmStreamQueueConcurrent.async(flags: .barrier) { + pcmStreamArray.append(pcm) + } + } + + let playStreamQueue = DispatchQueue(label: "play-stream-queue") + let pcmStreamQueueLatch = DispatchSemaphore(value: 0) + let playStreamQueueLatch = DispatchSemaphore(value: 0) + + func getSecsString(secs: Float) -> String { + return "Seconds of audio synthesized: " + String(format: "%.3f", secs) + "s" + } + + textStreamQueue.async { + isTextStreamQueueActive.set(true) + + var isPcmStreamQueueStarted = false + let words = text.split(separator: " ") + for word in words { + let wordWithSpace = String(word) + " " + addToTextStream(word: wordWithSpace) + if isPcmStreamQueueStarted == false { + pcmStreamQueueLatch.signal() + isPcmStreamQueueStarted = true + } + usleep(100 * 1000) + DispatchQueue.main.async { + self.textStream.append(wordWithSpace) + } + } + + isTextStreamQueueActive.set(false) + } + + pcmStreamQueue.async { + isPcmStreamQueueActive.set(true) + + var audioSynthesizedSecs: Float = 0 + var numIterations = 0 + var isPlayStreamQueueStarted = false + + pcmStreamQueueLatch.wait() + DispatchQueue.main.async { + self.streamHelperText = getSecsString(secs: audioSynthesizedSecs) + } + + while isTextStreamQueueActive.get() || !isTextStreamEmpty() { + if !isTextStreamEmpty() { + do { + let word = getFromTextStream() + if word != nil { + let pcm = try self.orcaStream.synthesize(text: word!) + if pcm != nil { + addToPcmStream(pcm: pcm!) + audioSynthesizedSecs += Float(pcm!.count) / Float(self.sampleRate) + DispatchQueue.main.async { + self.streamHelperText = getSecsString(secs: audioSynthesizedSecs) + } + if numIterations == self.NUM_AUDIO_WAIT_CHUNKS { + playStreamQueueLatch.signal() + isPlayStreamQueueStarted = true + } + numIterations += 1 + } + } + } catch { + DispatchQueue.main.async { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + } + } + } + + do { + let pcm = try self.orcaStream.flush() + if pcm != nil { + addToPcmStream(pcm: pcm!) + audioSynthesizedSecs += Float(pcm!.count) / Float(self.sampleRate) + DispatchQueue.main.async { + self.streamHelperText = getSecsString(secs: audioSynthesizedSecs) + } + if !isPlayStreamQueueStarted { + playStreamQueueLatch.signal() + } + } + } catch { + DispatchQueue.main.async { + self.errorMessage = "\(error.localizedDescription)" + self.state = UIState.ERROR + } + } + + isPcmStreamQueueActive.set(false) + } + + playStreamQueue.async { + playStreamQueueLatch.wait() + + while isPcmStreamQueueActive.get() || !isPcmStreamEmpty() { + if !isPcmStreamEmpty() { + let pcm = getFromPcmStream() + self.playerStream.playStreamPCM(pcm!) { isPlaying in + if !isPlaying { + DispatchQueue.main.async { + self.playerStream.stopStreamPCM() + self.state = UIState.STREAM_OPEN + } + } + } + } + } + } + } + public func toggleSynthesizeOff() { player.stop() state = UIState.READY @@ -127,27 +339,21 @@ class ViewModel: ObservableObject { } public func isValid(text: String) { - do { - let characters = try orca.validCharacters - let regex = try NSRegularExpression( - pattern: "[^\(characters.joined(separator: ""))\\s{}|']", - options: .caseInsensitive) - let range = NSRange(text.startIndex.. 
0 {
-                let characterString = unexpectedCharacters.array.map { "\($0)" }.joined(separator: ", ")
-                self.invalidTextMessage = "Text contains the following invalid characters: `\(characterString)`"
-            } else {
-                self.invalidTextMessage = ""
+        var nonAllowedCharacters = [Character]()
+        for i in 0.. 0 {
+            let characterString = nonAllowedCharacters.map { "\($0)" }.joined(separator: ", ")
+            self.invalidTextMessage = "Text contains the following invalid characters: `\(characterString)`"
+            self.streamInvalidTextMessage = "The following characters will be ignored: `\(characterString)`"
+        } else {
+            self.invalidTextMessage = ""
+            self.streamInvalidTextMessage = ""
+        }
+    }
 }
diff --git a/demo/ios/OrcaDemo/Podfile b/demo/ios/OrcaDemo/Podfile
index b888dd18..e6434527 100644
--- a/demo/ios/OrcaDemo/Podfile
+++ b/demo/ios/OrcaDemo/Podfile
@@ -2,5 +2,5 @@ source 'https://cdn.cocoapods.org/'
 platform :ios, '13.0'
 
 target 'OrcaDemo' do
-  pod 'Orca-iOS', '~> 0.1.0'
+  pod 'Orca-iOS', '~> 0.2.0'
 end
diff --git a/demo/ios/OrcaDemo/Podfile.lock b/demo/ios/OrcaDemo/Podfile.lock
index 9a4803dc..ddb3725e 100644
--- a/demo/ios/OrcaDemo/Podfile.lock
+++ b/demo/ios/OrcaDemo/Podfile.lock
@@ -1,16 +1,16 @@
 PODS:
-  - Orca-iOS (0.1.0)
+  - Orca-iOS (0.2.0)
 
 DEPENDENCIES:
-  - Orca-iOS (~> 0.1.0)
+  - Orca-iOS (~> 0.2.0)
 
 SPEC REPOS:
   trunk:
     - Orca-iOS
 
 SPEC CHECKSUMS:
-  Orca-iOS: 808b4c77678454905ea0a0c1408eff8f9255e3ac
+  Orca-iOS: 01bbf44ba52a102104fc09aded6bfda7beb4865e
 
-PODFILE CHECKSUM: b2d1bae4a022122055b4d7532f81ce24a11ade44
+PODFILE CHECKSUM: 7655658323a426ab9a7ed6f7546e15081a877170
 
-COCOAPODS: 1.11.3
+COCOAPODS: 1.15.2
diff --git a/demo/llm_voice_assistant/.gitignore b/demo/llm_voice_assistant/.gitignore
new file mode 100644
index 00000000..ba0430d2
--- /dev/null
+++ b/demo/llm_voice_assistant/.gitignore
@@ -0,0 +1 @@
+__pycache__/
\ No newline at end of file
diff --git a/demo/llm_voice_assistant/README.md b/demo/llm_voice_assistant/README.md
new file mode 100644
index 00000000..ecfdd9d5
--- /dev/null
+++ b/demo/llm_voice_assistant/README.md
@@ -0,0 +1,45 @@
+# LLM Voice Assistant Demo - Talk to ChatGPT in Real-Time
+
+Made in Vancouver, Canada by [Picovoice](https://picovoice.ai)
+
+This demo showcases how [Orca Streaming Text-to-Speech](https://picovoice.ai/platform/orca/) can be seamlessly
+integrated into LLM applications to drastically reduce the audio latency of voice assistants.
+
+## Technologies
+
+In this demo, the user can interact with a voice assistant in real-time by leveraging GenAI technologies.
+It is built like the majority of voice assistants today, by chaining together a Speech-to-Text engine, an LLM, and
+a Text-to-Speech engine.
+
+The following technologies are used:
+
+- Speech-to-Text: Picovoice's [Cheetah Streaming Speech-to-Text](https://picovoice.ai/platform/cheetah/)
+- LLM: "ChatGPT" using `gpt-3.5-turbo`
+  with the OpenAI Chat Completion API.
+- TTS:
+  - Picovoice's [Orca Streaming Text-to-Speech](https://picovoice.ai/platform/orca/)
+  - OpenAI TTS
+
+## Compatibility
+
+This demo has been tested on Linux (x86_64) and macOS (x86_64) using Python 3.10.
+
+## Access Keys
+
+To run all features of this demo, access keys are required for:
+
+- Picovoice Console: Get your `AccessKey` for free by signing up or logging in
+  to [Picovoice Console](https://console.picovoice.ai/).
+- OpenAI API: Get your `AccessKey` from OpenAI.
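+
+For convenience, you can keep both keys in shell variables before launching the demo (the
+variable names below are just examples; they match the placeholders used in the usage
+command that follows):
+
+```bash
+export PV_ACCESS_KEY="..."  # AccessKey from Picovoice Console
+export OPEN_AI_KEY="..."    # API key from OpenAI
+```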
+ +## Usage + +```bash +python llm_voice_assistant_demo.py --picovoice-access-key ${PV_ACCESS_KEY} --openai-access-key ${OPEN_AI_KEY} +``` + +Replace `${PV_ACCESS_KEY}` with your `AccessKey` obtained from Picovoice Console, +`${OPEN_AI_KEY}` with your `AccessKey` obtained from OpenAI. +You can toggle between Orca and OpenAI TTS by using the `--tts` flag, using `picovoice_orca` or `openai`, respectively. +If you don't want to use ChatGPT, set the `--llm` flag to `dummy`. +This will simulate an LLM response using example sentences that are synthesized by the TTS system. diff --git a/demo/llm_voice_assistant/llm_voice_assistant_demo.py b/demo/llm_voice_assistant/llm_voice_assistant_demo.py new file mode 100644 index 00000000..c48b8c3e --- /dev/null +++ b/demo/llm_voice_assistant/llm_voice_assistant_demo.py @@ -0,0 +1,265 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import argparse +import time +from typing import Dict + +from pvrecorder import PvRecorder + +from src import ( + LLM, + LLMs, + Synthesizer, + Synthesizers, + TimingPrinter, + Timer, + UserInput, + UserInputs, + StreamingAudioDevice, + Transcribers, +) + +MAX_WAIT_TIME_FIRST_AUDIO = 10 + + +def get_user_input_init_kwargs(args: argparse.Namespace) -> Dict[str, str]: + kwargs = dict() + + user_input_type = UserInputs(args.user_input) + if user_input_type is UserInputs.VOICE: + kwargs["audio_device_index"] = args.input_audio_device_index + + kwargs["transcriber"] = Transcribers.PICOVOICE_CHEETAH + kwargs["transcriber_params"] = dict() + if args.picovoice_access_key is None: + raise ValueError("Picovoice access key is required when using voice user input") + kwargs["transcriber_params"]["access_key"] = args.picovoice_access_key + if args.speech_endpoint_duration_sec is not None: + kwargs["transcriber_params"]["endpoint_duration_sec"] = args.speech_endpoint_duration_sec + + elif user_input_type is UserInputs.TEXT: + kwargs["llm_type"] = LLMs(args.llm) + + return kwargs + + +def get_llm_init_kwargs(args: argparse.Namespace) -> Dict[str, str]: + kwargs = dict() + llm_type = LLMs(args.llm) + + if llm_type is LLMs.OPENAI: + if args.openai_access_key is None: + raise ValueError( + f"An OpenAI access key is required when using OpenAI models. 
Specify with `--openai-access-key`.") + if args.tokens_per_second is not None: + raise ValueError(f"Tokens per second is not supported for `{llm_type}`") + + kwargs["access_key"] = args.openai_access_key + if args.system_message is not None: + kwargs["system_message"] = args.system_message + + elif llm_type is LLMs.DUMMY: + if args.tokens_per_second is not None: + kwargs["tokens_per_second"] = args.tokens_per_second + + return kwargs + + +def get_synthesizer_init_kwargs(args: argparse.Namespace) -> Dict[str, str]: + kwargs = dict() + synthesizer_type = Synthesizers(args.synthesizer) + + if synthesizer_type is Synthesizers.PICOVOICE_ORCA: + if args.picovoice_access_key is None: + raise ValueError("Picovoice access key is required when using Picovoice TTS") + kwargs["access_key"] = args.picovoice_access_key + kwargs["model_path"] = args.orca_model_path + kwargs["library_path"] = args.orca_library_path + + elif synthesizer_type is Synthesizers.OPENAI: + if args.openai_access_key is None: + raise ValueError( + f"An OpenAI access key is required when using OpenAI models. Specify with `--openai-access-key`.") + kwargs["access_key"] = args.openai_access_key + + return kwargs + + +def main(args: argparse.Namespace) -> None: + max_num_interactions = args.num_interactions + + user_input_init_kwargs = get_user_input_init_kwargs(args) + user_input = UserInput.create(UserInputs(args.user_input), **user_input_init_kwargs) + + audio_output = StreamingAudioDevice.from_default_device() + + timer = Timer() + + synthesizer_init_kwargs = get_synthesizer_init_kwargs(args) + synthesizer = Synthesizer.create( + Synthesizers(args.synthesizer), + play_audio_callback=audio_output.play, + timer=timer, + **synthesizer_init_kwargs) + + llm_init_kwargs = get_llm_init_kwargs(args) + llm = LLM.create(LLMs(args.llm), **llm_init_kwargs) + + timing_printer = TimingPrinter(llm_string=f"{llm}", synthesizer_string=f"{synthesizer}") + + try: + num_interactions_counter = 0 + while True: + timer.reset() + + audio_output.start(sample_rate=synthesizer.sample_rate) + + text = user_input.get_user_input() + + timer.log_time_llm_request() + text_generator = llm.chat(user_input=text) + + llm_message = "" + printed_stats = False + for token in text_generator: + if token is None: + continue + + if timer.is_first_token: + timer.log_time_first_llm_token() + + llm_message += token + + if synthesizer.text_streamable: + synthesizer.synthesize(token) + + if not timer.before_first_audio and not printed_stats: + timing_printer.print_timing_stats( + num_seconds_first_llm_token=timer.num_seconds_to_first_token(), + num_seconds_first_audio=timer.num_seconds_to_first_audio(), + ) + printed_stats = True + print(f"Answering with {synthesizer} ...") + + timer.increment_num_tokens() + + timer.log_time_last_llm_token() + + if synthesizer.text_streamable: + synthesizer.flush() + else: + synthesizer.synthesize(llm_message) + + wait_start_time = time.time() + while timer.before_first_audio: + if time.time() - wait_start_time > MAX_WAIT_TIME_FIRST_AUDIO: + print( + f"Waited for {MAX_WAIT_TIME_FIRST_AUDIO}s for first audio but did not receive any. 
Exiting") + break + + if not printed_stats: + timing_printer.print_timing_stats( + num_seconds_first_llm_token=timer.num_seconds_to_first_token(), + num_seconds_first_audio=timer.num_seconds_to_first_audio()) + print(f"Answering with {synthesizer} ...") + + audio_output.flush_and_terminate() + + num_interactions_counter += 1 + + if 0 < max_num_interactions == num_interactions_counter: + print("\nDemo complete!") + break + + print() + + except KeyboardInterrupt: + pass + + synthesizer.terminate() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Text-to-speech streaming synthesis") + + parser.add_argument( + "--user-input", + default=UserInputs.VOICE.value, + choices=[u.value for u in UserInputs], + help="Choose type of input type") + parser.add_argument( + "--input-audio-device-index", + type=int, + default=-1, + help="Index of input audio device") + parser.add_argument( + "--speech-endpoint-duration-sec", + type=float, + default=None, + help="Duration in seconds for speechless audio to be considered an endpoint") + parser.add_argument( + "--show-audio-devices", + action="store_true", + help="Only list available devices and exit") + + parser.add_argument( + "--llm", + default=LLMs.OPENAI.value, + choices=[llm.value for llm in LLMs], + help="Choose LLM to use") + parser.add_argument( + "--openai-access-key", + default=None, + help="Open AI access key. Needed when using openai models") + parser.add_argument( + "--system-message", + default=None, + help="The system message to use to prompt the LLM response") + parser.add_argument( + "--tokens-per-second", + default=None, + type=int, + help="Imitated tokens per second to use for Dummy LLM") + + parser.add_argument( + "--tts", + dest="synthesizer", + default=Synthesizers.PICOVOICE_ORCA.value, + choices=[s.value for s in Synthesizers], + help="Choose voice synthesizer to use") + parser.add_argument( + "--picovoice-access-key", + default=None, + help="AccessKey obtained from Picovoice Console") + parser.add_argument( + "--orca-model-path", + default=None, + help="Path to the model parameters file") + parser.add_argument( + "--orca-library-path", + default=None, + help="Path to Orca's dynamic library") + + parser.add_argument( + "--num-interactions", + type=int, + default=-1, + help="Number of interactions with LLM run before completing the demo. Default is -1 (run indefinitely)") + + arg = parser.parse_args() + + if arg.show_audio_devices: + for index, name in enumerate(PvRecorder.get_available_devices()): + print('Device #%d: %s' % (index, name)) + exit(0) + + main(arg) diff --git a/demo/llm_voice_assistant/requirements.txt b/demo/llm_voice_assistant/requirements.txt new file mode 100644 index 00000000..34ec74df --- /dev/null +++ b/demo/llm_voice_assistant/requirements.txt @@ -0,0 +1,16 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +openai==1.17.0 +pvcheetah==2.0.1 +pvrecorder==1.2.2 +sounddevice==0.4.6 +tiktoken==0.6.0 diff --git a/demo/llm_voice_assistant/src/__init__.py b/demo/llm_voice_assistant/src/__init__.py new file mode 100644 index 00000000..d618d01b --- /dev/null +++ b/demo/llm_voice_assistant/src/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +from .audio_device import * +from .llm import * +from .synthesizer import * +from .transcriber import * +from .user_input import * +from .util import * diff --git a/demo/llm_voice_assistant/src/audio_device.py b/demo/llm_voice_assistant/src/audio_device.py new file mode 100644 index 00000000..2036776e --- /dev/null +++ b/demo/llm_voice_assistant/src/audio_device.py @@ -0,0 +1,107 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import time +from queue import Queue +from typing import ( + Any, + Optional, + Sequence, + Union, +) + +import numpy as np +from numpy.typing import NDArray +from sounddevice import OutputStream, query_devices + + +class StreamingAudioDevice: + def __init__(self, device_index: int) -> None: + self._device_index = device_index + self._queue: Queue[NDArray] = Queue() + + self._buffer = None + self._stream = None + self._sample_rate = None + self._blocksize = None + + def start(self, sample_rate: int) -> None: + self._sample_rate = sample_rate + self._blocksize = self._sample_rate // 20 + self._stream = OutputStream( + channels=1, + samplerate=self._sample_rate, + dtype=np.int16, + device=self._device_index, + callback=self._callback, + blocksize=self._blocksize) + self._stream.start() + + # noinspection PyShadowingNames + # noinspection PyUnusedLocal + def _callback(self, outdata: NDArray, frames: int, time: Any, status: Any) -> None: + if self._queue.empty(): + outdata[:] = 0 + return + data = self._queue.get() + outdata[:, 0] = data + + def play(self, pcm_chunk: Optional[Union[Sequence[int], NDArray]] = None) -> None: + if self._stream is None: + raise ValueError("Stream is not started. 
Call `start` method first.")
+
+        if pcm_chunk is not None and isinstance(pcm_chunk, list):
+            pcm_chunk = np.array(pcm_chunk, dtype=np.int16)
+
+        if self._buffer is not None:
+            pcm_chunk = self._buffer if pcm_chunk is None else np.concatenate([self._buffer, pcm_chunk])
+            self._buffer = None
+
+        if pcm_chunk is None:
+            return
+
+        length = pcm_chunk.shape[0]
+        for index_block in range(0, length, self._blocksize):
+            if (length - index_block) < self._blocksize:
+                self._buffer = pcm_chunk[index_block: index_block + (length - index_block)]
+            else:
+                self._queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize])
+
+    def flush_and_terminate(self) -> None:
+        self.flush()
+        self.terminate()
+
+    def flush(self) -> None:
+        if self._buffer is not None:
+            chunk = np.zeros(self._blocksize, dtype=np.int16)
+            chunk[:self._buffer.shape[0]] = self._buffer
+            self._queue.put_nowait(chunk)
+
+        time_interval = self._blocksize / self._sample_rate
+        while not self._queue.empty():
+            time.sleep(time_interval)
+
+        time.sleep(time_interval)
+
+    def terminate(self) -> None:
+        self._stream.stop()
+        self._stream.close()
+
+    @classmethod
+    def from_default_device(cls) -> 'StreamingAudioDevice':
+        device_info = query_devices(kind="output")
+        device_index = int(device_info["index"])
+        return cls(device_index=device_index)
+
+
+__all__ = [
+    "StreamingAudioDevice",
+]
diff --git a/demo/llm_voice_assistant/src/llm.py b/demo/llm_voice_assistant/src/llm.py
new file mode 100644
index 00000000..542c9b09
--- /dev/null
+++ b/demo/llm_voice_assistant/src/llm.py
@@ -0,0 +1,144 @@
+#
+# Copyright 2024 Picovoice Inc.
+#
+# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE"
+# file accompanying this source.
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+
+import json
+import os
+import random
+import time
+from enum import Enum
+from typing import (
+    Any,
+    Generator,
+    Sequence,
+)
+
+import tiktoken
+
+
+class LLMs(Enum):
+    DUMMY = "dummy"
+    OPENAI = "openai"
+
+
+class LLM:
+    SYSTEM_MESSAGE = """
+    You are a friendly voice assistant in customer service of an e-commerce platform.
+    Use natural, conversational language that is clear and easy to follow (short sentences, simple words).
+    Only use English letters and punctuation, no special characters.
+    Keep the conversation flowing naturally.
+ """ + + def __init__(self, system_message: str = SYSTEM_MESSAGE) -> None: + self._system_message = system_message + + def _chat(self, user_input: str) -> Generator[str, None, None]: + raise NotImplementedError( + f"Method `chat_stream` must be implemented in a subclass of {self.__class__.__name__}") + + def chat(self, user_input: str) -> Generator[str, None, None]: + for token in self._chat(user_input=user_input): + yield token + + @classmethod + def create(cls, llm_type: LLMs, **kwargs) -> 'LLM': + classes = { + LLMs.DUMMY: DummyLLM, + LLMs.OPENAI: OpenAILLM, + } + + if llm_type not in classes: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{llm_type.value}`") + + return classes[llm_type](**kwargs) + + def __str__(self) -> str: + raise NotImplementedError() + + +class OpenAILLM(LLM): + MODEL_NAME = "gpt-3.5-turbo" + RANDOM_SEED = 7777 + + def __init__( + self, + access_key: str, + model_name: str = MODEL_NAME, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + + from openai import OpenAI + self._model_name = model_name + self._client = OpenAI(api_key=access_key) + + self._history = [{"role": "system", "content": self._system_message}] + + def _append_user_message(self, message: str) -> None: + self._history.append({"role": "user", "content": message}) + + def _append_assistant_message(self, message: str) -> None: + self._history.append({"role": "assistant", "content": message}) + + def _chat(self, user_input: str) -> Generator[str, None, None]: + self._append_user_message(user_input) + stream = self._client.chat.completions.create( + model=self._model_name, + messages=self._history, + seed=self.RANDOM_SEED, + temperature=0, + top_p=0.05, + stream=True) + assistant_message = "" + for chunk in stream: + token = chunk.choices[0].delta.content + yield token + if token is not None: + assistant_message += token + self._append_assistant_message(assistant_message) + + def __str__(self) -> str: + return f"ChatGPT ({self._model_name})" + + +class DummyLLM(LLM): + TOKENS_PER_SECOND = 25 + + def __init__(self, tokens_per_second: int = TOKENS_PER_SECOND) -> None: + super().__init__(system_message="") + + self._encoder = tiktoken.encoding_for_model("gpt-4") + self._tokens_delay = 1 / tokens_per_second + + data_file_path = os.path.join(os.path.dirname(__file__), "../../../resources/demo/demo_data.json") + with open(data_file_path, encoding="utf8") as data_file: + self._sentences = json.loads(data_file.read())["demo_sentences"] + + random.seed(7777) + + def _tokenize(self, text: str) -> Sequence[str]: + tokens = [self._encoder.decode([i]) for i in self._encoder.encode(text)] + return tokens + + def _chat(self, user_input: str) -> Generator[str, None, None]: + sentence = self._sentences[random.randint(0, len(self._sentences) - 1)] + + for i in self._tokenize(text=sentence): + time.sleep(self._tokens_delay) + yield i + + def __str__(self) -> str: + return "Dummy LLM" + + +__all__ = [ + "LLMs", + "LLM", +] diff --git a/demo/llm_voice_assistant/src/synthesizer.py b/demo/llm_voice_assistant/src/synthesizer.py new file mode 100644 index 00000000..f48b4345 --- /dev/null +++ b/demo/llm_voice_assistant/src/synthesizer.py @@ -0,0 +1,241 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. 
+# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import threading +import time +from dataclasses import dataclass +from enum import Enum +from io import BytesIO +from queue import Queue +from typing import ( + Any, + Callable, + Literal, + Optional, + Sequence, + Union, +) + +import numpy as np +import pvorca +from numpy.typing import NDArray +from openai import OpenAI +from pvorca import OrcaActivationLimitError + +from .util import Timer + + +class Synthesizers(Enum): + OPENAI = "openai" + PICOVOICE_ORCA = "picovoice_orca" + + +class Synthesizer: + def __init__( + self, + sample_rate: int, + play_audio_callback: Callable[[Union[Sequence[int], NDArray]], None], + timer: Timer, + text_streamable: bool = False, + ) -> None: + self.sample_rate = sample_rate + self.text_streamable = text_streamable + + self._play_audio_callback = play_audio_callback + self._timer = timer + + def synthesize(self, text: str) -> None: + raise NotImplementedError( + f"Method `synthesize` must be implemented in a subclass of {self.__class__.__name__}") + + @property + def info(self) -> str: + raise NotImplementedError( + f"Method `info` must be implemented in a subclass of {self.__class__.__name__}") + + def flush(self) -> None: + pass + + def terminate(self) -> None: + pass + + @classmethod + def create(cls, engine: Synthesizers, **kwargs: Any) -> 'Synthesizer': + subclasses = { + Synthesizers.PICOVOICE_ORCA: PicovoiceOrcaSynthesizer, + Synthesizers.OPENAI: OpenAISynthesizer, + } + + if engine not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{engine.value}`") + + return subclasses[engine](**kwargs) + + def __str__(self) -> str: + raise NotImplementedError() + + +class OpenAISynthesizer(Synthesizer): + SAMPLE_RATE = 24000 + NAME = "OpenAI TTS" + + DEFAULT_MODEL_NAME = "tts-1" + DEFAULT_VOICE_NAME = "shimmer" + + def __init__( + self, + access_key: str, + model_name: str = DEFAULT_MODEL_NAME, + voice_name: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"] = DEFAULT_VOICE_NAME, + **kwargs: Any + ) -> None: + super().__init__(sample_rate=self.SAMPLE_RATE, **kwargs) + + self._model_name = model_name + self._voice_name = voice_name + self._client = OpenAI(api_key=access_key) + + @staticmethod + def _decode(b: bytes) -> NDArray: + pcm = np.frombuffer(BytesIO(b).read(), dtype=np.int16) + return pcm + + def synthesize(self, text: str) -> None: + self._timer.maybe_log_time_first_synthesis_request() + + response = self._client.audio.speech.create( + model=self._model_name, + voice=self._voice_name, + response_format="pcm", + input=text) + + for chunk in response.iter_bytes(chunk_size=1024): + self._timer.maybe_log_time_first_audio() + + pcm = self._decode(chunk) + self._play_audio_callback(pcm) + + @property + def info(self) -> str: + return f"{self.NAME} (model: {self.DEFAULT_MODEL_NAME}, voice: {self.DEFAULT_VOICE_NAME})" + + def __str__(self) -> str: + return f"{self.NAME}" + + +class PicovoiceOrcaSynthesizer(Synthesizer): + NUM_TOKENS_PER_PCM_CHUNK = 4 + + @dataclass + class OrcaTextInput: + text: str + flush: bool + + def __init__( + self, + play_audio_callback: Callable[[Union[Sequence[int], NDArray]], None], + timer: Timer, + access_key: str, + model_path: Optional[str] = None, + library_path: 
Optional[str] = None, + ) -> None: + self._orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path) + super().__init__( + sample_rate=self._orca.sample_rate, + play_audio_callback=play_audio_callback, + timer=timer, + text_streamable=True) + + self._orca_stream = self._orca.stream_open() + + self._queue: Queue[Optional[PicovoiceOrcaSynthesizer.OrcaTextInput]] = Queue() + + self._num_tokens = 0 + + self._thread = None + self._start_thread() + + def _start_thread(self) -> None: + self._thread = threading.Thread(target=self._run) + self._thread.start() + + def _close_thread_blocking(self): + self._queue.put_nowait(None) + self._thread.join() + + def _reset_state(self) -> None: + self._num_tokens = 0 + + def _compute_first_audio_delay(self, pcm: Sequence[int], processing_time: float) -> float: + seconds_audio = len(pcm) / self.sample_rate + tokens_per_sec = self._num_tokens / (time.time() - self._timer.time_first_synthesis_request) + llm_delay_seconds = (self.NUM_TOKENS_PER_PCM_CHUNK / (tokens_per_sec + 1e-4)) + orca_delay_seconds = 3 * processing_time + delay_seconds = max(llm_delay_seconds + orca_delay_seconds - seconds_audio, 0) + return delay_seconds + + def _run(self) -> None: + while True: + orca_input = self._queue.get() + if orca_input is None: + break + + self._timer.maybe_log_time_first_synthesis_request() + + self._num_tokens += 1 + + start = time.time() + try: + if not orca_input.flush: + pcm = self._orca_stream.synthesize(orca_input.text) + else: + pcm = self._orca_stream.flush() + except OrcaActivationLimitError: + raise ValueError("Orca activation limit reached.") + processing_time = time.time() - start + + if pcm is not None: + if self._timer.before_first_audio: + self._timer.maybe_log_time_first_audio() + + initial_audio_delay = self._compute_first_audio_delay(pcm=pcm, processing_time=processing_time) + self._timer.set_initial_audio_delay(initial_audio_delay) + + time.sleep(initial_audio_delay) + + self._play_audio_callback(pcm) + + def synthesize(self, text: str) -> None: + self._queue.put_nowait(self.OrcaTextInput(text=text, flush=False)) + + def flush(self) -> None: + self._queue.put_nowait(self.OrcaTextInput(text="", flush=True)) + self._close_thread_blocking() + self._reset_state() + self._start_thread() + + def terminate(self): + self._close_thread_blocking() + self._orca_stream.close() + self._orca.delete() + + @property + def info(self) -> str: + return f"Picovoice Orca v{self._orca.version}" + + def __str__(self) -> str: + return "Picovoice Orca" + + +__all__ = [ + "Synthesizers", + "Synthesizer", +] diff --git a/demo/llm_voice_assistant/src/transcriber.py b/demo/llm_voice_assistant/src/transcriber.py new file mode 100644 index 00000000..f03ae432 --- /dev/null +++ b/demo/llm_voice_assistant/src/transcriber.py @@ -0,0 +1,87 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +from enum import Enum +from typing import ( + Any, + Optional, + Sequence, + Tuple, +) + +from pvcheetah import CheetahActivationLimitError, create + + +class Transcribers(Enum): + PICOVOICE_CHEETAH = "picovoice_cheetah" + + +class Transcriber: + def process(self, pcm_frame: Sequence[int]) -> Tuple[str, bool]: + raise NotImplementedError() + + def flush(self) -> str: + raise NotImplementedError() + + @property + def frame_length(self) -> int: + raise NotImplementedError() + + @classmethod + def create(cls, x: Transcribers, **kwargs: Any) -> 'Transcriber': + subclasses = { + Transcribers.PICOVOICE_CHEETAH: PicovoiceCheetahTranscriber, + } + + if x not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{x.value}`") + + return subclasses[x](**kwargs) + + +class PicovoiceCheetahTranscriber(Transcriber): + def __init__( + self, + access_key: str, + library_path: Optional[str] = None, + model_path: Optional[str] = None, + endpoint_duration_sec: float = 1.0, + enable_automatic_punctuation: bool = True + ) -> None: + self._cheetah = create( + access_key=access_key, + library_path=library_path, + model_path=model_path, + endpoint_duration_sec=endpoint_duration_sec, + enable_automatic_punctuation=enable_automatic_punctuation) + + def process(self, pcm_frame: Sequence[int]) -> Tuple[str, bool]: + try: + partial_transcript, is_endpoint = self._cheetah.process(pcm_frame) + except CheetahActivationLimitError: + raise ValueError("Cheetah activation limit reached.") + return partial_transcript, is_endpoint + + def flush(self) -> str: + try: + return self._cheetah.flush() + except CheetahActivationLimitError: + raise ValueError("Cheetah activation limit reached.") + + @property + def frame_length(self) -> int: + return self._cheetah.frame_length + + +__all__ = [ + "Transcriber", + "Transcribers", +] diff --git a/demo/llm_voice_assistant/src/user_input.py b/demo/llm_voice_assistant/src/user_input.py new file mode 100644 index 00000000..791943e8 --- /dev/null +++ b/demo/llm_voice_assistant/src/user_input.py @@ -0,0 +1,94 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +from enum import Enum +from typing import ( + Any, + Dict, + Optional, +) + +from pvrecorder import PvRecorder + +from .llm import LLMs +from .transcriber import Transcriber, Transcribers + + +class UserInputs(Enum): + VOICE = "voice" + TEXT = "text" + + +class UserInput: + def get_user_input(self) -> str: + raise NotImplementedError() + + @classmethod + def create(cls, x: UserInputs, **kwargs: Any) -> 'UserInput': + subclasses = { + UserInputs.VOICE: VoiceUserInput, + UserInputs.TEXT: TextUserInput, + } + + if x not in subclasses: + raise NotImplementedError(f"Cannot create {cls.__name__} of type `{x.value}`") + + return subclasses[x](**kwargs) + + +class VoiceUserInput(UserInput): + def __init__( + self, + audio_device_index: int, + transcriber: Transcribers, + transcriber_params: Dict[str, Any], + ) -> None: + self._transcriber = Transcriber.create(transcriber, **transcriber_params) + self._recorder = PvRecorder(frame_length=self._transcriber.frame_length, device_index=audio_device_index) + + def get_user_input(self) -> str: + print("Listening ...") + if not self._recorder.is_recording: + self._recorder.start() + + transcript = "" + try: + while True: + partial_transcript, is_endpoint = self._transcriber.process(self._recorder.read()) + transcript += partial_transcript + if is_endpoint: + final_transcript = self._transcriber.flush() + transcript += final_transcript + self._recorder.stop() + return transcript + except Exception as e: + self._recorder.stop() + raise e + + +class TextUserInput(UserInput): + USER_PROMPT = "Your question: " + USER_PROMPT_DUMMY_LLM = "Press ENTER to generate a demo LLM response " + + def __init__(self, llm_type: LLMs, prompt: Optional[str] = None) -> None: + if prompt is not None: + self._prompt = prompt + else: + self._prompt = self.USER_PROMPT_DUMMY_LLM if llm_type is LLMs.DUMMY else self.USER_PROMPT + + def get_user_input(self) -> str: + return input(self._prompt) + + +__all__ = [ + "UserInput", + "UserInputs", +] diff --git a/demo/llm_voice_assistant/src/util.py b/demo/llm_voice_assistant/src/util.py new file mode 100644 index 00000000..79fb7966 --- /dev/null +++ b/demo/llm_voice_assistant/src/util.py @@ -0,0 +1,169 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+# + +import time +from dataclasses import dataclass +from typing import Tuple + + +@dataclass +class Colors: + GREEN = "\033[92m" + RESET = "\033[0m" + BOLD = "\033[1m" + + +@dataclass +class Timer: + time_llm_request: float = -1.0 + time_first_llm_token: float = -1.0 + time_last_llm_token: float = -1.0 + time_first_synthesis_request: float = -1.0 + time_first_audio: float = -1.0 + initial_audio_delay: float = 0.0 + + before_first_audio: bool = True + _is_first_synthesis_request: bool = True + _num_tokens: int = 0 + + @staticmethod + def _get_time() -> float: + return time.time() + + def log_time_llm_request(self) -> None: + self.time_llm_request = self._get_time() + + def log_time_first_llm_token(self) -> None: + self.time_first_llm_token = self._get_time() + + def log_time_last_llm_token(self) -> None: + self.time_last_llm_token = self._get_time() + + def maybe_log_time_first_synthesis_request(self) -> None: + if self._is_first_synthesis_request: + self.time_first_synthesis_request = self._get_time() + self._is_first_synthesis_request = False + + def maybe_log_time_first_audio(self) -> None: + if self.before_first_audio: + self.time_first_audio = self._get_time() + self.before_first_audio = False + + def increment_num_tokens(self) -> None: + self._num_tokens += 1 + + @property + def is_first_token(self) -> bool: + return self._num_tokens == 0 + + def set_initial_audio_delay(self, delay: float) -> None: + self.initial_audio_delay = delay + + def num_seconds_to_first_audio(self) -> float: + return self.time_first_audio - self.time_first_llm_token + + def num_seconds_to_first_token(self) -> float: + return self.time_first_llm_token - self.time_llm_request + + def reset(self) -> None: + self.time_llm_request = -1.0 + self.time_first_llm_token = -1.0 + self.time_last_llm_token = -1.0 + self.time_first_synthesis_request = -1.0 + self.time_first_audio = -1.0 + self.initial_audio_delay = 0.0 + + self._is_first_synthesis_request = True + self.before_first_audio = True + + self._num_tokens = 0 + + +class TimingPrinter: + TIMER_MESSAGE = "Time to wait for" + + TIMER_BAR_MAX_RED_SECONDS = 2.0 + TIMER_BAR_SYMBOLS_PER_SECONDS = 40 + TIMER_BAR_SYMBOL = ">" + + MAX_GREEN_VALUE = 0.6 + MAX_RED_VALUE = 0.75 + + def __init__( + self, + llm_string: str, + synthesizer_string: str, + timer_bar_max_red_seconds: float = TIMER_BAR_MAX_RED_SECONDS, + timer_bar_symbols_per_second: float = TIMER_BAR_SYMBOLS_PER_SECONDS, + timer_bar_symbol: str = TIMER_BAR_SYMBOL, + ) -> None: + max_length = len(llm_string) if len(llm_string) > len(synthesizer_string) else len(synthesizer_string) + llm_info_string = llm_string.ljust(max_length) + synthesizer_info_string = synthesizer_string.ljust(max_length) + + self._timer_message_llm = f"{self.TIMER_MESSAGE} {llm_info_string} : " + self._timer_message_tts = f"{self.TIMER_MESSAGE} {synthesizer_info_string} : " + + self._progress_bar_color_max = timer_bar_max_red_seconds * timer_bar_symbols_per_second + self._progress_bar_symbols_per_second = timer_bar_symbols_per_second + self._progress_bar_symbol = timer_bar_symbol + + @staticmethod + def _colored_string(text: str, red: float, green: float, blue: float, bold: bool = False) -> str: + s = Colors.BOLD if bold else "" + s = f"{s}\033[38;2;{int(red * 255)};{int(green * 255)};{int(blue * 255)}m{text}{Colors.RESET}" + return s + + def _print_colored_progress_bar(self, num_seconds: float, bold: bool = False) -> Tuple[float, float, float]: + red = 0 + green = self.MAX_GREEN_VALUE + blue = 0 + + half_max_length = 
self._progress_bar_color_max // 2 + + length = int(num_seconds * self._progress_bar_symbols_per_second) + for i in range(length): + if i < half_max_length: + red = min(i / (half_max_length - 1), self.MAX_RED_VALUE) + else: + green = max(0.5 - (i - half_max_length) / (half_max_length - 1), 0) + + print(f"{self._colored_string(self._progress_bar_symbol, red, green, blue, bold=bold)}", end="") + + return red, green, blue + + def _print_timer_bar_llm(self, num_seconds_first_llm_token: float) -> None: + print(self._colored_string(self._timer_message_llm, 0, self.MAX_GREEN_VALUE, 0), end="") + + red, green, blue = self._print_colored_progress_bar(num_seconds_first_llm_token) + + num_seconds_string = f"{round(num_seconds_first_llm_token, 1):.1f}s" + print(f" {self._colored_string(num_seconds_string, red, green, blue)}", flush=True) + + def _print_timer_bar_tts(self, num_seconds_first_audio: float) -> None: + print(self._colored_string(self._timer_message_tts, 0, self.MAX_GREEN_VALUE, 0, bold=True), end="") + + red, green, blue = self._print_colored_progress_bar(num_seconds_first_audio, bold=True) + + num_seconds_string = f"{round(num_seconds_first_audio, 1):.1f}s" + print(f" {self._colored_string(num_seconds_string, red, green, blue, bold=True)}", flush=True) + + def print_timing_stats(self, num_seconds_first_llm_token: float, num_seconds_first_audio: float) -> None: + print() + self._print_timer_bar_llm(num_seconds_first_llm_token) + self._print_timer_bar_tts(num_seconds_first_audio) + + +__all__ = [ + "Colors", + "TimingPrinter", + "Timer", +] diff --git a/demo/python/.gitignore b/demo/python/.gitignore index 2ce46fe0..1e1bea03 100644 --- a/demo/python/.gitignore +++ b/demo/python/.gitignore @@ -3,3 +3,4 @@ dist MANIFEST.in pvorcademo pvorcademo.egg-info +__pycache__/ \ No newline at end of file diff --git a/demo/python/README.md b/demo/python/README.md index fa86a25e..f8d11e8d 100644 --- a/demo/python/README.md +++ b/demo/python/README.md @@ -1,10 +1,11 @@ -# Orca Text-to-Speech Engine Demo +# Orca Streaming Text-to-Speech Engine Python Demo Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) ## Orca -Orca is an on-device text-to-speech engine producing high-quality, realistic, spoken audio with zero latency. Orca is: +Orca is an on-device streaming text-to-speech engine that is designed for use with LLMs, enabling zero-latency +voice assistants. Orca is: - Private; All voice processing runs locally. - Cross-Platform: @@ -15,7 +16,7 @@ Orca is an on-device text-to-speech engine producing high-quality, realistic, sp ## Compatibility -- Python 3.7+ +- Python 3.8+ - Runs on Linux (x86_64), macOS (x86_64, arm64), Windows (x86_64), Raspberry Pi (5, 4, 3), and NVIDIA Jetson Nano. ## Installation @@ -32,12 +33,35 @@ Signup or Login to [Picovoice Console](https://console.picovoice.ai/) to get you ## Usage -To synthesize speech, run the following: +Orca supports two modes of operation: streaming and single synthesis. + +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +This is demonstrated in the Orca streaming demo. + +In the single synthesis mode, the text is synthesized in a single call to the Orca engine. + +### Streaming synthesis demo + +In this demo, we simulate a response from a language model by creating a text stream from a user-defined text. +We stream that text to Orca and play the synthesized audio as soon as it gets generated. 
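Reduced to its core, the demo's loop looks like the following sketch (the token list and the `play` function are placeholder stand-ins; the `pvorca` streaming calls are the ones the demo itself uses):

```python
import time

import pvorca

orca = pvorca.create(access_key="${ACCESS_KEY}")  # AccessKey from Picovoice Console
stream = orca.stream_open()


def play(pcm):
    # Placeholder: hand the 16-bit PCM chunk (at `orca.sample_rate`) to an audio output device.
    pass


try:
    for token in ["Hello", " world", "!"]:  # simulated LLM token stream
        pcm = stream.synthesize(token)
        if pcm is not None:  # Orca emits audio as soon as enough text has accumulated
            play(pcm)
        time.sleep(1 / 15)  # ~15 tokens per second, the demo's default rate
    pcm = stream.flush()  # synthesize any remaining buffered text
    if pcm is not None:
        play(pcm)
finally:
    stream.close()
    orca.delete()
```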
+ +To run it, execute the following: + +```console +orca_demo_streaming --access_key ${ACCESS_KEY} --text_to_stream ${TEXT} +``` + +Replace `${ACCESS_KEY}` with your `AccessKey` obtained from Picovoice Console and `${TEXT}` with your text to be +streamed to Orca. Please note that this demo was not tested on macOS. + +### Single synthesis demo + +To synthesize speech in a single call to Orca and without audio playback, run the following: ```console orca_demo --access_key ${ACCESS_KEY} --text ${TEXT} --output_path ${WAV_OUTPUT_PATH} ``` -Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${TEXT}` with your text to be synthesized, -and `${WAV_OUTPUT_PATH}` with a path to a `.wav` file where the generated audio will be stored as a single-channel, +Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console, `${TEXT}` with your text to be synthesized, +and `${WAV_OUTPUT_PATH}` with a path to a `.wav` file where the generated audio will be stored as a single-channel, 16-bit PCM `.wav` file. diff --git a/demo/python/orca_demo.py b/demo/python/orca_demo.py index 628c816a..1f2a0cbe 100644 --- a/demo/python/orca_demo.py +++ b/demo/python/orca_demo.py @@ -11,53 +11,75 @@ import argparse import struct +import time import wave from pvorca import create, OrcaActivationLimitError -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument( '--access_key', + '-a', required=True, help='AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)') parser.add_argument( '--text', + '-t', required=True, help='Text to be synthesized') parser.add_argument( '--output_path', + '-o', required=True, help='Absolute path to .wav file where the generated audio will be stored') parser.add_argument( '--library_path', + '-l', help='Absolute path to dynamic library. Default: using the library provided by `pvorca`') parser.add_argument( '--model_path', + '-m', help='Absolute path to Orca model. Default: using the model provided by `pvorca`') args = parser.parse_args() - if not args.output_path.lower().endswith('.wav'): + access_key = args.access_key + model_path = args.model_path + library_path = args.library_path + output_path = args.output_path + text = args.text + + if not output_path.lower().endswith('.wav'): raise ValueError('Given argument --output_path must have WAV file extension') - orca = create(access_key=args.access_key, model_path=args.model_path, library_path=args.library_path) + orca = create(access_key=access_key, model_path=model_path, library_path=library_path) try: - print('Orca version: %s' % orca.version) - pcm = orca.synthesize(args.text) + print(f"Orca version: {orca.version}") + + start = time.time() + + pcm, alignments = orca.synthesize(text) + + processing_time = time.time() - start length_sec = len(pcm) / orca.sample_rate - with wave.open(args.output_path, 'wb') as output_file: + + with wave.open(output_path, "wb") as output_file: output_file.setnchannels(1) output_file.setsampwidth(2) output_file.setframerate(orca.sample_rate) - output_file.writeframes(struct.pack('%dh' % len(pcm), *pcm)) - print('%.2f seconds of audio were written to `%s`.' 
% (length_sec, args.output_path)) + output_file.writeframes(struct.pack(f"{len(pcm)}h", *pcm)) + + print( + f"Orca took {processing_time:.2f} seconds to synthesize {length_sec:.2f} seconds of speech which is " + f"~{length_sec / processing_time:.0f} times faster than real-time.") + print(f"Audio written to `{output_path}`.") except OrcaActivationLimitError: - print('AccessKey has reached its processing limit') + print("AccessKey has reached its processing limit") finally: orca.delete() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/demo/python/orca_demo_streaming.py b/demo/python/orca_demo_streaming.py new file mode 100644 index 00000000..05b0d92e --- /dev/null +++ b/demo/python/orca_demo_streaming.py @@ -0,0 +1,399 @@ +# +# Copyright 2024 Picovoice Inc. +# +# You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" +# file accompanying this source. +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# + +import argparse +import platform +import re +import subprocess +import threading +import time +import traceback +from dataclasses import dataclass +from queue import Queue +from typing import ( + Any, + Callable, + Dict, + Optional, + Sequence, +) + +import numpy as np +import pvorca +import tiktoken +from numpy.typing import NDArray +from pvorca import OrcaActivationLimitError, OrcaInvalidArgumentError +from sounddevice import ( + OutputStream, + query_devices, + PortAudioError, +) + +CUSTOM_PRON_PATTERN = r"\{(.*?\|.*?)\}" +CUSTOM_PRON_PATTERN_NO_WHITESPACE = r"\{(.*?\|.*?)\}(?!\s)" + + +class StreamingAudioDevice: + def __init__(self, device_index: Optional[int] = None) -> None: + if device_index is None: + device_info = query_devices(kind="output") + device_index = int(device_info["index"]) + + self._device_index = device_index + self._queue: Queue[Sequence[int]] = Queue() + + self._buffer = None + self._stream = None + self._sample_rate = None + self._blocksize = None + + def start(self, sample_rate: int) -> None: + self._sample_rate = sample_rate + self._blocksize = self._sample_rate // 20 + self._stream = OutputStream( + channels=1, + samplerate=self._sample_rate, + dtype=np.int16, + device=self._device_index, + callback=self._callback, + blocksize=self._blocksize) + self._stream.start() + + # noinspection PyShadowingNames + # noinspection PyUnusedLocal + def _callback(self, outdata: NDArray, frames: int, time: Any, status: Any) -> None: + if self._queue.empty(): + outdata[:] = 0 + return + + pcm = self._queue.get() + outdata[:, 0] = pcm + + def play(self, pcm_chunk: Sequence[int]) -> None: + if self._stream is None: + raise ValueError("Stream is not started. 
Call `start` method first.") + + pcm_chunk = np.array(pcm_chunk, dtype=np.int16) + + if self._buffer is not None: + if pcm_chunk is not None: + pcm_chunk = np.concatenate([self._buffer, pcm_chunk]) + else: + pcm_chunk = self._buffer + self._buffer = None + + length = pcm_chunk.shape[0] + for index_block in range(0, length, self._blocksize): + if (length - index_block) < self._blocksize: + self._buffer = pcm_chunk[index_block: index_block + (length - index_block)] + else: + self._queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize]) + + def flush_and_terminate(self) -> None: + self.flush() + self.terminate() + + def flush(self) -> None: + if self._buffer is not None: + chunk = np.zeros(self._blocksize, dtype=np.int16) + chunk[:self._buffer.shape[0]] = self._buffer + self._queue.put_nowait(chunk) + + time_interval = self._blocksize / self._sample_rate + while not self._queue.empty(): + time.sleep(time_interval) + + time.sleep(time_interval) + + def terminate(self) -> None: + self._stream.stop() + self._stream.close() + + @staticmethod + def list_output_devices() -> Dict[str, Any]: + return query_devices(kind="output") + + +def linux_machine() -> str: + machine = platform.machine() + if machine == "x86_64": + return machine + elif machine in ["aarch64", "armv7l"]: + arch_info = ("-" + machine) if "64bit" in platform.architecture()[0] else "" + else: + raise NotImplementedError("Unsupported CPU architecture: `%s`" % machine) + + cpu_info = "" + try: + cpu_info = subprocess.check_output(["cat", "/proc/cpuinfo"]).decode("utf-8") + cpu_part_list = [x for x in cpu_info.split("\n") if "CPU part" in x] + cpu_part = cpu_part_list[0].split(" ")[-1].lower() + except Exception as e: + raise RuntimeError("Failed to identify the CPU with `%s`\nCPU info: `%s`" % (e, cpu_info)) + + if "0xd03" == cpu_part: + return "cortex-a53" + arch_info + elif "0xd07" == cpu_part: + return "cortex-a57" + arch_info + elif "0xd08" == cpu_part: + return "cortex-a72" + arch_info + elif "0xd0b" == cpu_part: + return "cortex-a76" + arch_info + else: + raise NotImplementedError("Unsupported CPU: `%s`." 
% cpu_part) + + +class OrcaThread: + @dataclass + class OrcaInput: + text: str + flush: bool + + def __init__( + self, + play_audio_callback: Callable[[Sequence[int]], None], + access_key: str, + num_tokens_per_second: int, + model_path: Optional[str] = None, + library_path: Optional[str] = None, + audio_wait_chunks: Optional[int] = None, + ) -> None: + + self._orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path) + self._orca_stream = self._orca.stream_open() + self._sample_rate = self._orca.sample_rate + + self._play_audio_callback = play_audio_callback + self._num_tokens_per_second = num_tokens_per_second + assert self._num_tokens_per_second > 0 + + self._queue: Queue[Optional[OrcaThread.OrcaInput]] = Queue() + self._thread = None + + self._time_first_audio_available = -1 + self._pcm_buffer: Queue[Sequence[int]] = Queue() + + self._wait_chunks = audio_wait_chunks or self._get_first_audio_wait_chunks() + self._num_pcm_chunks_processed = 0 + + @staticmethod + def _get_first_audio_wait_chunks() -> int: + wait_chunks = 0 + if platform.system() == "Linux": + machine = linux_machine() + if "cortex" in machine: + wait_chunks = 1 + return wait_chunks + + def _run(self) -> None: + while True: + orca_input = self._queue.get() + if orca_input is None: + while not self._pcm_buffer.empty(): + self._play_audio_callback(self._pcm_buffer.get()) + break + + try: + if not orca_input.flush: + pcm = self._orca_stream.synthesize(orca_input.text) + else: + pcm = self._orca_stream.flush() + except OrcaInvalidArgumentError as e: + raise ValueError(f"Orca could not synthesize text input `{orca_input.text}`: `{e}`") + + if pcm is not None: + if self._num_pcm_chunks_processed < self._wait_chunks: + self._pcm_buffer.put_nowait(pcm) + else: + while not self._pcm_buffer.empty(): + self._play_audio_callback(self._pcm_buffer.get()) + self._play_audio_callback(pcm) + + if self._num_pcm_chunks_processed == 0: + self._time_first_audio_available = time.time() + + self._num_pcm_chunks_processed += 1 + + def _close_thread_blocking(self): + self._queue.put_nowait(None) + self._thread.join() + + def start(self) -> None: + self._thread = threading.Thread(target=self._run) + self._thread.start() + + def synthesize(self, text: str) -> None: + self._queue.put_nowait(self.OrcaInput(text=text, flush=False)) + + def flush(self) -> None: + self._queue.put_nowait(self.OrcaInput(text="", flush=True)) + self._close_thread_blocking() + self.start() + + def delete(self) -> None: + self._close_thread_blocking() + self._orca_stream.close() + self._orca.delete() + + def get_time_first_audio_available(self) -> float: + return self._time_first_audio_available + + @property + def sample_rate(self) -> int: + return self._sample_rate + + @property + def version(self) -> str: + return self._orca.version + + +def tokenize_text(text: str) -> Sequence[str]: + text = re.sub(CUSTOM_PRON_PATTERN_NO_WHITESPACE, r'{\1} ', text) + + custom_pronunciations = re.findall(CUSTOM_PRON_PATTERN, text) + custom_pronunciations = set(["{" + pron + "}" for pron in custom_pronunciations]) + + encoder = tiktoken.encoding_for_model("gpt-4") + tokens_raw = [encoder.decode([i]) for i in encoder.encode(text)] + + custom_pron = "" + tokens_with_custom_pronunciations = [] + for i, token in enumerate(tokens_raw): + in_custom_pron = False + for pron in custom_pronunciations: + in_custom_pron_global = len(custom_pron) > 0 + current_match = token.strip() if not in_custom_pron_global else custom_pron + token + if 
pron.startswith(current_match):
+                custom_pron += token.strip() if not in_custom_pron_global else token
+                in_custom_pron = True
+
+        if not in_custom_pron:
+            if custom_pron != "":
+                tokens_with_custom_pronunciations.append(f" {custom_pron}" if i != 0 else custom_pron)
+                custom_pron = ""
+            tokens_with_custom_pronunciations.append(token)
+
+    return tokens_with_custom_pronunciations
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--access_key",
+        "-a",
+        required=True,
+        help="AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)")
+    parser.add_argument(
+        "--library_path",
+        "-l",
+        help="Absolute path to dynamic library. Default: using the library provided by `pvorca`")
+    parser.add_argument(
+        "--model_path",
+        "-m",
+        help="Absolute path to Orca model. Default: using the model provided by `pvorca`")
+    parser.add_argument(
+        "--text_to_stream",
+        "-t",
+        required=True,
+        help="Text to be streamed to Orca")
+    parser.add_argument(
+        "--tokens_per_second",
+        type=int,
+        default=15,
+        help="Number of tokens per second to be streamed to Orca, simulating an LLM response.")
+    parser.add_argument(
+        "--audio_wait_chunks",
+        type=int,
+        default=None,
+        help="Number of PCM chunks to wait before starting to play audio. Default: system-dependent.")
+    parser.add_argument(
+        "--show_audio_devices",
+        action="store_true",
+        help="Only list available audio output devices and exit")
+    parser.add_argument(
+        "--audio_device_index",
+        type=int,
+        default=None,
+        help="Index of the output audio device used for playback")
+    args = parser.parse_args()
+
+    if args.show_audio_devices:
+        print(StreamingAudioDevice.list_output_devices())
+        exit(0)
+
+    access_key = args.access_key
+    model_path = args.model_path
+    library_path = args.library_path
+    text = args.text_to_stream
+    tokens_per_second = args.tokens_per_second
+    audio_wait_chunks = args.audio_wait_chunks
+    audio_device_index = args.audio_device_index
+
+    try:
+        audio_device = StreamingAudioDevice(device_index=audio_device_index)
+        # Some systems may have issues with PortAudio only when starting the audio device. Test it here.
+        audio_device.start(sample_rate=16000)
+        audio_device.terminate()
+        play_audio_callback = audio_device.play
+    except PortAudioError:
+        print(traceback.format_exc())
+        print(
+            "WARNING: Failed to initialize audio device, see details above.
Falling back to running " + "the demo without audio playback.\n") + audio_device = None + + # noinspection PyUnusedLocal + def play_audio_callback(pcm: Sequence[int]): + pass + + orca = OrcaThread( + play_audio_callback=play_audio_callback, + num_tokens_per_second=tokens_per_second, + access_key=access_key, + model_path=model_path, + library_path=library_path, + audio_wait_chunks=audio_wait_chunks, + ) + + orca.start() + if audio_device is not None: + audio_device.start(sample_rate=orca.sample_rate) + + try: + print(f"Orca version: {orca.version}\n") + + print(f"Simulated text stream:") + tokens = tokenize_text(text=text) + + time_start_text_stream = time.time() + for token in tokens: + print(f"{token}", end="", flush=True) + + orca.synthesize(text=token) + + time.sleep(1 / tokens_per_second) + + text_stream_duration_seconds = time.time() - time_start_text_stream + + orca.flush() + + first_audio_available_seconds = orca.get_time_first_audio_available() - time_start_text_stream + print(f"\n\nTime to finish text stream: {text_stream_duration_seconds:.2f} seconds") + print(f"Time to receive first audio: {first_audio_available_seconds:.2f} seconds after text stream started\n") + + if audio_device is not None: + print("Waiting for audio to finish ...") + audio_device.flush_and_terminate() + + except OrcaActivationLimitError: + print("AccessKey has reached its processing limit") + finally: + orca.delete() + + +if __name__ == "__main__": + main() diff --git a/demo/python/requirements.txt b/demo/python/requirements.txt index af97749e..af73a886 100644 --- a/demo/python/requirements.txt +++ b/demo/python/requirements.txt @@ -1 +1,4 @@ -pvorca==0.1.4 +numpy>=1.24.0 +pvorca==0.2.1 +sounddevice==0.4.6 +tiktoken==0.6.0 diff --git a/demo/python/setup.py b/demo/python/setup.py index fc8cc2d2..a83f2029 100644 --- a/demo/python/setup.py +++ b/demo/python/setup.py @@ -3,12 +3,14 @@ import setuptools +INCLUDE_FILES = [ + "../../LICENSE", + "orca_demo.py", + "orca_demo_streaming.py"] -INCLUDE_FILES = ('../../LICENSE', 'orca_demo.py') +os.system("git clean -dfx") -os.system('git clean -dfx') - -package_folder = os.path.join(os.path.dirname(__file__), 'pvorcademo') +package_folder = os.path.join(os.path.dirname(__file__), "pvorcademo") os.mkdir(package_folder) manifest_in = "" @@ -16,23 +18,23 @@ shutil.copy(os.path.join(os.path.dirname(__file__), rel_path), package_folder) manifest_in += "include pvorcademo/%s\n" % os.path.basename(rel_path) -with open(os.path.join(os.path.dirname(__file__), 'MANIFEST.in'), 'w') as f: +with open(os.path.join(os.path.dirname(__file__), "MANIFEST.in"), "w") as f: f.write(manifest_in) -with open(os.path.join(os.path.dirname(__file__), 'README.md'), 'r') as f: +with open(os.path.join(os.path.dirname(__file__), "README.md"), "r") as f: long_description = f.read() setuptools.setup( name="pvorcademo", - version="0.1.3", + version="0.2.1", author="Picovoice", author_email="hello@picovoice.ai", - description="Orca Text-to-Speech Engine demos", + description="Orca Streaming Text-to-Speech Engine demos", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/Picovoice/orca", packages=["pvorcademo"], - install_requires=["pvorca==0.1.4"], + install_requires=["numpy>=1.24.0", "pvorca==0.2.1", "sounddevice==0.4.6", "tiktoken==0.6.0"], include_package_data=True, classifiers=[ "Development Status :: 4 - Beta", @@ -44,9 +46,10 @@ ], entry_points=dict( console_scripts=[ - 'orca_demo=pvorcademo.orca_demo:main', + 
"orca_demo=pvorcademo.orca_demo:main", + "orca_demo_streaming=pvorcademo.orca_demo_streaming:main", ], ), - python_requires='>=3.7', - keywords="Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", + python_requires=">=3.8", + keywords="Streaming Text-to-Speech, TTS, Speech Synthesis, Voice Generation, Speech Engine", ) diff --git a/demo/web/README.md b/demo/web/README.md index 1af0b486..88164656 100644 --- a/demo/web/README.md +++ b/demo/web/README.md @@ -34,10 +34,33 @@ Available on: Hit CTRL-C to stop the server ``` -Wait until Orca has initialized. Type in any text (in English only), and optionally select a desired speech rate. Click -synthesize, and once Orca has finished synthesizing your text, click play and listen for the speech. +Copy in your AccessKey from Picovoice Console, and click "Start Orca". -**Optional**: If you wish, you may replace the model file in the `index.html` with a male model file for a male +## Usage + + +Orca supports two modes of operation: streaming and single synthesis. +In the streaming synthesis mode, Orca processes an incoming text stream in real-time and generates audio in parallel. +In the single synthesis mode, the complete text needs to be known in advance and is synthesized in a single call to +the Orca engine. + +Click on either "Streaming Synthesis" or "Single Synthesis" to continue. + +### Streaming Synthesis + +1. Choose desired speech rate (or keep the default) +2. Click "Open Stream" +3. Type in any text (in English only). +4. When you're done, click "Run Streaming Synthesis" to run streaming synthesis on a simulated text stream. + +### Single Synthesis + +1. Type in any text (in English only) +2. Change the speech rate (or keep the default) +3. Click "Synthesize" +4. Click "Play" and listen for the generated speech. 
+ +**Optional**: If you wish, you may replace the model file in the `index.html` with the male model file for a male voice: ```html diff --git a/demo/web/index.html b/demo/web/index.html index 60d737c2..ff3dfaed 100644 --- a/demo/web/index.html +++ b/demo/web/index.html @@ -8,6 +8,8 @@ let orca = null; let pcm = null; + let alignments = null; + let orcaStream = null; function writeMessage(message) { console.log(message); @@ -36,24 +38,41 @@ return buffer; } - let isPlaying = false; - let originalAudioSource; + const chooseBtnsEl = document.getElementById('choose-btns'); + const chooseSingleBtnEl = document.getElementById('choose-single-btn'); + const chooseStreamBtnEl = document.getElementById('choose-stream-btn'); + const singleSynthesisEl = document.getElementById('single-synthesis'); + const streamSynthesisEl = document.getElementById('stream-synthesis'); + + chooseSingleBtnEl.addEventListener('click', () => { + chooseBtnsEl.style.display = 'none'; + singleSynthesisEl.style.display = 'block'; + }); + + chooseStreamBtnEl.addEventListener('click', () => { + chooseBtnsEl.style.display = 'none'; + streamSynthesisEl.style.display = 'block'; + }); + const textToSynthesizeEl = document.getElementById('text-to-synthesize'); + const textToSynthesizeNumCharsEl = document.getElementById('text-to-synthesize-num-chars'); const textToSynthesizeErrorEl = document.getElementById('text-to-synthesize-error'); const speechRateSliderEl = document.getElementById('speech-rate'); + const speechRateDisplayEl = document.getElementById('speech-rate-display'); const synthesizeBtnEl = document.getElementById('synthesize-btn'); const controlBtnEl = document.getElementById('control-btn'); const downloadBtnEl = document.getElementById('download-btn'); - const speechRateDisplayEl = document.getElementById('speech-rate-display'); + const alignmentsTableEl = document.getElementById('alignments-table'); - function onSynthesizeParamChange() { - if (orca !== null && isPlaying === false) { - synthesizeBtnEl.disabled = false; - controlBtnEl.disabled = true; - downloadBtnEl.disabled = true; - controlBtnEl.innerText = 'Play'; - } - } + const streamTextToSynthesizeEl = document.getElementById('stream-text-to-synthesize'); + const streamTextDisplayEl = document.getElementById('stream-text-display'); + const streamSecondsDisplayEl = document.getElementById('stream-seconds-display'); + const streamTextToSynthesizeErrorEl = document.getElementById('stream-text-to-synthesize-error'); + const streamSpeechRateSliderEl = document.getElementById('stream-speech-rate'); + const streamSpeechRateDisplayEl = document.getElementById('stream-speech-rate-display'); + const streamOpenBtnEl = document.getElementById('stream-open-btn'); + const streamPlayBtnEl = document.getElementById('stream-play-btn'); + const streamCloseBtnEl = document.getElementById('stream-close-btn'); function validateInput(input, validChars) { let nonAllowedCharacters = []; @@ -66,32 +85,130 @@ if (nonAllowedCharacters.length > 0) { textToSynthesizeErrorEl.innerText = `Error: Characters ${JSON.stringify(nonAllowedCharacters)} are not allowed.`; + streamTextToSynthesizeErrorEl.innerText = `Characters ${JSON.stringify(nonAllowedCharacters)} will be ignored.`; synthesizeBtnEl.disabled = true; } else { - textToSynthesizeErrorEl.innerHTML = ' '; + const text = ' '; + textToSynthesizeErrorEl.innerHTML = text; + streamTextToSynthesizeErrorEl.innerHTML = text; synthesizeBtnEl.disabled = false; } } + // Single Synthesis + let isPlaying = false; + let originalAudioSource; + + 
textToSynthesizeEl.addEventListener('input', (e) => { + textToSynthesizeNumCharsEl.innerText = e.target.value.trim().length.toString(); + }); + + function onSynthesizeParamChange() { + if (orca !== null && isPlaying === false) { + synthesizeBtnEl.disabled = false; + controlBtnEl.disabled = true; + downloadBtnEl.disabled = true; + controlBtnEl.innerText = 'Play'; + } + } + + function setAlignmentsTable(alignments) { + if (alignments === null) { + alignmentsTableEl.style.display = 'none'; + return; + } + + alignmentsTableEl.style.display = 'block'; + const rowCount = alignmentsTableEl.rows.length; + for (let i = 1; i < rowCount; i++) { + alignmentsTableEl.deleteRow(1); + } + + alignments.forEach((a) => { + const row = alignmentsTableEl.insertRow(-1); + row.style.verticalAlign = 'top'; + const word = row.insertCell(0); + const start = row.insertCell(1); + const end = row.insertCell(2); + const phonemes = row.insertCell(3); + + word.innerHTML = `${a.word}`; + start.innerHTML = `${a.startSec.toFixed(3)}`; + end.innerHTML = `${a.endSec.toFixed(3)}`; + const phonemesInnerHTML = a.phonemes.map(p => { + return ` + ${p.phoneme} + [${p.startSec.toFixed(3)} - ${p.endSec.toFixed(3)}s] + `; + }).join(''); + phonemes.innerHTML = ` + + + + + + ${phonemesInnerHTML} + + `; + }); + } + + async function synthesize() { + const text = textToSynthesizeEl.value.trim(); + if (text === '') return; + + writeMessage('Synthesizing. Please wait...'); + try { + textToSynthesizeEl.disabled = true; + speechRateSliderEl.disabled = true; + synthesizeBtnEl.disabled = true; + controlBtnEl.disabled = true; + downloadBtnEl.disabled = true; + + const result = await orca.synthesize( + text, + { speechRate: speechRateSliderEl.value }, + ); + + pcm = result.pcm; + setAlignmentsTable(result.alignments); + writeMessage('Synthesizing complete!'); + + controlBtnEl.disabled = false; + downloadBtnEl.disabled = false; + } catch (err) { + writeMessage(err); + } finally { + textToSynthesizeEl.disabled = false; + speechRateSliderEl.disabled = false; + } + } + + function onAudioStop() { + isPlaying = false; + controlBtnEl.innerText = 'Play'; + textToSynthesizeEl.disabled = false; + speechRateSliderEl.disabled = false; + synthesizeBtnEl.disabled = false; + } + textToSynthesizeEl.addEventListener('input', (e) => { onSynthesizeParamChange(); if (orca !== null) { validateInput(e.target.value, orca.validCharacters); } }); + speechRateSliderEl.addEventListener('change', () => { onSynthesizeParamChange(); speechRateDisplayEl.innerText = speechRateSliderEl.value; }); - function onAudioStop() { - isPlaying = false; - controlBtnEl.innerText = 'Play'; - textToSynthesizeEl.disabled = false; - speechRateSliderEl.disabled = false; - } + synthesizeBtnEl.addEventListener('click', async () => await synthesize()); controlBtnEl.addEventListener('click', () => { + if (pcm === null) return; + if (!isPlaying) { originalAudioSource = audioContext.createBufferSource(); originalAudioSource.addEventListener('ended', onAudioStop); @@ -109,56 +226,164 @@ onAudioStop(); } }); + + // Streaming Synthesis + let isPlayingStream = false; + const audioBuffer = []; + let streamSource; + + async function playStream() { + if (isPlayingStream) return; + + if (audioBuffer.length === 0) { + streamPlayBtnEl.disabled = false; + streamCloseBtnEl.disabled = false; + return; + } else { + streamPlayBtnEl.disabled = true; + streamCloseBtnEl.disabled = true; + } + + streamSource = audioContext.createBufferSource(); + + streamSource.buffer = audioBuffer.shift(); + 
streamSource.connect(originalAudioGain); + + streamSource.onended = async () => { + isPlayingStream = false; + await playStream(); + }; + + streamSource.start(); + isPlayingStream = true; + } + + async function streamOpen() { + writeMessage('Opening stream. Please wait...'); + try { + streamTextToSynthesizeEl.disabled = true; + streamSpeechRateSliderEl.disabled = true; + streamOpenBtnEl.disabled = true; + + orcaStream = await orca.streamOpen({ + speechRate: streamSpeechRateSliderEl.value, + }); + + streamTextToSynthesizeEl.disabled = false; + streamCloseBtnEl.disabled = false; + streamPlayBtnEl.disabled = false; + + writeMessage('Stream opened. Type in the input field!'); + } catch (err) { + writeMessage(err); + } + } + + async function streamPlay() { + writeMessage('Synthesizing and playing speech! Please listen for audio.'); + try { + streamTextDisplayEl.innerText = ''; + streamSecondsDisplayEl.innerText = '0'; + + const text = streamTextToSynthesizeEl.value; + const words = text.split(' ').map(str => `${str} `); + let numIterations = 0; + + for (const word of words) { + streamTextDisplayEl.innerText += word; + const wordPcm = await orcaStream.synthesize(word); + if (wordPcm !== null) { + const curSecs = parseFloat(streamSecondsDisplayEl.innerText); + const newSecs = wordPcm.length / orca.sampleRate; + const time = curSecs + newSecs; + streamSecondsDisplayEl.innerText = time.toFixed(3); + audioBuffer.push(createBuffer(wordPcm)); + if (numIterations === 1) { + await playStream(); + } + numIterations++; + } + await new Promise(r => setTimeout(r, 100)); + } + + const flushPcm = await orcaStream.flush(); + if (flushPcm !== null) { + const curSecs = parseFloat(streamSecondsDisplayEl.innerText); + const newSecs = flushPcm.length / orca.sampleRate; + const time = curSecs + newSecs; + streamSecondsDisplayEl.innerText = time.toFixed(3); + audioBuffer.push(createBuffer(flushPcm)); + await playStream(); + } + } catch (err) { + writeMessage(err); + } + } + + async function streamClose() { + writeMessage('Closing stream. Please wait...'); + try { + streamTextToSynthesizeEl.disabled = true; + if (streamSource) { + streamSource.stop(); + } + + await orcaStream.close(); + orcaStream = null; + + streamSpeechRateSliderEl.disabled = false; + streamOpenBtnEl.disabled = false; + streamPlayBtnEl.disabled = true; + streamCloseBtnEl.disabled = true; + streamTextToSynthesizeEl.value = ''; + writeMessage('Stream closed! Click "Open Stream" to begin.'); + } catch (err) { + writeMessage(err); + } + } + + streamOpenBtnEl.addEventListener('click', async () => await streamOpen()); + streamPlayBtnEl.addEventListener('click', async () => await streamPlay()); + streamCloseBtnEl.addEventListener('click', async () => await streamClose()); + + streamTextToSynthesizeEl.addEventListener('input', (e) => { + if (orca !== null) { + validateInput(e.target.value, orca.validCharacters); + } + }); + + streamSpeechRateSliderEl.addEventListener('change', () => { + streamSpeechRateDisplayEl.innerText = streamSpeechRateSliderEl.value; + }); }; async function startOrca(accessKey) { writeMessage('Orca is loading. 
Please wait...'); try { document.getElementById('start-orca').disabled = true; - document.getElementById('text-to-synthesize').disabled = true; orca = await OrcaWeb.OrcaWorker.create( accessKey, { base64: modelParams, forceWrite: true }, ); - document.getElementById('text-to-synthesize').disabled = false; - document.getElementById('speech-rate').disabled = false; + document.getElementById('choose-btns').style.display = 'block'; + const maxCharacterLimit = orca.maxCharacterLimit.toString(); + document.getElementById('max-char-limit').innerText = maxCharacterLimit; + document.getElementById('text-to-synthesize').maxLength = maxCharacterLimit; + document.getElementById('stream-text-to-synthesize').maxLength = maxCharacterLimit; writeMessage('Orca worker ready!'); } catch (err) { writeMessage(err); } } - async function synthesize() { - writeMessage('Synthesizing. Please wait...'); - try { - document.getElementById('text-to-synthesize').disabled = true; - document.getElementById('speech-rate').disabled = true; - document.getElementById('synthesize-btn').disabled = true; - document.getElementById('control-btn').disabled = true; - document.getElementById('download-btn').disabled = true; - const text = document.getElementById('text-to-synthesize').value; - const speechRate = document.getElementById('speech-rate').value; - pcm = await orca.synthesize(text, { speechRate }); - writeMessage('Synthesizing complete!'); - document.getElementById('control-btn').disabled = false; - document.getElementById('download-btn').disabled = false; - } catch (err) { - writeMessage(err); - } finally { - document.getElementById('text-to-synthesize').disabled = false; - document.getElementById('speech-rate').disabled = false; - } - } - function downloadDumpAudio() { let blob = new Blob([pcm]); let a = document.createElement('a'); a.download = 'orca_speech_audio.pcm'; a.href = window.URL.createObjectURL(blob); a.click(); - document.removeChild(a); } @@ -185,31 +410,93 @@ Orca Web Demo onclick="startOrca(document.getElementById('accessKey').value)" /> -Text to synthesize (English) - - + + Synthesize speech live as you type: + + Streaming Synthesis + + OR + Synthesize speech in one call: + + Single Synthesize + + + + + + Text to synthesize + + - - - Speech Rate: 1.0 - - - - - - Synthesize - -Play -Download Audio - + 0/ + + + Speech Rate: 1.0 + + + Synthesize + + + Play + Download Audio + + + + + + + + + + + Word + Start time (s) + End time (s) + Phonemes + + + + + + Speech Rate: 1.0 + + + + + Open Stream + + + Text to stream to Orca + + + + + + Run Streaming Synthesis + Reset + + + Simulated text stream + + + + Seconds of audio synthesized: 0s + +
OR