From 7019010f44f3c6a3ca07248e92f60eaae5fe5690 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Wed, 8 May 2024 11:23:51 -0700 Subject: [PATCH] change folder name and small improvements --- .github/workflows/python-demo.yml | 4 +- binding/python/README.md | 12 ++-- binding/python/_orca.py | 3 +- .../.gitignore | 0 .../README.md | 10 +-- .../llm_voice_assistant_demo.py} | 0 .../requirements.txt | 0 .../src/__init__.py | 0 .../src/audio_device.py | 0 .../src/llm.py | 0 .../src/synthesizer.py | 0 .../src/transcriber.py | 0 .../src/user_input.py | 0 .../src/util.py | 0 demo/voice_assistant/generate_sentences.py | 72 ------------------- include/pv_orca.h | 5 +- 16 files changed, 20 insertions(+), 86 deletions(-) rename demo/{voice_assistant => llm_voice_assistant}/.gitignore (100%) rename demo/{voice_assistant => llm_voice_assistant}/README.md (84%) rename demo/{voice_assistant/orca_voice_assistant_demo.py => llm_voice_assistant/llm_voice_assistant_demo.py} (100%) rename demo/{voice_assistant => llm_voice_assistant}/requirements.txt (100%) rename demo/{voice_assistant => llm_voice_assistant}/src/__init__.py (100%) rename demo/{voice_assistant => llm_voice_assistant}/src/audio_device.py (100%) rename demo/{voice_assistant => llm_voice_assistant}/src/llm.py (100%) rename demo/{voice_assistant => llm_voice_assistant}/src/synthesizer.py (100%) rename demo/{voice_assistant => llm_voice_assistant}/src/transcriber.py (100%) rename demo/{voice_assistant => llm_voice_assistant}/src/user_input.py (100%) rename demo/{voice_assistant => llm_voice_assistant}/src/util.py (100%) delete mode 100644 demo/voice_assistant/generate_sentences.py diff --git a/.github/workflows/python-demo.yml b/.github/workflows/python-demo.yml index 7bcc19f2..e35f1155 100644 --- a/.github/workflows/python-demo.yml +++ b/.github/workflows/python-demo.yml @@ -51,7 +51,7 @@ jobs: pip install wheel cd ../../binding/python python3 setup.py sdist bdist_wheel - python3 -m pip install dist/pvorca-0.2.0-py3-none-any.whl 
+ python3 -m pip install dist/pvorca-0.2.0-py3-none-any.whl --force-reinstall - name: Install dependencies run: | @@ -88,7 +88,7 @@ jobs: pip install wheel cd ../../binding/python python3 setup.py sdist bdist_wheel - python3 -m pip install dist/pvorca-0.2.0-py3-none-any.whl + python3 -m pip install dist/pvorca-0.2.0-py3-none-any.whl --force-reinstall - name: Install dependencies run: pip3 install -r requirements.txt diff --git a/binding/python/README.md b/binding/python/README.md index 2beaa9da..0ecf296b 100644 --- a/binding/python/README.md +++ b/binding/python/README.md @@ -53,9 +53,9 @@ To synthesize a text stream, create an `Orca.Stream` object and add text to it o stream = orca.stream_open() for text_chunk in text_generator(): - pcm = stream.synthesize(text_chunk) - if pcm is not None: - # handle pcm + pcm = stream.synthesize(text_chunk) + if pcm is not None: + # handle pcm pcm = stream.flush() if pcm is not None: @@ -135,11 +135,14 @@ and replace `${MODEL_PATH}` with the path to the model file with the desired voi ### Speech control -Orca allows for keyword arguments to control the synthesized speech. They can be provided to the `stream_open` +Orca allows for keyword arguments to control the synthesized speech. They can be provided to the `stream_open` method or the single synthesis methods `synthesize` and `synthesize_to_file`: - `speech_rate`: Controls the speed of the generated speech. Valid values are within [0.7, 1.3]. A higher (lower) value produces speech that is faster (slower). The default is `1.0`. +- `random_state`: Sets the random state for sampling during synthesis. This can be used to ensure that the synthesized + speech is deterministic across different runs. Valid values are all non-negative integers. If not provided, a random + seed will be chosen and the synthesis process will be non-deterministic. 
### Orca properties @@ -167,4 +170,3 @@ The `Orca.PhonemeAlignment` object has the following properties: [pvorcademo](https://pypi.org/project/pvorcademo/) provides command-line utilities for synthesizing audio using Orca. - diff --git a/binding/python/_orca.py b/binding/python/_orca.py index 70d584da..8cf56604 100644 --- a/binding/python/_orca.py +++ b/binding/python/_orca.py @@ -444,7 +444,8 @@ def synthesize( The pronunciation is expressed in ARPAbet format, e.g.: "I {live|L IH V} in {Sevilla|S EH V IY Y AH}". :param speech_rate: Rate of speech of the synthesized audio. Higher numbers correspond to faster speech. Valid values are within [0.7, 1.3]. - :param random_state: Random seed for the synthesis process. + :param random_state: Random seed for the synthesis process. Valid values are all non-negative integers. If not + provided, a random seed will be chosen. :return: A tuple containing the generated audio as a sequence of 16-bit linearly-encoded integers and a sequence of OrcaWordAlignment objects representing the word alignments. """ diff --git a/demo/voice_assistant/.gitignore b/demo/llm_voice_assistant/.gitignore similarity index 100% rename from demo/voice_assistant/.gitignore rename to demo/llm_voice_assistant/.gitignore diff --git a/demo/voice_assistant/README.md b/demo/llm_voice_assistant/README.md similarity index 84% rename from demo/voice_assistant/README.md rename to demo/llm_voice_assistant/README.md index da779519..68546e4a 100644 --- a/demo/voice_assistant/README.md +++ b/demo/llm_voice_assistant/README.md @@ -2,8 +2,8 @@ Made in Vancouver, Canada by [Picovoice](https://picovoice.ai) -This demo showcases how [Orca Streaming Text-to-Speech](https://picovoice.ai/platform/orca/) can be seamlessly integrated into LLM-applications to drastically reduce the audio latency -of voice assistants. 
+This demo showcases how [Orca Streaming Text-to-Speech](https://picovoice.ai/platform/orca/) can be seamlessly +integrated into LLM-applications to drastically reduce the audio latency of voice assistants. ## Technologies @@ -14,12 +14,12 @@ a Text-to-Speech engine. The following technologies are used: - Speech-to-Text: Picovoice's [Cheetah Streaming Speech-to-Text](https://picovoice.ai/platform/cheetah/) -- LLM: \"ChatGPT\" using `gpt-3.5-turbo` +- LLM: "ChatGPT" using `gpt-3.5-turbo` with OpenAI Chat Completion API. - TTS: - Picovoice's [Orca Streaming Text-to-Speech](https://picovoice.ai/platform/orca/) - OpenAI TTS - + ## Compatibility This demo has been tested on Linux (x86_64) and macOS (x86_64) using Python 3.10. @@ -35,7 +35,7 @@ To run all features of this demo, access keys are required for: ## Usage ```bash -python orca_voice_assistant_demo.py --picovoice-access-key ${PV_ACCESS_KEY} --openai-access-key ${OPEN_AI_KEY} +python llm_voice_assistant_demo.py --picovoice-access-key ${PV_ACCESS_KEY} --openai-access-key ${OPEN_AI_KEY} ``` Replace `${PV_ACCESS_KEY}` with your `AccessKey` obtained from Picovoice Console, diff --git a/demo/voice_assistant/orca_voice_assistant_demo.py b/demo/llm_voice_assistant/llm_voice_assistant_demo.py similarity index 100% rename from demo/voice_assistant/orca_voice_assistant_demo.py rename to demo/llm_voice_assistant/llm_voice_assistant_demo.py diff --git a/demo/voice_assistant/requirements.txt b/demo/llm_voice_assistant/requirements.txt similarity index 100% rename from demo/voice_assistant/requirements.txt rename to demo/llm_voice_assistant/requirements.txt diff --git a/demo/voice_assistant/src/__init__.py b/demo/llm_voice_assistant/src/__init__.py similarity index 100% rename from demo/voice_assistant/src/__init__.py rename to demo/llm_voice_assistant/src/__init__.py diff --git a/demo/voice_assistant/src/audio_device.py b/demo/llm_voice_assistant/src/audio_device.py similarity index 100% rename from 
demo/voice_assistant/src/audio_device.py rename to demo/llm_voice_assistant/src/audio_device.py diff --git a/demo/voice_assistant/src/llm.py b/demo/llm_voice_assistant/src/llm.py similarity index 100% rename from demo/voice_assistant/src/llm.py rename to demo/llm_voice_assistant/src/llm.py diff --git a/demo/voice_assistant/src/synthesizer.py b/demo/llm_voice_assistant/src/synthesizer.py similarity index 100% rename from demo/voice_assistant/src/synthesizer.py rename to demo/llm_voice_assistant/src/synthesizer.py diff --git a/demo/voice_assistant/src/transcriber.py b/demo/llm_voice_assistant/src/transcriber.py similarity index 100% rename from demo/voice_assistant/src/transcriber.py rename to demo/llm_voice_assistant/src/transcriber.py diff --git a/demo/voice_assistant/src/user_input.py b/demo/llm_voice_assistant/src/user_input.py similarity index 100% rename from demo/voice_assistant/src/user_input.py rename to demo/llm_voice_assistant/src/user_input.py diff --git a/demo/voice_assistant/src/util.py b/demo/llm_voice_assistant/src/util.py similarity index 100% rename from demo/voice_assistant/src/util.py rename to demo/llm_voice_assistant/src/util.py diff --git a/demo/voice_assistant/generate_sentences.py b/demo/voice_assistant/generate_sentences.py deleted file mode 100644 index e8ff809e..00000000 --- a/demo/voice_assistant/generate_sentences.py +++ /dev/null @@ -1,72 +0,0 @@ -import argparse - -from tqdm import tqdm - -from src import LLM, LLMs - - -SYSTEM_PROMPT = """ - You are a friendly voice assistant in customer service of an e-commerce platform. - Use natural, conversational language that are clear and easy to follow (short sentences, simple words). - Only use english letters and punctuation, no special characters. - Be verbose. - Keep the conversation flowing naturally. - Don't use lists. - If the customer was successful, say "Great!" and ask if they need help with anything else. 
- """ - - -def main(args: argparse.Namespace) -> None: - - sentences = [] - - first_sentence = """ - Hi, I'm trying to place an order on your webpage but I'm having trouble with the checkout process. - Can you help me?""" - second_sentence = "The place order button isn't working." - - llm = LLM.create(LLMs.OPENAI, access_key=args.openai_access_key, assistant_prompt=SYSTEM_PROMPT) - - for _ in tqdm(range(50)): - llm_message = "".join([t for t in llm.chat(first_sentence) if t is not None]) - sentences.append(llm_message) - - #print(llm_message) - - llm_message = "".join([t for t in llm.chat(second_sentence) if t is not None]) - sentences.append(llm_message) - - #print(llm_message) - - # TODO: implement this method if using this script - llm.reset_history() - - print("=============================================================================") - - # print sentences like a python list in the following format - # [ - # "sentence 1", - # "sentence 2", - # ] - - print("[") - for sentence in sentences: - sentence = sentence.replace("\n", "") - sentence = sentence.replace("\"Place Order\"", "Place Order") - print(f' "{sentence}",') - print("]") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Text-to-speech streaming synthesis") - - parser.add_argument( - "--llm", - default=LLMs.DUMMY.value, - choices=[l.value for l in LLMs], - help="Choose LLM to use") - parser.add_argument( - "--openai-access-key", - help="Open AI access key. Needed when using openai models") - - main(parser.parse_args()) diff --git a/include/pv_orca.h b/include/pv_orca.h index 3c22cb17..f7f7d9d5 100644 --- a/include/pv_orca.h +++ b/include/pv_orca.h @@ -190,7 +190,8 @@ typedef struct { /** * Generates audio from text. The returned audio contains the speech representation of the text. * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open. 
- * The memory of the returned audio is allocated by Orca and can be deleted with `pv_orca_pcm_delete()` + * The memory of the returned audio and the alignment metadata is allocated by Orca and can be deleted with + * `pv_orca_pcm_delete()` and `pv_orca_word_alignments_delete()`, respectively. * * @param object The Orca object. * @param text Text to be converted to audio. The maximum length can be attained by calling @@ -219,6 +220,8 @@ PV_API pv_status_t pv_orca_synthesize( /** * Generates audio from text and saves it to a file. The file contains the speech representation of the text. * This function returns `PV_STATUS_INVALID_STATE` if an OrcaStream object is open. + * The memory of the returned alignment metadata is allocated by Orca and can be deleted with + * `pv_orca_word_alignments_delete()`. * * @param object The Orca object. * @param text Text to be converted to audio. The maximum length can be attained by calling