Orca prepare v0.2 #17

Merged · 181 commits · May 10, 2024
Changes from 1 commit
2b11ca0
update orca
bejager Apr 9, 2024
51390be
add libs android, ios, java, mac, rpi
bejager Apr 9, 2024
894bedc
update tests to compare against raw pcm
bejager Apr 9, 2024
09dd9d0
first try streaming orca demo
bejager Apr 9, 2024
adb4963
version with interactive text input
bejager Apr 10, 2024
c3a4637
refactor streaming demo
bejager Apr 10, 2024
ee6369d
updated binding
bejager Apr 10, 2024
29dea80
update tests
bejager Apr 10, 2024
3e338b1
let user choose t/s
bejager Apr 10, 2024
5aa2ab6
include open ai llm in demo
bejager Apr 11, 2024
db22da2
add open ai tts
bejager Apr 11, 2024
9126b61
update libs
bejager Apr 11, 2024
c7129a6
refactor streaming demo
bejager Apr 11, 2024
fa46e69
tweaks
bejager Apr 11, 2024
b8c3166
move to llm folder
bejager Apr 11, 2024
4bb83a7
review tests
bejager Apr 11, 2024
c7b27a3
pin pip versions
bejager Apr 11, 2024
288f31f
don't depend on numpy
bejager Apr 11, 2024
38a5abd
use local package for python demo
bejager Apr 11, 2024
2fb2167
fix c test
bejager Apr 11, 2024
a859b21
add to dict, fix requirements for python demo
bejager Apr 11, 2024
7d9e567
update libs and fix test
bejager Apr 13, 2024
87ac894
update progress printer
bejager Apr 15, 2024
e775341
tweaks
bejager Apr 15, 2024
a51c81c
add python streaming demo
bejager Apr 16, 2024
3643d9b
add C streaming demo
bejager Apr 16, 2024
cead4be
add C streaming demo file
bejager Apr 16, 2024
0db84b4
add streaming demo to actions, add to README
bejager Apr 16, 2024
e9cc059
fix python tests
bejager Apr 16, 2024
d1c2f26
update llm demo
bejager Apr 16, 2024
b99308d
tweaks
bejager Apr 16, 2024
3da8c43
wip
bejager Apr 16, 2024
82d993c
update llm demo
bejager Apr 16, 2024
8b56b02
update llm demo
bejager Apr 17, 2024
4ed95fe
clean-up progress printer
bejager Apr 17, 2024
36db3ac
tweaks
bejager Apr 17, 2024
fad2982
merge
bejager Apr 22, 2024
4ac5b6a
option to save metadata for animation
bejager Apr 22, 2024
59fa51f
wip
bejager Apr 22, 2024
fd4712c
Merge branch 'orca-prepare-v0.2' of github.com:Picovoice/orca into or…
bejager Apr 22, 2024
a46cea3
update models
bejager Apr 22, 2024
d5c96db
fix llm demo
bejager Apr 22, 2024
89cf2e2
demo styling
bejager Apr 23, 2024
a0ecd4e
tweak
bejager Apr 23, 2024
606f40b
update python demo
bejager Apr 30, 2024
ee2a927
smart wait chunk calculation
bejager May 1, 2024
dbff25b
refactor python demo and update readme
bejager May 1, 2024
4ed4d90
clean-up
bejager May 1, 2024
651dd6f
tweaks
bejager May 1, 2024
4860e1a
comment exact alignment test
bejager May 1, 2024
fefc482
make spell check happy
bejager May 1, 2024
0ed4fa7
update workflows
bejager May 1, 2024
9a571d3
no v3.7
bejager May 1, 2024
0caf1e8
update tests
bejager May 1, 2024
1478902
update requirements
bejager May 1, 2024
494ff27
install dependency
bejager May 2, 2024
5d57420
update dependencies
bejager May 2, 2024
976a9c1
add fallback when no audio device is connected to runner
bejager May 2, 2024
8e3e4fc
update readme
bejager May 2, 2024
dadd98d
install portaudio on ubuntu
bejager May 2, 2024
d10f896
update README
bejager May 2, 2024
2347cc2
update libs
bejager May 2, 2024
b97651b
update demos
bejager May 2, 2024
f86b6c3
update alignment test data
bejager May 2, 2024
18e948c
Merge branch 'main' into orca-prepare-v0.2
bejager May 2, 2024
e618874
max character limit
albho Apr 16, 2024
816a30b
set random state
albho Apr 16, 2024
201f814
alignments, cleanup types
albho Apr 16, 2024
891b5bc
minor
albho Apr 18, 2024
b321890
main good
albho Apr 19, 2024
e926e8d
works
albho Apr 20, 2024
9267d89
worker & demo
albho Apr 22, 2024
edc1e4f
readme
albho Apr 22, 2024
b6dedc3
spelling
albho Apr 22, 2024
e5429d6
use promises instead of callbacks
albho Apr 23, 2024
8e619d4
update demo
albho Apr 25, 2024
7e2a92b
update demo
albho Apr 29, 2024
7500ecf
update
albho Apr 30, 2024
6bd7d6e
revert
albho Apr 30, 2024
8683598
minor
albho Apr 30, 2024
0ae5dd8
wait for second pcm to start playing audio
albho May 1, 2024
edc52f2
cleanup
albho May 2, 2024
cd0beae
update tests
bejager May 2, 2024
89f99ee
fix test
albho May 2, 2024
f6de5ef
test install portaudio
bejager May 2, 2024
ed8b9f7
tweaks
bejager May 2, 2024
6e2fb0d
more tweaks
bejager May 2, 2024
10c65c1
bump python version and test 3.10+
bejager May 2, 2024
7e3cfca
open numpy version
bejager May 2, 2024
66305ed
readme fixes
bejager May 3, 2024
d1e71c6
update readmes and workflows
bejager May 3, 2024
2724681
review
bejager May 3, 2024
9a4ddcc
open numpy
bejager May 3, 2024
5917c54
update demo setup script
bejager May 3, 2024
d975bdc
printouts to c demo test
bejager May 3, 2024
66603ed
fix ci, review 2
bejager May 3, 2024
a8915ab
add to spell dict
bejager May 3, 2024
1d42539
cleanup
bejager May 3, 2024
14b7a0c
align readmes with python
bejager May 3, 2024
ce9ac68
Merge branch 'orca-prepare-v0.2' into web-v0.2-update
bejager May 3, 2024
96c088f
README updates
bejager May 3, 2024
4717268
after release
bejager May 3, 2024
96c72fe
update
albho May 5, 2024
9bd158d
init
albho May 2, 2024
ea37402
update
albho May 2, 2024
5028cd0
fix
albho May 2, 2024
04242a0
fix
albho May 2, 2024
f199010
update
albho May 2, 2024
0447ae9
update
albho May 2, 2024
98e7233
update libs
albho May 2, 2024
1298370
revert
albho May 2, 2024
cb8beaa
update
albho May 2, 2024
2a3ed75
version
albho May 3, 2024
aea6bbf
update
albho May 6, 2024
e0d821d
test
albho May 6, 2024
c1a7f75
streaming working
albho May 6, 2024
38a5341
working
albho May 7, 2024
8f02600
cleanup
albho May 7, 2024
caaf4cd
minor
albho May 7, 2024
fb76010
minor
albho May 7, 2024
4102949
python open_stream -> stream_open
bejager May 7, 2024
6ed3ecc
update
albho May 7, 2024
ef567a7
minor
albho May 7, 2024
cb3ff2c
update readmes and comments
albho May 7, 2024
7997362
binding
albho Apr 26, 2024
2cfae3d
demo
albho May 1, 2024
6d61d8c
cleanup
albho May 1, 2024
4d5e1b4
wait for second chunk to play audio
albho May 1, 2024
529e250
minor clean
albho May 2, 2024
a2c1c27
update readme
bejager May 3, 2024
62f314c
update
albho May 5, 2024
fdc52a4
version
albho May 6, 2024
a8a64b2
update
albho May 7, 2024
9288b22
update ui
albho May 7, 2024
56e6cb1
update comments
albho May 8, 2024
7019010
change folder name and small improvements
bejager May 8, 2024
7854069
change title
bejager May 8, 2024
41a4397
staged
albho May 8, 2024
2e31804
actions
albho May 8, 2024
0f0e08b
try actions
albho May 8, 2024
74620c9
double quotes
albho May 8, 2024
af3a022
rm branch for perf
albho May 8, 2024
7d87858
revert perf
albho May 8, 2024
6451d09
Update web.yml
laves May 8, 2024
c55254d
Update web.yml
laves May 8, 2024
8e6caa8
post-init constants set on init
albho May 8, 2024
e8b3c03
initialize post-init constants in init function
bejager May 8, 2024
a62e910
workflow
albho May 8, 2024
a8136f3
fix
albho May 8, 2024
5491b17
trigger wf
albho May 8, 2024
e054363
trigger workflows, allow streaming synth on invalid input
albho May 8, 2024
741b11c
try fix appcenter
albho May 8, 2024
7fed5cb
protected
albho May 8, 2024
aa55260
harmonize python demo arguments
bejager May 8, 2024
26c23d2
update
albho May 8, 2024
d08f707
update
albho May 8, 2024
e264960
Merge pull request #20 from Picovoice/web-v0.2-update
bejager May 8, 2024
5beec79
update demo
albho May 8, 2024
585c8fa
workflows
albho May 8, 2024
7ec64c7
fix test
albho May 8, 2024
805cdae
test
albho May 8, 2024
100f9e2
update web readme
bejager May 8, 2024
8c4d791
fix perf test
albho May 9, 2024
10864b7
try fix perf
albho May 9, 2024
0a648b2
orca -> orcastream
bejager May 9, 2024
579d2d2
Merge pull request #22 from Picovoice/android-v0.2-update
bejager May 9, 2024
fcae359
update main readmes with new web and android
bejager May 9, 2024
190890c
web make OrcaStream type available
albho May 9, 2024
f1f435e
revert header change
albho May 9, 2024
d5e52f9
harmonize readmes
bejager May 9, 2024
f16dff6
demo update
albho May 9, 2024
65f0f5a
change to tag
albho May 9, 2024
06d7fc6
python release 0.2.1
bejager May 9, 2024
ea2844e
minor
albho May 9, 2024
0db4fcc
add iOS streaming version to main readme
bejager May 9, 2024
dbe3c57
web post-release
albho May 9, 2024
335444f
android post-release
albho May 9, 2024
850a213
ios post-release
albho May 9, 2024
ffacaf3
Merge pull request #23 from Picovoice/ios-v0.2-update
bejager May 9, 2024
c1bf63c
cleanup web, ios, android workflows
albho May 9, 2024
26e3d53
ios lint
albho May 10, 2024
include open ai llm in demo
bejager committed Apr 11, 2024
commit 5aa2ab6266ac0ac13e954677c0db7fcb61984eca
2 changes: 1 addition & 1 deletion binding/python/test_orca.py
@@ -47,7 +47,7 @@ def tearDownClass(cls):
     def _test_audio(self, pcm: Sequence[int], ground_truth: Sequence[int]) -> None:
         self.assertEqual(len(pcm), len(ground_truth))
         for i in range(len(pcm)):
-            self.assertAlmostEqual(pcm[i], ground_truth[i])
+            self.assertAlmostEqual(pcm[i], ground_truth[i], delta=100)

     def _test_equal_timestamp(self, timestamp: float, timestamp_truth: float) -> None:
         self.assertAlmostEqual(timestamp, timestamp_truth, places=2)
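The change above replaces exact per-sample equality with a tolerance of 100, since synthesized PCM can differ slightly across platforms and library versions. The same check as a standalone sketch (the helper name and the `delta` default are illustrative, not from the repo):

```python
from typing import Sequence


def pcm_close(pcm: Sequence[int], ground_truth: Sequence[int], delta: int = 100) -> bool:
    # Compare two 16-bit PCM buffers sample by sample, tolerating small
    # numerical drift (e.g. platform-specific float rounding) up to `delta`.
    if len(pcm) != len(ground_truth):
        return False
    return all(abs(a - b) <= delta for a, b in zip(pcm, ground_truth))
```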
1 change: 1 addition & 0 deletions demo/python/.gitignore
@@ -3,3 +3,4 @@ dist
 MANIFEST.in
 pvorcademo
 pvorcademo.egg-info
+__pycache__/
484 changes: 464 additions & 20 deletions demo/python/demo_util.py
@@ -1,34 +1,478 @@
 import json
 import os
 import random
+import threading
 import time
-from typing import Generator, Optional, Sequence
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from queue import Queue
+from typing import *
+
+import numpy as np
+import pvorca
+import sounddevice as sd
+import tiktoken
+from numpy.typing import NDArray
+from pvorca import Orca, OrcaInvalidArgumentError


-def load_demo_sentences() -> Sequence[str]:
-    data_file_path = os.path.join(os.path.dirname(__file__), "../../resources/demo/demo_data.json")
-    with open(data_file_path, encoding="utf8") as data_file:
-        demo_data = json.loads(data_file.read())
-    return demo_data["demo_sentences"]
-
-
-def get_text_generator(token_delay: float = 0.1, text_index: Optional[int] = None) -> Generator[str, None, None]:
-    sentences = load_demo_sentences()
-    text_index = text_index if text_index is not None else random.randint(0, len(sentences) - 1)
-
-    text = ""
-    for char in sentences[text_index]:
-        if char == " " or char in {".", ",", "!", "?"}:
-            token = f"{text}{char}"
-            time.sleep(token_delay)
-            yield token
-            text = ""
-        else:
-            text += char
-    if text != "":
-        yield text
+@dataclass
+class Timestamps:
+    time_llm_request: float = -1.0
+    time_first_llm_token: float = -1.0
+    time_last_llm_token: float = -1.0
+    time_first_synthesis_request: float = -1.0
+    time_first_audio: float = -1.0
+    initial_audio_delay: float = 0.0
+
+    def reset(self) -> None:
+        self.time_llm_request = -1.0
+        self.time_first_llm_token = -1.0
+        self.time_last_llm_token = -1.0
+        self.time_first_synthesis_request = -1.0
+        self.time_first_audio = -1.0
+        self.initial_audio_delay = 0.0
+
+    def pretty_print_diffs(self, num_tokens: int) -> None:
+        print("\033[90m", end="")
+        print("\n** Responsiveness metrics **")
+
+        api_request_delay = max(self.time_first_llm_token - self.time_llm_request, 0.01)
+        print(
+            f"Estimated tokens / second: ~{num_tokens / (self.time_last_llm_token - self.time_llm_request):.0f}",
+            end="")
+        print(
+            "" if api_request_delay == 0.01 else
+            f" (includes delay of API request of `{api_request_delay:.2f}` seconds).")
+
+        print(f"Time to generate text: {self.time_last_llm_token - self.time_first_llm_token:.2f} seconds")
+        print(f"Time to first audio: {self.time_first_audio - self.time_first_llm_token:.2f} seconds", end="")
+        if self.initial_audio_delay > 0:
+            print(
+                f" (+ applied initial delay of `{self.initial_audio_delay:.2f} seconds` "
+                "to ensure continuous audio)")
+        print()
+        print("\033[0m")
+
+    def debug_print(self) -> None:
+        def to_hms(t: float) -> str:
+            date_object = datetime.fromtimestamp(t)
+            return date_object.strftime("%H:%M:%S.%f")[:-3]
+
+        print(f"time first LLM token {to_hms(self.time_first_llm_token)}")
+        print(f"time last LLM token {to_hms(self.time_last_llm_token)}")
+        print(f"time first synthesis request {to_hms(self.time_first_synthesis_request)}")
+        print(f"time first audio {to_hms(self.time_first_audio)}")
+
+
+class LLMs(Enum):
+    DUMMY = "dummy"
+    OPENAI = "openai"
+
+
+class LLM:
+    SYSTEM_PROMPT = """
+    You are a voice assistant.
+    Use natural, conversational language that are clear and easy to follow (short sentences, simple words).
+    Only use english letters and punctuation, no special characters.
+    Don't ever use numbers directly. Verbalize them (e.g. "five" instead of "5").
+    Keep the conversation flowing.
+    Ask relevant follow-up questions.
+    """
+    DEFAULT_USER_PROMPT = "USER PROMPT:\n"
+
+    def __init__(
+            self,
+            synthesize_text_callback: Optional[Callable[[str], None]],
+            user_prompt: Optional[str] = None,
+    ) -> None:
+        self._synthesize_text_callback = synthesize_text_callback
+        self._user_prompt = user_prompt if user_prompt is not None else self.DEFAULT_USER_PROMPT
+
+    def chat(self, prompt: str) -> Generator[str, None, None]:
+        print("LLM RESPONSE:")
+        for token in self._chat(prompt=prompt):
+            if token is not None:
+                if self._synthesize_text_callback is not None:
+                    self._synthesize_text_callback(token)
+                print(token, end="", flush=True)
+            yield token
+
+    def _chat(self, prompt: str) -> Generator[str, None, None]:
+        raise NotImplementedError(
+            f"Method `chat_stream` must be implemented in a subclass of {self.__class__.__name__}")
+
+    def user_prompt(self) -> str:
+        return input(self._user_prompt)
+
+    @classmethod
+    def create(cls, llm_type: LLMs, **kwargs) -> 'LLM':
+        classes = {
+            LLMs.DUMMY: DummyLLM,
+            LLMs.OPENAI: OpenAILLM,
+        }
+
+        if llm_type not in classes:
+            raise NotImplementedError(f"Cannot create {cls.__name__} of type `{llm_type.value}`")
+
+        return classes[llm_type](**kwargs)
+
+
+class DummyLLM(LLM):
+    USER_PROMPT = "Press ENTER to generate a demo LLM response\n"
+
+    def __init__(self, tokens_per_second: int = 8, **kwargs: Any) -> None:
+        super().__init__(user_prompt=self.USER_PROMPT, **kwargs)
+
+        self._encoder = tiktoken.encoding_for_model("gpt-4")
+        self._tokens_delay = 1 / tokens_per_second
+
+        data_file_path = os.path.join(os.path.dirname(__file__), "../../resources/demo/demo_data.json")
+        with open(data_file_path, encoding="utf8") as data_file:
+            demo_data = json.loads(data_file.read())
+        self._sentences = demo_data["demo_sentences"]
+
+    def _tokenize(self, text: str) -> Sequence[str]:
+        tokens = [self._encoder.decode([i]) for i in self._encoder.encode(text)]
+        return tokens
+
+    def _chat(self, prompt: str) -> Generator[str, None, None]:
+        try:
+            text_index = int(prompt)
+            sentence = self._sentences[text_index]
+        except ValueError:
+            sentence = self._sentences[random.randint(0, len(self._sentences) - 1)]
+
+        for i in self._tokenize(text=sentence):
+            time.sleep(self._tokens_delay)
+            yield i
+
+
+class OpenAILLM(LLM):
+    def __init__(
+            self,
+            access_key: str,
+            model_name: str = "gpt-3.5-turbo",
+            **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+
+        from openai import OpenAI
+        self._model_name = model_name
+        self._client = OpenAI(api_key=access_key)
+
+        self._message_stack = [{"role": "system", "content": self.SYSTEM_PROMPT}]
+
+    def _append_user_message(self, message: str) -> None:
+        self._message_stack.append({"role": "user", "content": message})
+
+    def _append_assistant_message(self, message: str) -> None:
+        self._message_stack.append({"role": "assistant", "content": message})
+
+    def _chat(self, prompt: str) -> Generator[str, None, None]:
+        self._append_user_message(prompt)
+        stream = self._client.chat.completions.create(
+            model=self._model_name,
+            messages=self._message_stack,
+            stream=True)
+        assistant_message = ""
+        for chunk in stream:
+            token = chunk.choices[0].delta.content
+            yield token
+            if token is not None:
+                assistant_message += token
+        self._append_assistant_message(message=assistant_message)
+
+
+class Synthesizers(Enum):
+    OPENAI = "openai"
+    PICOVOICE_ORCA = "picovoice_orca"
+    PICOVOICE_ORCA_STREAMING = "picovoice_orca_streaming"
+
+
+class Synthesizer:
+    def __init__(
+            self,
+            samplerate: int,
+            play_audio_callback: Callable,
+            timestamps: Timestamps,
+            input_streamable: bool = False,
+    ) -> None:
+        self.samplerate = samplerate
+        self.input_streamable = input_streamable
+
+        self._play_audio_callback = play_audio_callback
+        self._timestamps = timestamps
+
+    def synthesize(self, text: str, **kwargs: Any) -> None:
+        self._timestamps.time_first_synthesis_request = time.time()
+        pcm = self._synthesize(text=text, **kwargs)
+        self._timestamps.time_first_audio = time.time()
+        self._play_audio_callback(pcm)
+
+    def flush(self) -> None:
+        pass
+
+    def reset(self) -> None:
+        pass
+
+    def start(self) -> None:
+        pass
+
+    def wait(self) -> None:
+        pass
+
+    def wait_and_terminate(self) -> None:
+        pass
+
+    def delete(self) -> None:
+        pass
+
+    def _synthesize(self, text: str, **kwargs: Any) -> NDArray:
+        raise NotImplementedError(
+            f"Method `_synthesize` must be implemented in a subclass of {self.__class__.__name__}")
+
+    @classmethod
+    def create(cls, engine: Union[str, Synthesizers], **kwargs: Any) -> 'Synthesizer':
+        classes = {
+            Synthesizers.PICOVOICE_ORCA: PicovoiceOrcaSynthesizer,
+            Synthesizers.PICOVOICE_ORCA_STREAMING: PicovoiceOrcaStreamingSynthesizer,
+            Synthesizers.OPENAI: OpenAISynthesizer,
+        }
+
+        if engine not in classes:
+            raise NotImplementedError(f"Cannot create {cls.__name__} of type `{engine.value}`")
+
+        return classes[engine](**kwargs)
+
+
+class OpenAISynthesizer(Synthesizer):
+    pass
+
+
+class PicovoiceOrcaSynthesizer(Synthesizer):
+    def __init__(
+            self,
+            access_key: str,
+            model_path: Optional[str] = None,
+            library_path: Optional[str] = None,
+            **kwargs: Any,
+    ) -> None:
+        self._orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path)
+
+        super().__init__(samplerate=self._orca.sample_rate, **kwargs)
+
+    @staticmethod
+    def _clean_text(text: str) -> str:
+        text = text.replace("\n", " ")
+        text = text.replace("\r", " ")
+        text = text.replace("\t", " ")
+        return text
+
+    def _synthesize(self, text: str, **kwargs: Any) -> Sequence[int]:
+        cleaned_text = self._clean_text(text)
+        return self._orca.synthesize(text=cleaned_text)[0]
+
+    def delete(self) -> None:
+        self._orca.delete()
+
+
+class PicovoiceOrcaStreamingSynthesizer(Synthesizer):
+    NUM_TOKENS_PER_PCM_CHUNK = 8
+
+    @dataclass
+    class OrcaTextInput:
+        text: str
+        flush: bool
+
+    def __init__(
+            self,
+            play_audio_callback: Callable,
+            timestamps: Timestamps,
+            access_key: str,
+            model_path: Optional[str] = None,
+            library_path: Optional[str] = None,
+    ) -> None:
+        self._orca = pvorca.create(access_key=access_key, model_path=model_path, library_path=library_path)
+        super().__init__(
+            samplerate=self._orca.sample_rate,
+            play_audio_callback=play_audio_callback,
+            timestamps=timestamps,
+            input_streamable=True)
+
+        self._orca_stream = self._orca.open_stream()
+        self._sample_rate = self._orca.sample_rate
+
+        self._queue: Queue[Optional[PicovoiceOrcaStreamingSynthesizer.OrcaTextInput]] = Queue()
+        self._play_audio_callback = play_audio_callback
+
+        self._thread = threading.Thread(target=self._run)
+
+        self._timestamps = timestamps
+        self._total_processing_time = 0.0
+
+        self._num_tokens = 0
+        self._first_token = True
+
+    def _compute_first_audio_delay(self, pcm: Sequence[int], processing_time: float) -> float:
+        seconds_audio = len(pcm) / self._sample_rate
+        tokens_per_sec = self._num_tokens / (time.time() - self._timestamps.time_first_synthesis_request)
+        time_delay = \
+            max(((self.NUM_TOKENS_PER_PCM_CHUNK / (
+                    tokens_per_sec + 1e-4)) - seconds_audio) + processing_time,
+                0)
+        return time_delay
+
+    def _run(self) -> None:
+        while True:
+            orca_input = self._queue.get()
+            if orca_input is None:
+                break
+
+            if self._first_token:
+                self._timestamps.time_first_synthesis_request = time.time()
+                self._first_token = False
+
+            self._num_tokens += 1
+
+            start = time.time()
+            try:
+                if not orca_input.flush:
+                    pcm = self._orca_stream.synthesize(orca_input.text)
+                else:
+                    pcm = self._orca_stream.flush()
+            except OrcaInvalidArgumentError as e:
+                print(f"Orca could not synthesize text input `{orca_input.text}`: `{e}`")
+                continue
+
+            processing_time = time.time() - start
+            self._total_processing_time += processing_time
+
+            if len(pcm) > 0:
+                if self._timestamps.time_first_audio < 0.0:
+                    self._timestamps.time_first_audio = time.time()
+
+                    self._timestamps.initial_audio_delay = \
+                        self._compute_first_audio_delay(pcm=pcm, processing_time=processing_time)
+                    time.sleep(self._timestamps.initial_audio_delay)
+
+                self._play_audio_callback(pcm)
+
+    @property
+    def total_processing_time(self) -> float:
+        return self._total_processing_time
+
+    def synthesize(self, text: str, **kwargs: Any) -> None:
+        self._queue.put_nowait(self.OrcaTextInput(text=text, flush=False))
+
+    def flush(self) -> None:
+        self._queue.put_nowait(self.OrcaTextInput(text="", flush=True))
+
+    def start(self) -> None:
+        self._thread.start()
+
+    def reset(self) -> None:
+        self._num_tokens = 0
+        self._first_token = True
+
+    def wait_and_terminate(self) -> None:
+        self.wait()
+        self.terminate()
+
+    def wait(self):
+        while not self._queue.empty():
+            time.sleep(0.1)
+
+    def terminate(self):
+        self._queue.put_nowait(None)
+        self._thread.join()
+        self._orca_stream.close()
+
+    def delete(self) -> None:
+        self._orca.delete()
+
+
+class StreamingAudioOutput:
+    def __init__(self, device_info: dict) -> None:
+        self._device_info = device_info
+        self._queue: Queue[NDArray] = Queue()
+
+        self._buffer = None
+        self._stream = None
+        self._sample_rate = None
+        self._blocksize = None
+
+    def _callback(self, outdata: NDArray, frames: int, time: Any, status: Any) -> None:
+        if self._queue.empty():
+            outdata[:] = 0
+            return
+        data = self._queue.get()
+        outdata[:, 0] = data
+
+    def set_sample_rate(self, sample_rate: int) -> None:
+        self._sample_rate = sample_rate
+
+    def play(self, pcm_chunk: Sequence[int]) -> None:
+        if self._stream is None:
+            raise ValueError("Stream is not started. Call `start` method first.")
+
+        pcm_chunk = np.array(pcm_chunk, dtype=np.int16)
+
+        if self._buffer is not None:
+            pcm_chunk = np.concatenate([self._buffer, pcm_chunk])
+            self._buffer = None
+
+        length = pcm_chunk.shape[0]
+        for index_block in range(0, length, self._blocksize):
+            if (length - index_block) < self._blocksize:
+                self._buffer = pcm_chunk[index_block: index_block + (length - index_block)]
+            else:
+                self._queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize])
+
+    def start(self, sample_rate: int) -> None:
+        self._sample_rate = sample_rate
+        self._blocksize = self._sample_rate // 20
+        self._stream = sd.OutputStream(
+            channels=1,
+            samplerate=self._sample_rate,
+            dtype=np.int16,
+            device=int(self._device_info["index"]),
+            callback=self._callback,
+            blocksize=self._blocksize)
+        self._stream.start()
+
+    def wait_and_terminate(self) -> None:
+        self.wait()
+        self.terminate()
+
+    def wait(self):
+        if self._buffer is not None:
+            chunk = np.zeros(self._blocksize, dtype=np.int16)
+            chunk[:self._buffer.shape[0]] = self._buffer
+            self._queue.put_nowait(chunk)
+
+        time_interval = self._blocksize / self._sample_rate
+        while not self._queue.empty():
+            time.sleep(time_interval)
+
+        time.sleep(time_interval)
+
+    def terminate(self):
+        self._stream.stop()
+        self._stream.close()
+
+    @classmethod
+    def from_default_device(cls):
+        return cls(device_info=sd.query_devices(kind="output"))
+
+
 __all__ = [
     "get_text_generator",
+    "LLMs",
+    "LLM",
+    "StreamingAudioOutput",
+    "Synthesizers",
+    "Synthesizer",
+    "Timestamps",
 ]
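`StreamingAudioOutput.play` carries a partial block between calls so that the `sounddevice` callback always receives fixed-size chunks. A minimal pure-Python sketch of that buffering logic, operating on lists instead of NumPy arrays (the function name is illustrative, not from the repo):

```python
from typing import List, Optional, Sequence, Tuple


def split_into_blocks(
        samples: Sequence[int],
        blocksize: int,
        leftover: Optional[List[int]] = None) -> Tuple[List[List[int]], List[int]]:
    # Prepend any leftover samples from the previous call, emit as many
    # full blocks as possible, and return the remainder to carry forward.
    data = (leftover or []) + list(samples)
    n_full = (len(data) // blocksize) * blocksize
    blocks = [data[i:i + blocksize] for i in range(0, n_full, blocksize)]
    return blocks, data[n_full:]
```

Each call returns complete blocks ready for the audio callback plus the tail to pass back in on the next call, which is the same carry-over pattern the class implements with `self._buffer`.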
14 changes: 10 additions & 4 deletions demo/python/orca_demo.py
@@ -11,6 +11,7 @@

 import argparse
 import struct
+import time
 import wave

 from pvorca import create, OrcaActivationLimitError
@@ -49,20 +50,25 @@ def main():
     orca = create(access_key=args.access_key, model_path=args.model_path, library_path=args.library_path)

     try:
-        print('Orca version: %s' % orca.version)
+        print(f"Orca version: {orca.version}")
+        start = time.time()
         pcm, alignment = orca.synthesize(args.text)
+        processing_time = time.time() - start
         length_sec = len(pcm) / orca.sample_rate
         with wave.open(args.output_path, 'wb') as output_file:
             output_file.setnchannels(1)
             output_file.setsampwidth(2)
             output_file.setframerate(orca.sample_rate)
             output_file.writeframes(struct.pack('%dh' % len(pcm), *pcm))
-        print('%.2f seconds of audio were written to `%s`.' % (length_sec, args.output_path))
+        print(
+            f"Orca took {processing_time:.2f} seconds to synthesize {length_sec:.2f} seconds of speech which is "
+            f"~{length_sec / processing_time:.0f} times faster than real-time.")
+        print(f"Audio written to `{args.output_path}`.")
     except OrcaActivationLimitError:
-        print('AccessKey has reached its processing limit')
+        print("AccessKey has reached its processing limit")
     finally:
         orca.delete()


-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
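The new timing code above reports how much faster than real-time synthesis ran: seconds of audio produced divided by seconds of compute. A small standalone sketch of that message (the helper name is illustrative):

```python
def synthesis_summary(processing_time: float, length_sec: float) -> str:
    # Real-time factor: seconds of audio produced per second of compute,
    # formatted the same way as the demo's printout.
    return (f"Orca took {processing_time:.2f} seconds to synthesize {length_sec:.2f} seconds "
            f"of speech which is ~{length_sec / processing_time:.0f} times faster than real-time.")
```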
2 changes: 2 additions & 0 deletions demo/python/requirements.txt
@@ -1,3 +1,5 @@
 ../../binding/python/dist/pvorca-0.1.4-py3-none-any.whl
 pvorca==0.1.4
 sounddevice
+tiktoken
+openai
327 changes: 96 additions & 231 deletions demo/python/streaming_orca_demo.py
@@ -10,278 +10,143 @@
 #

 import argparse
-import contextlib
-import sys
 import time
-import threading
-import queue
-from dataclasses import dataclass
-from typing import *
-
-import numpy as np
-import sounddevice as sd
-from numpy.typing import NDArray
-
-from pvorca import create, Orca
-
-from demo_util import get_text_generator
-
-
-@dataclass
-class TimestampDeltas:
-    time_to_first_llm_token: float = -1.0
-    time_to_last_llm_token: float = -1.0
-    time_to_first_audio: float = -1.0
-
-
-class StreamingAudioOutput:
-    def __init__(
-            self,
-            device_info: dict,
-            sample_rate: int,
-    ) -> None:
-        self._sample_rate = sample_rate
-
-        self._blocksize = self._sample_rate // 20
-
-        self.queue: queue.Queue[NDArray] = queue.Queue()
-        self._buffer = None
-
-        self.stream = sd.OutputStream(
-            channels=1,
-            samplerate=self._sample_rate,
-            dtype=np.int16,
-            device=int(device_info["index"]),
-            callback=self._callback,
-            blocksize=self._blocksize)
-
-    def _callback(self, outdata: NDArray, frames: int, time: Any, status: Any) -> None:
-        if self.queue.empty():
-            outdata[:] = 0
-            return
-        data = self.queue.get()
-        outdata[:, 0] = data
-
-    def play(self, pcm_chunk: Sequence[int]) -> None:
-        pcm_chunk = np.array(pcm_chunk, dtype=np.int16)
-
-        if self._buffer is not None:
-            pcm_chunk = np.concatenate([self._buffer, pcm_chunk])
-            self._buffer = None
-
-        length = pcm_chunk.shape[0]
-        for index_block in range(0, length, self._blocksize):
-            if (length - index_block) < self._blocksize:
-                self._buffer = pcm_chunk[index_block: index_block + (length - index_block)]
-            else:
-                self.queue.put_nowait(pcm_chunk[index_block: index_block + self._blocksize])
-
-    def start(self) -> None:
-        self.stream.start()
-
-    def wait_and_terminate(self) -> None:
-        self.wait()
-        self.terminate()
-
-    def wait(self):
-        if self._buffer is not None:
-            chunk = np.zeros(self._blocksize, dtype=np.int16)
-            chunk[:self._buffer.shape[0]] = self._buffer
-            self.queue.put_nowait(chunk)
-
-        time_interval = self._blocksize / self._sample_rate
-        while not self.queue.empty():
-            time.sleep(time_interval)
-
-        time.sleep(time_interval)
-        self.stream.stop()
-
-    def terminate(self):
-        self.stream.close()
-
-    @classmethod
-    def from_default_device(cls, **kwargs):
-        return cls(sd.query_devices(kind="output"), **kwargs)
-
-
-class StreamingOrcaThread:
-    NUM_TOKENS_PER_PCM_CHUNK = 6
-
-    @dataclass
-    class OrcaTextInput:
-        text: str
-        flush: bool
-
-    def __init__(self, orca: Orca, play_audio_callback: Callable) -> None:
-        self._orca_stream = orca.open_stream()
-        self._sample_rate = orca.sample_rate
-
-        self._queue: queue.Queue[Optional[StreamingOrcaThread.OrcaTextInput]] = queue.Queue()
-        self._play_audio_callback = play_audio_callback
-
-        self._thread = threading.Thread(target=self._run)
-
-        self._timestamp_first_audio = -1.0
-        self._timestamp_start_text_stream = 0.0
-        self._first_audio_time_delay = 0.0
-
-        self._total_processing_time = 0.0
-
-        self._num_tokens = 0
-        self._first_token = True
-
-    def _compute_first_audio_delay(self, pcm: Sequence[int]) -> float:
-        seconds_audio = len(pcm) / self._sample_rate
-        tokens_per_sec = self._num_tokens / (time.time() - self._timestamp_start_text_stream)
-        time_delay = \
-            max(((self.NUM_TOKENS_PER_PCM_CHUNK / tokens_per_sec) - seconds_audio) + self._total_processing_time, 0)
-        return time_delay
-
-    def _run(self) -> None:
-        while True:
-            orca_input = self._queue.get()
-            if orca_input is None:
-                break
-
-            if self._first_token:
-                self._timestamp_start_text_stream = time.time()
-                self._first_token = False
-
-            self._num_tokens += 1
-            start = time.time()
-            if not orca_input.flush:
-                pcm = self._orca_stream.synthesize(orca_input.text)
-            else:
-                pcm = self._orca_stream.flush()
-            self._total_processing_time += time.time() - start
-
-            if len(pcm) > 0:
-                if self._timestamp_first_audio < 0.0:
-                    self._timestamp_first_audio = time.time()
-
-                    self._first_audio_time_delay = self._compute_first_audio_delay(pcm)
-                    time.sleep(self._first_audio_time_delay)
-
-                self._play_audio_callback(pcm)
-
-    @property
-    def timestamp_first_audio(self) -> float:
-        return self._timestamp_first_audio
-
-    @property
-    def total_processing_time(self) -> float:
-        return self._total_processing_time
-
-    @property
-    def first_audio_time_delay(self) -> float:
-        return self._first_audio_time_delay
-
-    def synthesize(self, text: str) -> None:
-        self._queue.put_nowait(self.OrcaTextInput(text=text, flush=False))
-
-    def flush(self) -> None:
-        self._queue.put_nowait(self.OrcaTextInput(text="", flush=True))
-
-    def start(self) -> None:
-        self._thread.start()
-
-    def wait_and_terminate(self) -> None:
-        self.wait()
-        self.terminate()
-
-    def wait(self):
-        while not self._queue.empty():
-            time.sleep(0.1)
-
-    def terminate(self):
-        self._queue.put_nowait(None)
-        self._thread.join()
-        self._orca_stream.close()
-
-
-def main(args: argparse.Namespace) -> None:
-    access_key = args.access_key
-    model_path = args.model_path
-    library_path = args.library_path
-    tokens_per_second = args.tokens_per_second
-
-    orca = create(access_key=access_key, model_path=model_path, library_path=library_path)
-
-    output_device = StreamingAudioOutput.from_default_device(sample_rate=orca.sample_rate)
-
-    orca_thread = StreamingOrcaThread(orca=orca, play_audio_callback=output_device.play)
-
-    with contextlib.redirect_stdout(None):  # to avoid stdouts from ALSA lib
-        orca_thread.start()
-        output_device.start()
-        time.sleep(0.5)
-
-    def terminate():
-        orca_thread.wait_and_terminate()
-        output_device.wait_and_terminate()
-        orca.delete()
-        sys.exit()
-
-    try:
-        res = input("Press ENTER to run a demo LLM response (CTRL+C to exit)\n")
-        if res != "":
-            terminate()
-    except KeyboardInterrupt:
-        terminate()
-
-    generator = get_text_generator(token_delay=1 / tokens_per_second)
-    timestamp_deltas = TimestampDeltas()
-
-    text = ""
-    num_tokens = 0
-    start = time.time()
-    while True:
-        try:
-            token = next(generator)
-            timestamp_deltas.time_to_first_llm_token = time.time() - start
-        except StopIteration:
-            timestamp_deltas.time_to_last_llm_token = time.time() - start
-            orca_thread.synthesize(text=text)
-            orca_thread.flush()
-            break
-
-        orca_thread.synthesize(text=token)
-        print(token, end="", flush=True)
-        num_tokens += 1
-
-    end = time.time()
-
-    timestamp_deltas.time_to_first_audio = orca_thread.timestamp_first_audio - start
-
-    print("\n(waiting for audio to finish ...)")
-
-    orca_thread.wait_and_terminate()
-    output_device.wait_and_terminate()
-
-    print(f"\nImitated tokens / second: ~{num_tokens / (end - start):.0f}")
-    print(f"Time to generate text: {timestamp_deltas.time_to_last_llm_token:.2f} seconds")
-    print(f"Time to first audio: {timestamp_deltas.time_to_first_audio:.2f} seconds", end="")
-    if orca_thread.first_audio_time_delay > 0:
-        print(
-            f" (+ applied initial delay of `{orca_thread.first_audio_time_delay:.2f} seconds` "
-            "to ensure continuous audio)")
-    else:
-        print()
-
-    orca.delete()
+from demo_util import *
+
+
+def get_llm_init_kwargs(args: argparse.Namespace) -> dict:
+    kwargs = dict()
+    llm_type = LLMs(args.llm)
+    if llm_type is LLMs.OPENAI:
+        if not args.openai_access_key:
+            raise ValueError(
+                f"An OpenAI access key is required when using OpenAI models. Specify with `--openai-access-key`.")
+
+        kwargs["access_key"] = args.openai_access_key
+    elif llm_type is LLMs.DUMMY:
+        kwargs["tokens_per_second"] = args.tokens_per_second
+
+    return kwargs
+
+
+def get_synthesizer_init_kwargs(args: argparse.Namespace) -> dict:
+    kwargs = dict()
+    synthesizer_type = Synthesizers(args.synthesizer)
+    if synthesizer_type is Synthesizers.PICOVOICE_ORCA or synthesizer_type is Synthesizers.PICOVOICE_ORCA_STREAMING:
+        if not args.picovoice_access_key:
+            raise ValueError("Picovoice access key is required when using Picovoice TTS")
+
+        kwargs["access_key"] = args.picovoice_access_key
+        kwargs["model_path"] = args.picovoice_model_path
+        kwargs["library_path"] = args.picovoice_library_path
+
+    return kwargs
+
+
+def main(args: argparse.Namespace) -> None:
+    llm_type = LLMs(args.llm)
+    synthesizer_type = Synthesizers(args.synthesizer)
+
+    timestamps = Timestamps()
+
+    audio_output = StreamingAudioOutput.from_default_device()
+
+    synthesizer_init_kwargs = get_synthesizer_init_kwargs(args)
+    synthesizer = Synthesizer.create(
+        synthesizer_type,
+        play_audio_callback=audio_output.play,
+        timestamps=timestamps,
+        **synthesizer_init_kwargs)
+
+    llm_init_kwargs = get_llm_init_kwargs(args)
+    synthesize_text_callback = synthesizer.synthesize if synthesizer.input_streamable else None
+    llm = LLM.create(llm_type, synthesize_text_callback=synthesize_text_callback, **llm_init_kwargs)
+
+    if synthesizer.input_streamable:
+        synthesizer.start()
+    audio_output.start(sample_rate=synthesizer.samplerate)
+
+    print(f"Picovoice Orca Streaming Demo")
+    print("The following let's you chat with an LLM model using Orca for TTS. Press Ctrl+C to exit.\n")
+
+    try:
+        while True:
+            text = llm.user_prompt()
+            generator = llm.chat(text)
+
+            llm_message = ""
+            num_tokens = 0
+            while True:
+                try:
+                    if timestamps.time_llm_request < 0:
+                        timestamps.time_llm_request = time.time()
+                    token = next(generator)
+                    if timestamps.time_first_llm_token < 0:
+                        timestamps.time_first_llm_token = time.time()
+
+                    if token is not None:
+                        llm_message += token
+                except StopIteration:
+                    print(" (waiting for audio ...)", flush=True)
+                    timestamps.time_last_llm_token = time.time()
+
+                    if synthesizer.input_streamable:
+                        synthesizer.flush()
+                        synthesizer.wait()
+                    else:
+                        synthesizer.synthesize(llm_message)
+
+                    audio_output.wait()
+
+                    if synthesizer.input_streamable:
+                        synthesizer.reset()
+
+                    break
+
+                num_tokens += 1
+
+            timestamps.pretty_print_diffs(num_tokens=num_tokens)
+            timestamps.reset()
+    except KeyboardInterrupt:
+        pass
+
+    if synthesizer.input_streamable:
+        synthesizer.wait_and_terminate()
+
+    audio_output.wait_and_terminate()
+
+    # TODO:
+    # Give final stats (TTS delay)
+
+    synthesizer.delete()


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Text-to-speech streaming synthesis")
-    parser.add_argument("--access_key", "-a", required=True, help="AccessKey obtained from Picovoice Console")
-    parser.add_argument("--model_path", "-m", help="Path to the model parameters file")
-    parser.add_argument("--library_path", "-l", help="Path to Orca's dynamic library")
+    parser.add_argument(
+        "--llm",
+        default=LLMs.DUMMY.value,
+        choices=[l.value for l in LLMs],
+        help="Choose LLM to use")
+    parser.add_argument(
+        "--openai-access-key",
+        help="Open AI access key. Needed when using openai models")
     parser.add_argument(
         "--tokens-per-second",
         "-t",
-        default=8,
-        type=float,
+        default=10,
+        type=int,
         help="Imitated tokens per second")
+    parser.add_argument(
+        "--synthesizer",
+        default=Synthesizers.PICOVOICE_ORCA.value,
+        choices=[s.value for s in Synthesizers],
+        help="Choose voice synthesizer to use")
+    parser.add_argument("--picovoice-access-key", "-a", help="AccessKey obtained from Picovoice Console")
+    parser.add_argument("--picovoice-model-path", "-m", help="Path to the model parameters file")
+    parser.add_argument("--picovoice-library-path", "-l", help="Path to Orca's dynamic library")

     main(parser.parse_args())
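`_compute_first_audio_delay` in `demo_util.py` holds back the first audio chunk just long enough that later chunks can be synthesized before playback catches up, trading a small startup delay for gap-free audio. The same formula extracted as a standalone sketch (parameter names are illustrative; the `1e-4` guard against division by zero matches the diff above):

```python
def first_audio_delay(
        num_tokens_per_chunk: int,
        tokens_per_sec: float,
        seconds_audio: float,
        processing_time: float) -> float:
    # Time to produce the next chunk's worth of text, minus the audio we
    # already have to play, plus synthesis overhead; clamped at zero so a
    # fast token stream adds no delay.
    return max(
        (num_tokens_per_chunk / (tokens_per_sec + 1e-4)) - seconds_audio + processing_time,
        0)
```

With slow token generation the delay is positive (playback is postponed); once tokens arrive faster than the audio they produce, the expression goes negative and is clamped to zero.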