From bfbd1c8321cf59fdbd47ce7aeb1afd20ce34c553 Mon Sep 17 00:00:00 2001
From: rjheeta <rjheeta@gmail.com>
Date: Thu, 27 Jun 2024 08:42:10 -0400
Subject: [PATCH 1/4] Upgrade cartesia SDK from 0.1.1 to 1.0.3

---
 poetry.lock                                   | 12 ++---
 pyproject.toml                                |  2 +-
 vocode/streaming/models/synthesizer.py        |  2 +-
 .../synthesizer/cartesia_synthesizer.py       | 49 ++++++++++++-------
 4 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 28a3e98fa..c68d3b6b2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -300,13 +300,13 @@ files = [
 
 [[package]]
 name = "cartesia"
-version = "0.1.1"
+version = "1.0.3"
 description = "The official Python library for the Cartesia API."
-optional = true
+optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "cartesia-0.1.1-py2.py3-none-any.whl", hash = "sha256:7a7365f17e220247ee2af1efdb88e69b0aa332e390c85775bf356b5e7b882498"},
-    {file = "cartesia-0.1.1.tar.gz", hash = "sha256:c584770f4698e6dc826a75b7b5fd39bfce749c88ad9786dca46edd9527710002"},
+    {file = "cartesia-1.0.3-py2.py3-none-any.whl", hash = "sha256:d680c197361507faf11e8ed99a30a0d6ece682298ea306f41a66a0195c08ae37"},
+    {file = "cartesia-1.0.3.tar.gz", hash = "sha256:446e7bea274e71c95f790d1efdc4b04a6eec1747f3ae5cc48a4fd68985d0aafc"},
 ]
 
 [package.dependencies]
@@ -4678,4 +4678,4 @@ transcribers = ["google-cloud-speech"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4.0"
-content-hash = "e56040c6bd76616232f8ef2b371771ee098c6a12fd57228c0b0ab2a0d24e3e58"
+content-hash = "5959a95fd3aa9f446c3a8d43f5fed96f83f46d0b017ec2fe133e88956f0ae475"
diff --git a/pyproject.toml b/pyproject.toml
index 3466af1b0..a5d360f7f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,7 @@ groq = { version = "^0.9.0", optional = true }
 # Synthesizers
 google-cloud-texttospeech = { version = "^2.16.3", optional = true }
 pvkoala = { version = "^2.0.1", optional = true }
-cartesia = { version = "^0.1.1", optional = true }
+cartesia = "^1.0.3"
 
 # Transcribers
 google-cloud-speech = { version = "^2.26.0", optional = true }
diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py
index f1c8c01cb..eeb6c3133 100644
--- a/vocode/streaming/models/synthesizer.py
+++ b/vocode/streaming/models/synthesizer.py
@@ -228,7 +228,7 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value
     sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE
 
 
-DEFAULT_CARTESIA_MODEL_ID = "upbeat-moon"
+DEFAULT_CARTESIA_MODEL_ID = "sonic-english"
 DEFAULT_CARTESIA_VOICE_ID = "5345cf08-6f37-424d-a5d9-8ae1101b9377"
 
 
diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py
index 9e1392e6a..64e9629a8 100644
--- a/vocode/streaming/synthesizer/cartesia_synthesizer.py
+++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py
@@ -19,31 +19,39 @@ def __init__(
 
         # Lazy import the cartesia module
         try:
-            from cartesia.tts import AsyncCartesiaTTS
+            from cartesia import AsyncCartesia
         except ImportError as e:
             raise ImportError(
                 f"Missing required dependancies for CartesiaSynthesizer"
             ) from e
         
-        self.cartesia_tts = AsyncCartesiaTTS
-        
         self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
         if not self.api_key:
             raise ValueError("Missing Cartesia API key")
         
+        self.cartesia_tts = AsyncCartesia
 
         if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
             self.channel_width = 2
             match synthesizer_config.sampling_rate:
                 case SamplingRate.RATE_44100:
-                    self.sampling_rate = 44100
-                    self.output_format = "pcm_44100"
+                    self.output_format = {
+                        "sample_rate": 44100,
+                        "encoding": "pcm_s16le",
+                        "container": "raw",
+                    }
                 case SamplingRate.RATE_22050:
-                    self.sampling_rate = 22050
-                    self.output_format = "pcm_22050"
+                    self.output_format = {
+                        "sample_rate": 22050,
+                        "encoding": "pcm_s16le",
+                        "container": "raw",
+                    }
                 case SamplingRate.RATE_16000:
-                    self.sampling_rate = 16000
-                    self.output_format = "pcm_16000"
+                    self.output_format = {
+                        "sample_rate": 16000,
+                        "encoding": "pcm_s16le",
+                        "container": "raw",
+                    }
                 case _:
                     raise ValueError(
                         f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}"
@@ -52,20 +60,26 @@ def __init__(
             # Cartesia has issues with MuLaw/8000. Use pcm/16000 and
             # create_synthesis_result_from_wav will handle the conversion to mulaw/8000
             self.channel_width = 2
-            self.output_format = "pcm_16000"
-            self.sampling_rate = 16000
+            self.output_format = {
+                "sample_rate": 16000,
+                "encoding": "pcm_s16le",
+                "container": "raw",
+            }
         else:
             raise ValueError(
                 f"Unsupported audio encoding {synthesizer_config.audio_encoding}"
             )
 
+        if not isinstance(self.output_format["sample_rate"], int):
+            raise ValueError(
+                f"Invalid type for sample_rate: {type(self.output_format["sample_rate"])}"
+            )
+        self.sampling_rate = self.output_format["sample_rate"]
         self.num_channels = 1
         self.model_id = synthesizer_config.model_id
         self.voice_id = synthesizer_config.voice_id
         self.client = self.cartesia_tts(api_key=self.api_key)
-        self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id)
         
-
     async def create_speech_uncached(
         self,
         message: BaseMessage,
@@ -73,12 +87,11 @@ async def create_speech_uncached(
         is_first_text_chunk: bool = False,
         is_sole_text_chunk: bool = False,
     ) -> SynthesisResult:
-        generator = await self.client.generate(
+        generator = await self.client.tts.sse(
+            model_id=self.model_id,
             transcript=message.text,
-            voice=self.voice_embedding,
+            voice_id=self.voice_id,
             stream=True,
-            model_id=self.model_id,
-            data_rtype='bytes',
             output_format=self.output_format
         )
 
@@ -86,7 +99,7 @@ async def create_speech_uncached(
         with wave.open(audio_file, 'wb') as wav_file:
             wav_file.setnchannels(self.num_channels)
             wav_file.setsampwidth(self.channel_width)
-            wav_file.setframerate(self.sampling_rate)
+            wav_file.setframerate(float(self.sampling_rate))
             async for chunk in generator:
                 wav_file.writeframes(chunk['audio'])
         audio_file.seek(0)

From 5fcb20a7598ea21a7b1fa2dc8da8e7268069d55e Mon Sep 17 00:00:00 2001
From: rjheeta <rjheeta@gmail.com>
Date: Thu, 27 Jun 2024 08:46:09 -0400
Subject: [PATCH 2/4] Simplify sample_rate error message to fix lint failure

---
 vocode/streaming/synthesizer/cartesia_synthesizer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py
index 64e9629a8..ec1a5d350 100644
--- a/vocode/streaming/synthesizer/cartesia_synthesizer.py
+++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py
@@ -71,9 +71,7 @@ def __init__(
             )
 
         if not isinstance(self.output_format["sample_rate"], int):
-            raise ValueError(
-                f"Invalid type for sample_rate: {type(self.output_format["sample_rate"])}"
-            )
+            raise ValueError(f"Invalid type for sample_rate")
         self.sampling_rate = self.output_format["sample_rate"]
         self.num_channels = 1
         self.model_id = synthesizer_config.model_id

From abde4687c1f5e68d7c58e8c09eee63a3c06ba9b7 Mon Sep 17 00:00:00 2001
From: Ajay Raj <ajay.n.raj@gmail.com>
Date: Wed, 3 Jul 2024 11:24:24 -0700
Subject: [PATCH 3/4] Stream Cartesia audio in chunks instead of buffering WAV

---
 .../streaming/synthesizer/synthesize.py       |  5 +-
 .../streaming/synthesizer/base_synthesizer.py |  2 +-
 .../synthesizer/cartesia_synthesizer.py       | 67 +++++++++----------
 3 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/playground/streaming/synthesizer/synthesize.py b/playground/streaming/synthesizer/synthesize.py
index 086169772..1c92efe91 100644
--- a/playground/streaming/synthesizer/synthesize.py
+++ b/playground/streaming/synthesizer/synthesize.py
@@ -1,5 +1,6 @@
 import time
 
+from vocode.streaming.constants import PER_CHUNK_ALLOWANCE_SECONDS
 from vocode.streaming.models.message import BaseMessage
 from vocode.streaming.models.synthesizer import AzureSynthesizerConfig
 from vocode.streaming.output_device.base_output_device import BaseOutputDevice
@@ -42,7 +43,9 @@ async def speak(
                 end_time = time.time()
                 await asyncio.sleep(
                     max(
-                        speech_length_seconds - (end_time - start_time),
+                        speech_length_seconds
+                        - (end_time - start_time)
+                        - PER_CHUNK_ALLOWANCE_SECONDS,
                         0,
                     )
                 )
diff --git a/vocode/streaming/synthesizer/base_synthesizer.py b/vocode/streaming/synthesizer/base_synthesizer.py
index e3e86661f..ac3a41736 100644
--- a/vocode/streaming/synthesizer/base_synthesizer.py
+++ b/vocode/streaming/synthesizer/base_synthesizer.py
@@ -285,7 +285,7 @@ def get_message_cutoff_from_total_response_length(
 
     @staticmethod
     def get_message_cutoff_from_voice_speed(
-        message: BaseMessage, seconds: float, words_per_minute: int
+        message: BaseMessage, seconds: float, words_per_minute: int = 150
     ) -> str:
         words_per_second = words_per_minute / 60
         estimated_words_spoken = math.floor(words_per_second * seconds)
diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py
index ec1a5d350..d7b2ba961 100644
--- a/vocode/streaming/synthesizer/cartesia_synthesizer.py
+++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py
@@ -1,5 +1,3 @@
-import io
-import wave
 import hashlib
 
 from vocode import getenv
@@ -7,7 +5,6 @@
 from vocode.streaming.models.message import BaseMessage
 from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig
 from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult
-from vocode.streaming.utils.create_task import asyncio_create_task_with_done_error_log
 
 
 class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]):
@@ -21,18 +18,15 @@ def __init__(
         try:
             from cartesia import AsyncCartesia
         except ImportError as e:
-            raise ImportError(
-                f"Missing required dependancies for CartesiaSynthesizer"
-            ) from e
-        
+            raise ImportError(f"Missing required dependancies for CartesiaSynthesizer") from e
+
         self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
         if not self.api_key:
             raise ValueError("Missing Cartesia API key")
-        
+
         self.cartesia_tts = AsyncCartesia
 
         if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
-            self.channel_width = 2
             match synthesizer_config.sampling_rate:
                 case SamplingRate.RATE_44100:
                     self.output_format = {
@@ -52,23 +46,25 @@ def __init__(
                         "encoding": "pcm_s16le",
                         "container": "raw",
                     }
+                case SamplingRate.RATE_8000:
+                    self.output_format = {
+                        "sample_rate": 8000,
+                        "encoding": "pcm_s16le",
+                        "container": "raw",
+                    }
                 case _:
                     raise ValueError(
                         f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}"
                     )
         elif synthesizer_config.audio_encoding == AudioEncoding.MULAW:
-            # Cartesia has issues with MuLaw/8000. Use pcm/16000 and
-            # create_synthesis_result_from_wav will handle the conversion to mulaw/8000
             self.channel_width = 2
             self.output_format = {
-                "sample_rate": 16000,
-                "encoding": "pcm_s16le",
+                "sample_rate": 8000,
+                "encoding": "pcm_mulaw",
                 "container": "raw",
             }
         else:
-            raise ValueError(
-                f"Unsupported audio encoding {synthesizer_config.audio_encoding}"
-            )
+            raise ValueError(f"Unsupported audio encoding {synthesizer_config.audio_encoding}")
 
         if not isinstance(self.output_format["sample_rate"], int):
             raise ValueError(f"Invalid type for sample_rate")
@@ -77,7 +73,7 @@ def __init__(
         self.model_id = synthesizer_config.model_id
         self.voice_id = synthesizer_config.voice_id
         self.client = self.cartesia_tts(api_key=self.api_key)
-        
+
     async def create_speech_uncached(
         self,
         message: BaseMessage,
@@ -90,27 +86,28 @@ async def create_speech_uncached(
             transcript=message.text,
             voice_id=self.voice_id,
             stream=True,
-            output_format=self.output_format
+            output_format=self.output_format,
         )
 
-        audio_file = io.BytesIO()
-        with wave.open(audio_file, 'wb') as wav_file:
-            wav_file.setnchannels(self.num_channels)
-            wav_file.setsampwidth(self.channel_width)
-            wav_file.setframerate(float(self.sampling_rate))
-            async for chunk in generator:
-                wav_file.writeframes(chunk['audio'])
-        audio_file.seek(0)
+        async def chunk_generator(sse):
+            buffer = bytearray()
+            async for event in sse:
+                audio = event.get("audio")
+                buffer.extend(audio)
+                while len(buffer) >= chunk_size:
+                    yield SynthesisResult.ChunkResult(
+                        chunk=buffer[:chunk_size], is_last_chunk=False
+                    )
+                    buffer = buffer[chunk_size:]
+            yield SynthesisResult.ChunkResult(chunk=buffer, is_last_chunk=True)
 
-        result = self.create_synthesis_result_from_wav(
-            synthesizer_config=self.synthesizer_config,
-            file=audio_file,
-            message=message,
-            chunk_size=chunk_size,
+        return SynthesisResult(
+            chunk_generator=chunk_generator(generator),
+            get_message_up_to=lambda seconds: self.get_message_cutoff_from_voice_speed(
+                message, seconds
+            ),
         )
 
-        return result
-    
     @classmethod
     def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig):
         hashed_api_key = hashlib.sha256(f"{synthesizer_config.api_key}".encode("utf-8")).hexdigest()
@@ -120,6 +117,6 @@ def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig):
                 hashed_api_key,
                 str(synthesizer_config.voice_id),
                 str(synthesizer_config.model_id),
-                synthesizer_config.audio_encoding
+                synthesizer_config.audio_encoding,
             )
-        )
\ No newline at end of file
+        )

From c2c68e62b902cc183df2b3493e56be6c33e5ee53 Mon Sep 17 00:00:00 2001
From: Ajay Raj <ajay.n.raj@gmail.com>
Date: Wed, 3 Jul 2024 11:25:36 -0700
Subject: [PATCH 4/4] make cartesia optional

---
 poetry.lock    | 6 +++---
 pyproject.toml | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index c68d3b6b2..e12b4cd14 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -302,7 +302,7 @@ files = [
 name = "cartesia"
 version = "1.0.3"
 description = "The official Python library for the Cartesia API."
-optional = false
+optional = true
 python-versions = ">=3.8.0"
 files = [
     {file = "cartesia-1.0.3-py2.py3-none-any.whl", hash = "sha256:d680c197361507faf11e8ed99a30a0d6ece682298ea306f41a66a0195c08ae37"},
@@ -4678,4 +4678,4 @@ transcribers = ["google-cloud-speech"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<4.0"
-content-hash = "5959a95fd3aa9f446c3a8d43f5fed96f83f46d0b017ec2fe133e88956f0ae475"
+content-hash = "53dcd434eecc407c0af533faecc9b16e5574c4368e3c136e785819243baa28d3"
diff --git a/pyproject.toml b/pyproject.toml
index a5d360f7f..04732a58e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,7 +43,7 @@ groq = { version = "^0.9.0", optional = true }
 # Synthesizers
 google-cloud-texttospeech = { version = "^2.16.3", optional = true }
 pvkoala = { version = "^2.0.1", optional = true }
-cartesia = "^1.0.3"
+cartesia = { version = "^1.0.3", optional = true }
 
 # Transcribers
 google-cloud-speech = { version = "^2.26.0", optional = true }