vocodedev · ajar98 · Jul 3, 2024 · Jun 27, 2024 · Jun 27, 2024 · Jul 3, 2024
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,7 +43,7 @@ groq = { version = "^0.9.0", optional = true }
 # Synthesizers
 google-cloud-texttospeech = { version = "^2.16.3", optional = true }
 pvkoala = { version = "^2.0.1", optional = true }
-cartesia = { version = "^0.1.1", optional = true }
+cartesia = { version = "^1.0.3", optional = true }
 
 # Transcribers
 google-cloud-speech = { version = "^2.26.0", optional = true }

diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py
@@ -228,7 +228,7 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value
     sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE
 
 
-DEFAULT_CARTESIA_MODEL_ID = "upbeat-moon"
+DEFAULT_CARTESIA_MODEL_ID = "sonic-english"
 DEFAULT_CARTESIA_VOICE_ID = "5345cf08-6f37-424d-a5d9-8ae1101b9377"
 
 

diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py
@@ -19,31 +19,39 @@ def __init__(
 
         # Lazy import the cartesia module
         try:
-            from cartesia.tts import AsyncCartesiaTTS
+            from cartesia import AsyncCartesia
         except ImportError as e:
             raise ImportError(
                 f"Missing required dependancies for CartesiaSynthesizer"
             ) from e
 
-        self.cartesia_tts = AsyncCartesiaTTS
-
         self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY")
         if not self.api_key:
             raise ValueError("Missing Cartesia API key")
 
+        self.cartesia_tts = AsyncCartesia
 
         if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16:
             self.channel_width = 2
             match synthesizer_config.sampling_rate:
                 case SamplingRate.RATE_44100:
-                    self.sampling_rate = 44100
-                    self.output_format = "pcm_44100"
+                    self.output_format = {
+                        "sample_rate": 44100,
+                        "encoding": "pcm_s16le",
+                        "container": "raw",
+                    }
                 case SamplingRate.RATE_22050:
-                    self.sampling_rate = 22050
-                    self.output_format = "pcm_22050"
+                    self.output_format = {
+                        "sample_rate": 22050,
+                        "encoding": "pcm_s16le",
+                        "container": "raw",
+                    }
                 case SamplingRate.RATE_16000:
-                    self.sampling_rate = 16000
-                    self.output_format = "pcm_16000"
+                    self.output_format = {
+                        "sample_rate": 16000,
+                        "encoding": "pcm_s16le",
+                        "container": "raw",
+                    }
                 case _:
                     raise ValueError(
                         f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}"
@@ -52,41 +60,44 @@ def __init__(
             # Cartesia has issues with MuLaw/8000. Use pcm/16000 and
             # create_synthesis_result_from_wav will handle the conversion to mulaw/8000
             self.channel_width = 2
-            self.output_format = "pcm_16000"
-            self.sampling_rate = 16000
+            self.output_format = {
+                "sample_rate": 16000,
+                "encoding": "pcm_s16le",
+                "container": "raw",
+            }
         else:
             raise ValueError(
                 f"Unsupported audio encoding {synthesizer_config.audio_encoding}"
             )
 
+        if not isinstance(self.output_format["sample_rate"], int):
+            raise ValueError(f"Invalid type for sample_rate")
+        self.sampling_rate = self.output_format["sample_rate"]
         self.num_channels = 1
         self.model_id = synthesizer_config.model_id
         self.voice_id = synthesizer_config.voice_id
         self.client = self.cartesia_tts(api_key=self.api_key)
-        self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id)
 
-
     async def create_speech_uncached(
         self,
         message: BaseMessage,
         chunk_size: int,
         is_first_text_chunk: bool = False,
         is_sole_text_chunk: bool = False,
     ) -> SynthesisResult:
-        generator = await self.client.generate(
+        generator = await self.client.tts.sse(
+            model_id=self.model_id,
             transcript=message.text,
-            voice=self.voice_embedding,
+            voice_id=self.voice_id,
             stream=True,
-            model_id=self.model_id,
-            data_rtype='bytes',
             output_format=self.output_format
         )
 
         audio_file = io.BytesIO()
         with wave.open(audio_file, 'wb') as wav_file:
             wav_file.setnchannels(self.num_channels)
             wav_file.setsampwidth(self.channel_width)
-            wav_file.setframerate(self.sampling_rate)
+            wav_file.setframerate(float(self.sampling_rate))
             async for chunk in generator:
                 wav_file.writeframes(chunk['audio'])
         audio_file.seek(0)