From bfbd1c8321cf59fdbd47ce7aeb1afd20ce34c553 Mon Sep 17 00:00:00 2001 From: rjheeta Date: Thu, 27 Jun 2024 08:42:10 -0400 Subject: [PATCH 1/4] upgrade to latest cartesia 1.0.3 --- poetry.lock | 12 ++--- pyproject.toml | 2 +- vocode/streaming/models/synthesizer.py | 2 +- .../synthesizer/cartesia_synthesizer.py | 49 ++++++++++++------- 4 files changed, 39 insertions(+), 26 deletions(-) diff --git a/poetry.lock b/poetry.lock index 28a3e98fa..c68d3b6b2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -300,13 +300,13 @@ files = [ [[package]] name = "cartesia" -version = "0.1.1" +version = "1.0.3" description = "The official Python library for the Cartesia API." -optional = true +optional = false python-versions = ">=3.8.0" files = [ - {file = "cartesia-0.1.1-py2.py3-none-any.whl", hash = "sha256:7a7365f17e220247ee2af1efdb88e69b0aa332e390c85775bf356b5e7b882498"}, - {file = "cartesia-0.1.1.tar.gz", hash = "sha256:c584770f4698e6dc826a75b7b5fd39bfce749c88ad9786dca46edd9527710002"}, + {file = "cartesia-1.0.3-py2.py3-none-any.whl", hash = "sha256:d680c197361507faf11e8ed99a30a0d6ece682298ea306f41a66a0195c08ae37"}, + {file = "cartesia-1.0.3.tar.gz", hash = "sha256:446e7bea274e71c95f790d1efdc4b04a6eec1747f3ae5cc48a4fd68985d0aafc"}, ] [package.dependencies] @@ -4678,4 +4678,4 @@ transcribers = ["google-cloud-speech"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "e56040c6bd76616232f8ef2b371771ee098c6a12fd57228c0b0ab2a0d24e3e58" +content-hash = "5959a95fd3aa9f446c3a8d43f5fed96f83f46d0b017ec2fe133e88956f0ae475" diff --git a/pyproject.toml b/pyproject.toml index 3466af1b0..a5d360f7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ groq = { version = "^0.9.0", optional = true } # Synthesizers google-cloud-texttospeech = { version = "^2.16.3", optional = true } pvkoala = { version = "^2.0.1", optional = true } -cartesia = { version = "^0.1.1", optional = true } +cartesia = "^1.0.3" # Transcribers google-cloud-speech = { version = "^2.26.0", optional = true } diff --git a/vocode/streaming/models/synthesizer.py b/vocode/streaming/models/synthesizer.py index f1c8c01cb..eeb6c3133 100644 --- a/vocode/streaming/models/synthesizer.py +++ b/vocode/streaming/models/synthesizer.py @@ -228,7 +228,7 @@ class PollySynthesizerConfig(SynthesizerConfig, type=SynthesizerType.POLLY.value sampling_rate: int = DEFAULT_POLLY_SAMPLING_RATE -DEFAULT_CARTESIA_MODEL_ID = "upbeat-moon" +DEFAULT_CARTESIA_MODEL_ID = "sonic-english" DEFAULT_CARTESIA_VOICE_ID = "5345cf08-6f37-424d-a5d9-8ae1101b9377" diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py index 9e1392e6a..64e9629a8 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -19,31 +19,39 @@ def __init__( # Lazy import the cartesia module try: - from cartesia.tts import AsyncCartesiaTTS + from cartesia import AsyncCartesia except ImportError as e: raise ImportError( f"Missing required dependancies for CartesiaSynthesizer" ) from e - self.cartesia_tts = AsyncCartesiaTTS - self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY") if not self.api_key: raise ValueError("Missing Cartesia API key") + self.cartesia_tts = AsyncCartesia if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16: self.channel_width = 2 match synthesizer_config.sampling_rate: case SamplingRate.RATE_44100: - self.sampling_rate = 44100 - self.output_format = "pcm_44100" + self.output_format = { + "sample_rate": 44100, + "encoding": "pcm_s16le", + "container": "raw", + } case SamplingRate.RATE_22050: - self.sampling_rate = 22050 - self.output_format = "pcm_22050" + self.output_format = { + "sample_rate": 22050, + "encoding": "pcm_s16le", + "container": "raw", + } case SamplingRate.RATE_16000: - self.sampling_rate = 16000 - self.output_format = "pcm_16000" + self.output_format = { + "sample_rate": 16000, + "encoding": "pcm_s16le", + "container": "raw", + } case _: raise ValueError( f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}" @@ -52,20 +60,26 @@ def __init__( # Cartesia has issues with MuLaw/8000. Use pcm/16000 and # create_synthesis_result_from_wav will handle the conversion to mulaw/8000 self.channel_width = 2 - self.output_format = "pcm_16000" - self.sampling_rate = 16000 + self.output_format = { + "sample_rate": 16000, + "encoding": "pcm_s16le", + "container": "raw", + } else: raise ValueError( f"Unsupported audio encoding {synthesizer_config.audio_encoding}" ) + if not isinstance(self.output_format["sample_rate"], int): + raise ValueError( + f"Invalid type for sample_rate: {type(self.output_format["sample_rate"])}" + ) + self.sampling_rate = self.output_format["sample_rate"] self.num_channels = 1 self.model_id = synthesizer_config.model_id self.voice_id = synthesizer_config.voice_id self.client = self.cartesia_tts(api_key=self.api_key) - self.voice_embedding = self.client.get_voice_embedding(voice_id=self.voice_id) - async def create_speech_uncached( self, message: BaseMessage, @@ -73,12 +87,11 @@ async def create_speech_uncached( is_first_text_chunk: bool = False, is_sole_text_chunk: bool = False, ) -> SynthesisResult: - generator = await self.client.generate( + generator = await self.client.tts.sse( + model_id=self.model_id, transcript=message.text, - voice=self.voice_embedding, + voice_id=self.voice_id, stream=True, - model_id=self.model_id, - data_rtype='bytes', output_format=self.output_format ) @@ -86,7 +99,7 @@ async def create_speech_uncached( with wave.open(audio_file, 'wb') as wav_file: wav_file.setnchannels(self.num_channels) wav_file.setsampwidth(self.channel_width) - wav_file.setframerate(self.sampling_rate) + wav_file.setframerate(float(self.sampling_rate)) async for chunk in generator: wav_file.writeframes(chunk['audio']) audio_file.seek(0) From 5fcb20a7598ea21a7b1fa2dc8da8e7268069d55e Mon Sep 17 00:00:00 2001 From: rjheeta Date: Thu, 27 Jun 2024 08:46:09 -0400 Subject: [PATCH 2/4] fixed linting conflict --- vocode/streaming/synthesizer/cartesia_synthesizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py index 64e9629a8..ec1a5d350 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -71,9 +71,7 @@ def __init__( ) if not isinstance(self.output_format["sample_rate"], int): - raise ValueError( - f"Invalid type for sample_rate: {type(self.output_format["sample_rate"])}" - ) + raise ValueError(f"Invalid type for sample_rate") self.sampling_rate = self.output_format["sample_rate"] self.num_channels = 1 self.model_id = synthesizer_config.model_id From abde4687c1f5e68d7c58e8c09eee63a3c06ba9b7 Mon Sep 17 00:00:00 2001 From: Ajay Raj Date: Wed, 3 Jul 2024 11:24:24 -0700 Subject: [PATCH 3/4] finish streaming --- .../streaming/synthesizer/synthesize.py | 5 +- .../streaming/synthesizer/base_synthesizer.py | 2 +- .../synthesizer/cartesia_synthesizer.py | 67 +++++++++---------- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/playground/streaming/synthesizer/synthesize.py b/playground/streaming/synthesizer/synthesize.py index 086169772..1c92efe91 100644 --- a/playground/streaming/synthesizer/synthesize.py +++ b/playground/streaming/synthesizer/synthesize.py @@ -1,5 +1,6 @@ import time +from vocode.streaming.constants import PER_CHUNK_ALLOWANCE_SECONDS from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.synthesizer import AzureSynthesizerConfig from vocode.streaming.output_device.base_output_device import BaseOutputDevice @@ -42,7 +43,9 @@ async def speak( end_time = time.time() await asyncio.sleep( max( - speech_length_seconds - (end_time - start_time), + speech_length_seconds + - (end_time - start_time) + - PER_CHUNK_ALLOWANCE_SECONDS, 0, ) ) diff --git a/vocode/streaming/synthesizer/base_synthesizer.py b/vocode/streaming/synthesizer/base_synthesizer.py index e3e86661f..ac3a41736 100644 --- a/vocode/streaming/synthesizer/base_synthesizer.py +++ b/vocode/streaming/synthesizer/base_synthesizer.py @@ -285,7 +285,7 @@ def get_message_cutoff_from_total_response_length( @staticmethod def get_message_cutoff_from_voice_speed( - message: BaseMessage, seconds: float, words_per_minute: int + message: BaseMessage, seconds: float, words_per_minute: int = 150 ) -> str: words_per_second = words_per_minute / 60 estimated_words_spoken = math.floor(words_per_second * seconds) diff --git a/vocode/streaming/synthesizer/cartesia_synthesizer.py b/vocode/streaming/synthesizer/cartesia_synthesizer.py index ec1a5d350..d7b2ba961 100644 --- a/vocode/streaming/synthesizer/cartesia_synthesizer.py +++ b/vocode/streaming/synthesizer/cartesia_synthesizer.py @@ -1,5 +1,3 @@ -import io -import wave import hashlib from vocode import getenv @@ -7,7 +5,6 @@ from vocode.streaming.models.message import BaseMessage from vocode.streaming.models.synthesizer import CartesiaSynthesizerConfig from vocode.streaming.synthesizer.base_synthesizer import BaseSynthesizer, SynthesisResult -from vocode.streaming.utils.create_task import asyncio_create_task_with_done_error_log class CartesiaSynthesizer(BaseSynthesizer[CartesiaSynthesizerConfig]): @@ -21,18 +18,15 @@ def __init__( try: from cartesia import AsyncCartesia except ImportError as e: - raise ImportError( - f"Missing required dependancies for CartesiaSynthesizer" - ) from e - + raise ImportError(f"Missing required dependancies for CartesiaSynthesizer") from e + self.api_key = synthesizer_config.api_key or getenv("CARTESIA_API_KEY") if not self.api_key: raise ValueError("Missing Cartesia API key") - + self.cartesia_tts = AsyncCartesia if synthesizer_config.audio_encoding == AudioEncoding.LINEAR16: - self.channel_width = 2 match synthesizer_config.sampling_rate: case SamplingRate.RATE_44100: self.output_format = { @@ -52,23 +46,25 @@ def __init__( "encoding": "pcm_s16le", "container": "raw", } + case SamplingRate.RATE_8000: + self.output_format = { + "sample_rate": 8000, + "encoding": "pcm_s16le", + "container": "raw", + } case _: raise ValueError( f"Unsupported PCM sampling rate {synthesizer_config.sampling_rate}" ) elif synthesizer_config.audio_encoding == AudioEncoding.MULAW: - # Cartesia has issues with MuLaw/8000. Use pcm/16000 and - # create_synthesis_result_from_wav will handle the conversion to mulaw/8000 self.channel_width = 2 self.output_format = { - "sample_rate": 16000, - "encoding": "pcm_s16le", + "sample_rate": 8000, + "encoding": "pcm_mulaw", "container": "raw", } else: - raise ValueError( - f"Unsupported audio encoding {synthesizer_config.audio_encoding}" - ) + raise ValueError(f"Unsupported audio encoding {synthesizer_config.audio_encoding}") if not isinstance(self.output_format["sample_rate"], int): raise ValueError(f"Invalid type for sample_rate") @@ -77,7 +73,7 @@ def __init__( self.model_id = synthesizer_config.model_id self.voice_id = synthesizer_config.voice_id self.client = self.cartesia_tts(api_key=self.api_key) - + async def create_speech_uncached( self, message: BaseMessage, @@ -90,27 +86,28 @@ async def create_speech_uncached( transcript=message.text, voice_id=self.voice_id, stream=True, - output_format=self.output_format + output_format=self.output_format, ) - audio_file = io.BytesIO() - with wave.open(audio_file, 'wb') as wav_file: - wav_file.setnchannels(self.num_channels) - wav_file.setsampwidth(self.channel_width) - wav_file.setframerate(float(self.sampling_rate)) - async for chunk in generator: - wav_file.writeframes(chunk['audio']) - audio_file.seek(0) + async def chunk_generator(sse): + buffer = bytearray() + async for event in sse: + audio = event.get("audio") + buffer.extend(audio) + while len(buffer) >= chunk_size: + yield SynthesisResult.ChunkResult( + chunk=buffer[:chunk_size], is_last_chunk=False + ) + buffer = buffer[chunk_size:] + yield SynthesisResult.ChunkResult(chunk=buffer, is_last_chunk=True) - result = self.create_synthesis_result_from_wav( - synthesizer_config=self.synthesizer_config, - file=audio_file, - message=message, - chunk_size=chunk_size, + return SynthesisResult( + chunk_generator=chunk_generator(generator), + get_message_up_to=lambda seconds: self.get_message_cutoff_from_voice_speed( + message, seconds + ), ) - return result - @classmethod def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig): hashed_api_key = hashlib.sha256(f"{synthesizer_config.api_key}".encode("utf-8")).hexdigest() @@ -120,6 +117,6 @@ def get_voice_identifier(cls, synthesizer_config: CartesiaSynthesizerConfig): hashed_api_key, str(synthesizer_config.voice_id), str(synthesizer_config.model_id), - synthesizer_config.audio_encoding + synthesizer_config.audio_encoding, ) - ) \ No newline at end of file + ) From c2c68e62b902cc183df2b3493e56be6c33e5ee53 Mon Sep 17 00:00:00 2001 From: Ajay Raj Date: Wed, 3 Jul 2024 11:25:36 -0700 Subject: [PATCH 4/4] make cartesia optional --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index c68d3b6b2..e12b4cd14 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -302,7 +302,7 @@ files = [ name = "cartesia" version = "1.0.3" description = "The official Python library for the Cartesia API." -optional = false +optional = true python-versions = ">=3.8.0" files = [ {file = "cartesia-1.0.3-py2.py3-none-any.whl", hash = "sha256:d680c197361507faf11e8ed99a30a0d6ece682298ea306f41a66a0195c08ae37"}, @@ -4678,4 +4678,4 @@ transcribers = ["google-cloud-speech"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "5959a95fd3aa9f446c3a8d43f5fed96f83f46d0b017ec2fe133e88956f0ae475" +content-hash = "53dcd434eecc407c0af533faecc9b16e5574c4368e3c136e785819243baa28d3" diff --git a/pyproject.toml b/pyproject.toml index a5d360f7f..04732a58e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ groq = { version = "^0.9.0", optional = true } # Synthesizers google-cloud-texttospeech = { version = "^2.16.3", optional = true } pvkoala = { version = "^2.0.1", optional = true } -cartesia = "^1.0.3" +cartesia = { version = "^1.0.3", optional = true } # Transcribers google-cloud-speech = { version = "^2.26.0", optional = true }