vocodedev · ajar98 · Sep 12, 2023 · Sep 12, 2023
diff --git a/vocode/streaming/synthesizer/azure_synthesizer.py b/vocode/streaming/synthesizer/azure_synthesizer.py
@@ -219,7 +219,7 @@ def get_message_up_to(
         self,
         message: str,
         ssml: str,
-        seconds: int,
+        seconds: float,
         word_boundary_event_pool: WordBoundaryEventPool,
     ) -> str:
         events = word_boundary_event_pool.get_events_sorted()

diff --git a/vocode/streaming/synthesizer/bark_synthesizer.py b/vocode/streaming/synthesizer/bark_synthesizer.py
@@ -60,6 +60,7 @@ async def create_speech(
         write_wav(output_bytes_io, self.SAMPLE_RATE, int_audio_arr)
 
         result = self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
             file=output_bytes_io,
             message=message,
             chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/base_synthesizer.py b/vocode/streaming/synthesizer/base_synthesizer.py
@@ -67,7 +67,7 @@ def __init__(self, chunk: bytes, is_last_chunk: bool):
     def __init__(
         self,
         chunk_generator: AsyncGenerator[ChunkResult, None],
-        get_message_up_to: Callable[[int], str],
+        get_message_up_to: Callable[[float], str],
     ):
         self.chunk_generator = chunk_generator
         self.get_message_up_to = get_message_up_to
@@ -172,20 +172,23 @@ def ready_synthesizer(self):
         pass
 
     # given the number of seconds the message was allowed to go until, where did we get in the message?
+    @staticmethod
     def get_message_cutoff_from_total_response_length(
-        self, message: BaseMessage, seconds: int, size_of_output: int
+        synthesizer_config: SynthesizerConfig,
+        message: BaseMessage,
+        seconds: float,
+        size_of_output: int,
     ) -> str:
-        estimated_output_seconds = (
-            size_of_output / self.synthesizer_config.sampling_rate
-        )
+        estimated_output_seconds = size_of_output / synthesizer_config.sampling_rate
         if not message.text:
             return message.text
 
         estimated_output_seconds_per_char = estimated_output_seconds / len(message.text)
         return message.text[: int(seconds / estimated_output_seconds_per_char)]
 
+    @staticmethod
     def get_message_cutoff_from_voice_speed(
-        self, message: BaseMessage, seconds: int, words_per_minute: int
+        message: BaseMessage, seconds: float, words_per_minute: int
     ) -> str:
         words_per_second = words_per_minute / 60
         estimated_words_spoken = math.floor(words_per_second * seconds)
@@ -203,19 +206,21 @@ async def create_speech(
         raise NotImplementedError
 
     # @param file - a file-like object in wav format
+    @staticmethod
     def create_synthesis_result_from_wav(
-        self, file: Any, message: BaseMessage, chunk_size: int
+        synthesizer_config: SynthesizerConfig,
+        file: Any,
+        message: BaseMessage,
+        chunk_size: int,
     ) -> SynthesisResult:
         output_bytes = convert_wav(
             file,
-            output_sample_rate=self.synthesizer_config.sampling_rate,
-            output_encoding=self.synthesizer_config.audio_encoding,
+            output_sample_rate=synthesizer_config.sampling_rate,
+            output_encoding=synthesizer_config.audio_encoding,
         )
 
-        if self.synthesizer_config.should_encode_as_wav:
-            chunk_transform = lambda chunk: encode_as_wav(
-                chunk, self.synthesizer_config
-            )
+        if synthesizer_config.should_encode_as_wav:
+            chunk_transform = lambda chunk: encode_as_wav(chunk, synthesizer_config)
         else:
             chunk_transform = lambda chunk: chunk
 
@@ -232,8 +237,8 @@ async def chunk_generator(output_bytes):
 
         return SynthesisResult(
             chunk_generator(output_bytes),
-            lambda seconds: self.get_message_cutoff_from_total_response_length(
-                message, seconds, len(output_bytes)
+            lambda seconds: BaseSynthesizer.get_message_cutoff_from_total_response_length(
+                synthesizer_config, message, seconds, len(output_bytes)
             ),
         )
 

diff --git a/vocode/streaming/synthesizer/coqui_synthesizer.py b/vocode/streaming/synthesizer/coqui_synthesizer.py
@@ -86,6 +86,7 @@ async def create_speech(
                 )
 
                 result = self.create_synthesis_result_from_wav(
+                    synthesizer_config=self.synthesizer_config,
                     file=io.BytesIO(read_response),
                     message=message,
                     chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/coqui_tts_synthesizer.py b/vocode/streaming/synthesizer/coqui_tts_synthesizer.py
@@ -77,7 +77,10 @@ async def create_speech(
         audio_segment.export(output_bytes_io, format="wav")  # type: ignore
 
         result = self.create_synthesis_result_from_wav(
-            file=output_bytes_io, message=message, chunk_size=chunk_size
+            synthesizer_config=self.synthesizer_config,
+            file=output_bytes_io,
+            message=message,
+            chunk_size=chunk_size,
         )
 
         convert_span.end()

diff --git a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
@@ -108,6 +108,7 @@ async def create_speech(
             output_bytes_io = decode_mp3(audio_data)
 
             result = self.create_synthesis_result_from_wav(
+                synthesizer_config=self.synthesizer_config,
                 file=output_bytes_io,
                 message=message,
                 chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/google_synthesizer.py b/vocode/streaming/synthesizer/google_synthesizer.py
@@ -106,6 +106,7 @@ async def create_speech(
         output_bytes_io.seek(0)
 
         result = self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
             file=output_bytes_io,
             message=message,
             chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/gtts_synthesizer.py b/vocode/streaming/synthesizer/gtts_synthesizer.py
@@ -60,6 +60,7 @@ def thread():
         audio_segment.export(output_bytes_io, format="wav")  # type: ignore
 
         result = self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
             file=output_bytes_io,
             message=message,
             chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/play_ht_synthesizer.py b/vocode/streaming/synthesizer/play_ht_synthesizer.py
@@ -92,6 +92,7 @@ async def create_speech(
             output_bytes_io = decode_mp3(read_response)
 
             result = self.create_synthesis_result_from_wav(
+                synthesizer_config=self.synthesizer_config,
                 file=output_bytes_io,
                 message=message,
                 chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/polly_synthesizer.py b/vocode/streaming/synthesizer/polly_synthesizer.py
@@ -70,7 +70,7 @@ def get_speech_marks(self, message: str) -> Any:
     def get_message_up_to(
         self,
         message: str,
-        seconds: int,
+        seconds: float,
         word_events,
     ) -> str:
         for event in word_events:

diff --git a/vocode/streaming/synthesizer/rime_synthesizer.py b/vocode/streaming/synthesizer/rime_synthesizer.py
@@ -80,7 +80,10 @@ async def create_speech(
             audio_file = io.BytesIO(base64.b64decode(data.get("audioContent")))
 
             result = self.create_synthesis_result_from_wav(
-                file=audio_file, message=message, chunk_size=chunk_size
+                synthesizer_config=self.synthesizer_config,
+                file=audio_file,
+                message=message,
+                chunk_size=chunk_size,
             )
             convert_span.end()
             return result
diff --git a/vocode/streaming/synthesizer/stream_elements_synthesizer.py b/vocode/streaming/synthesizer/stream_elements_synthesizer.py
@@ -64,6 +64,7 @@ async def create_speech(
             audio_segment.export(output_bytes_io, format="wav")  # type: ignore
 
             result = self.create_synthesis_result_from_wav(
+                synthesizer_config=self.synthesizer_config,
                 file=output_bytes_io,
                 message=message,
                 chunk_size=chunk_size,