Add speech disfluencies (#7)

* hipaa compliance docs (vocodedev#390)
* adds hipaa compliance docs
* update hipaa doc
* makes synthesizer methods static (vocodedev#394)
* Update ivr-navigation.mdx (vocodedev#395)
* simpler and less risky FILLER_PHRASES
* add simple uh disfluency
* Update autogenerated API reference (vocodedev#400)
* improved make_disfluency

---------

Co-authored-by: Ajay Raj <[email protected]>
Co-authored-by: Diogo Braganca <[email protected]>
Co-authored-by: Eliot Hsu <[email protected]>
dbraganca · Sep 20, 2023 · d78004d · d78004d
1 parent e07bcb1
commit d78004d
Show file tree

Hide file tree

Showing 18 changed files with 165 additions and 29 deletions.
diff --git a/docs/hipaa-compliance.mdx b/docs/hipaa-compliance.mdx
@@ -0,0 +1,23 @@
+---
+title: "HIPAA Compliance"
+---
+
+The `hipaa_compliant` flag in the Vocode outbound calls API configures the system not to
+persist any potentially sensitive information. Turning on the flag makes the API suitable for use cases that require HIPAA compliance.
+
+In particular, turning on the flag:
+
+- Redacts the content of the prompt on the `calls` resource
+- Prevents the transcript from being written to the `calls` resource
+- Configures the call not to be recorded
+
+# Turning on the flag
+
+```python
+vocode_client.calls.create_call(
+    from_number="<YOUR VOCODE NUMBER>",
+    to_number="15555555555",
+    agent=...,
+    hipaa_compliant=True
+)
+```
diff --git a/docs/ivr-navigation.mdx b/docs/ivr-navigation.mdx
@@ -4,7 +4,7 @@ description: "Navigate phone trees"
 ---
 
 Note: this feature is only useful on outbound calls, which are disabled by default. Please contact us through Discord or at [email protected]
-to discuss enabling this for your account!
+to discuss enabling outbound calls for your account!
 
 The Vocode API has many tools for agents to interact with IVRs (interactive voice responses) - navigating phone trees
 is one of the primary use-cases of the API.

diff --git a/docs/mint.json b/docs/mint.json
@@ -173,7 +173,8 @@
         "multilingual",
         "injecting-context",
         "machine-detection",
-        "ivr-navigation"
+        "ivr-navigation",
+        "hipaa-compliance"
       ]
     }
   ],

diff --git a/docs/openapi.json b/docs/openapi.json
@@ -94,6 +94,13 @@
         "tags": ["numbers"],
         "summary": "Buy Number",
         "operationId": "buy_number",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": { "$ref": "#/components/schemas/BuyPhoneNumberRequest" }
+            }
+          }
+        },
         "responses": {
           "200": {
             "description": "Successful Response",
@@ -102,6 +109,14 @@
                 "schema": { "$ref": "#/components/schemas/PhoneNumber" }
               }
             }
+          },
+          "422": {
+            "description": "Validation Error",
+            "content": {
+              "application/json": {
+                "schema": { "$ref": "#/components/schemas/HTTPValidationError" }
+              }
+            }
           }
         },
         "security": [{ "HTTPBearer": [] }]
@@ -1439,6 +1454,11 @@
             "enum": ["default", "off"],
             "title": "Ivr Navigation Mode",
             "default": "off"
+          },
+          "conversation_speed": {
+            "type": "number",
+            "title": "Conversation Speed",
+            "default": 1.0
           }
         },
         "type": "object",
@@ -1542,6 +1562,11 @@
             "enum": ["default", "off"],
             "title": "Ivr Navigation Mode",
             "default": "off"
+          },
+          "conversation_speed": {
+            "type": "number",
+            "title": "Conversation Speed",
+            "default": 1.0
           }
         },
         "type": "object",
@@ -1668,6 +1693,13 @@
               { "$ref": "#/components/schemas/Undefined" }
             ],
             "title": "Ivr Navigation Mode"
+          },
+          "conversation_speed": {
+            "anyOf": [
+              { "type": "number" },
+              { "$ref": "#/components/schemas/Undefined" }
+            ],
+            "title": "Conversation Speed"
           }
         },
         "type": "object",
@@ -1738,6 +1770,13 @@
         "required": ["type"],
         "title": "AzureVoiceUpdateParams"
       },
+      "BuyPhoneNumberRequest": {
+        "properties": {
+          "area_code": { "type": "string", "title": "Area Code" }
+        },
+        "type": "object",
+        "title": "BuyPhoneNumberRequest"
+      },
       "Call": {
         "properties": {
           "id": { "type": "string", "format": "uuid", "title": "Id" },
@@ -1910,6 +1949,11 @@
             "enum": ["default", "off"],
             "title": "Ivr Navigation Mode",
             "default": "off"
+          },
+          "conversation_speed": {
+            "type": "number",
+            "title": "Conversation Speed",
+            "default": 1.0
           }
         },
         "type": "object",
@@ -2210,6 +2254,11 @@
             "enum": ["default", "off"],
             "title": "Ivr Navigation Mode",
             "default": "off"
+          },
+          "conversation_speed": {
+            "type": "number",
+            "title": "Conversation Speed",
+            "default": 1.0
           }
         },
         "type": "object",

diff --git a/vocode/streaming/agent/chat_gpt_agent.py b/vocode/streaming/agent/chat_gpt_agent.py
@@ -9,6 +9,7 @@
 from pydantic import BaseModel
 
 from vocode import getenv
+from vocode.streaming.utils.make_disfluencies import make_disfluency
 from vocode.streaming.action.factory import ActionFactory
 from vocode.streaming.agent.base_agent import RespondAgent
 from vocode.streaming.models.actions import FunctionCall, FunctionFragment
@@ -180,7 +181,6 @@ async def generate_response(
         ):
             if self.agent_config.remove_exclamation:
                 # replace ! by . because it sounds better when speaking.
-                self.logger.info(f"Message before: {message}")
                 message = message.replace('!','.')
-                self.logger.info(f"Message after: {message}")
+            message = make_disfluency(message)
             yield message, True
diff --git a/vocode/streaming/synthesizer/azure_synthesizer.py b/vocode/streaming/synthesizer/azure_synthesizer.py
@@ -221,7 +221,7 @@ def get_message_up_to(
         self,
         message: str,
         ssml: str,
-        seconds: int,
+        seconds: float,
         word_boundary_event_pool: WordBoundaryEventPool,
     ) -> str:
         events = word_boundary_event_pool.get_events_sorted()

diff --git a/vocode/streaming/synthesizer/bark_synthesizer.py b/vocode/streaming/synthesizer/bark_synthesizer.py
@@ -60,6 +60,7 @@ async def create_speech(
         write_wav(output_bytes_io, self.SAMPLE_RATE, int_audio_arr)
 
         result = self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
             file=output_bytes_io,
             message=message,
             chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/base_synthesizer.py b/vocode/streaming/synthesizer/base_synthesizer.py
@@ -32,12 +32,13 @@
 FILLER_PHRASES = [
     BaseMessage(text="Um - "),
     BaseMessage(text="Uh - "),
-    BaseMessage(text="Uh-huh - "),
-    BaseMessage(text="Mm-hmm - "),
-    BaseMessage(text="Hmm - "),
-    BaseMessage(text="Okay - "),
-    BaseMessage(text="Right - "),
-    BaseMessage(text="Let me see - "),
+    # BaseMessage(text="Uh-huh - "),
+    # BaseMessage(text="Mmhmm - "),
+    # BaseMessage(text="Hmm - "),
+    BaseMessage(text="Yeah - "),
+    # BaseMessage(text="Okay - "),
+    # BaseMessage(text="Right - "),
+    # BaseMessage(text="Let me see - "),
 ]
 FILLER_AUDIO_PATH = os.path.join(os.path.dirname(__file__), "filler_audio")
 TYPING_NOISE_PATH = "%s/typing-noise.wav" % FILLER_AUDIO_PATH
@@ -67,7 +68,7 @@ def __init__(self, chunk: bytes, is_last_chunk: bool):
     def __init__(
         self,
         chunk_generator: AsyncGenerator[ChunkResult, None],
-        get_message_up_to: Callable[[int], str],
+        get_message_up_to: Callable[[float], str],
     ):
         self.chunk_generator = chunk_generator
         self.get_message_up_to = get_message_up_to
@@ -172,20 +173,23 @@ def ready_synthesizer(self):
         pass
 
     # given the number of seconds the message was allowed to go until, where did we get in the message?
+    @staticmethod
     def get_message_cutoff_from_total_response_length(
-        self, message: BaseMessage, seconds: int, size_of_output: int
+        synthesizer_config: SynthesizerConfig,
+        message: BaseMessage,
+        seconds: float,
+        size_of_output: int,
     ) -> str:
-        estimated_output_seconds = (
-            size_of_output / self.synthesizer_config.sampling_rate
-        )
+        estimated_output_seconds = size_of_output / synthesizer_config.sampling_rate
         if not message.text:
             return message.text
 
         estimated_output_seconds_per_char = estimated_output_seconds / len(message.text)
         return message.text[: int(seconds / estimated_output_seconds_per_char)]
 
+    @staticmethod
     def get_message_cutoff_from_voice_speed(
-        self, message: BaseMessage, seconds: int, words_per_minute: int
+        message: BaseMessage, seconds: float, words_per_minute: int
     ) -> str:
         words_per_second = words_per_minute / 60
         estimated_words_spoken = math.floor(words_per_second * seconds)
@@ -203,19 +207,21 @@ async def create_speech(
         raise NotImplementedError
 
     # @param file - a file-like object in wav format
+    @staticmethod
     def create_synthesis_result_from_wav(
-        self, file: Any, message: BaseMessage, chunk_size: int
+        synthesizer_config: SynthesizerConfig,
+        file: Any,
+        message: BaseMessage,
+        chunk_size: int,
     ) -> SynthesisResult:
         output_bytes = convert_wav(
             file,
-            output_sample_rate=self.synthesizer_config.sampling_rate,
-            output_encoding=self.synthesizer_config.audio_encoding,
+            output_sample_rate=synthesizer_config.sampling_rate,
+            output_encoding=synthesizer_config.audio_encoding,
         )
 
-        if self.synthesizer_config.should_encode_as_wav:
-            chunk_transform = lambda chunk: encode_as_wav(
-                chunk, self.synthesizer_config
-            )
+        if synthesizer_config.should_encode_as_wav:
+            chunk_transform = lambda chunk: encode_as_wav(chunk, synthesizer_config)
         else:
             chunk_transform = lambda chunk: chunk
 
@@ -232,8 +238,8 @@ async def chunk_generator(output_bytes):
 
         return SynthesisResult(
             chunk_generator(output_bytes),
-            lambda seconds: self.get_message_cutoff_from_total_response_length(
-                message, seconds, len(output_bytes)
+            lambda seconds: BaseSynthesizer.get_message_cutoff_from_total_response_length(
+                synthesizer_config, message, seconds, len(output_bytes)
             ),
         )
 

diff --git a/vocode/streaming/synthesizer/coqui_synthesizer.py b/vocode/streaming/synthesizer/coqui_synthesizer.py
@@ -86,6 +86,7 @@ async def create_speech(
                 )
 
                 result = self.create_synthesis_result_from_wav(
+                    synthesizer_config=self.synthesizer_config,
                     file=io.BytesIO(read_response),
                     message=message,
                     chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/coqui_tts_synthesizer.py b/vocode/streaming/synthesizer/coqui_tts_synthesizer.py
@@ -77,7 +77,10 @@ async def create_speech(
         audio_segment.export(output_bytes_io, format="wav")  # type: ignore
 
         result = self.create_synthesis_result_from_wav(
-            file=output_bytes_io, message=message, chunk_size=chunk_size
+            synthesizer_config=self.synthesizer_config,
+            file=output_bytes_io,
+            message=message,
+            chunk_size=chunk_size,
         )
 
         convert_span.end()

diff --git a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
@@ -183,6 +183,7 @@ async def create_speech(
             output_bytes_io = decode_mp3(audio_data)
 
             result = self.create_synthesis_result_from_wav(
+                synthesizer_config=self.synthesizer_config,
                 file=output_bytes_io,
                 message=message,
                 chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/google_synthesizer.py b/vocode/streaming/synthesizer/google_synthesizer.py
@@ -106,6 +106,7 @@ async def create_speech(
         output_bytes_io.seek(0)
 
         result = self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
             file=output_bytes_io,
             message=message,
             chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/gtts_synthesizer.py b/vocode/streaming/synthesizer/gtts_synthesizer.py
@@ -60,6 +60,7 @@ def thread():
         audio_segment.export(output_bytes_io, format="wav")  # type: ignore
 
         result = self.create_synthesis_result_from_wav(
+            synthesizer_config=self.synthesizer_config,
             file=output_bytes_io,
             message=message,
             chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/play_ht_synthesizer.py b/vocode/streaming/synthesizer/play_ht_synthesizer.py
@@ -92,6 +92,7 @@ async def create_speech(
             output_bytes_io = decode_mp3(read_response)
 
             result = self.create_synthesis_result_from_wav(
+                synthesizer_config=self.synthesizer_config,
                 file=output_bytes_io,
                 message=message,
                 chunk_size=chunk_size,

diff --git a/vocode/streaming/synthesizer/polly_synthesizer.py b/vocode/streaming/synthesizer/polly_synthesizer.py
@@ -70,7 +70,7 @@ def get_speech_marks(self, message: str) -> Any:
     def get_message_up_to(
         self,
         message: str,
-        seconds: int,
+        seconds: float,
         word_events,
     ) -> str:
         for event in word_events:

diff --git a/vocode/streaming/synthesizer/rime_synthesizer.py b/vocode/streaming/synthesizer/rime_synthesizer.py
@@ -80,7 +80,10 @@ async def create_speech(
             audio_file = io.BytesIO(base64.b64decode(data.get("audioContent")))
 
             result = self.create_synthesis_result_from_wav(
-                file=audio_file, message=message, chunk_size=chunk_size
+                synthesizer_config=self.synthesizer_config,
+                file=audio_file,
+                message=message,
+                chunk_size=chunk_size,
             )
             convert_span.end()
             return result
diff --git a/vocode/streaming/synthesizer/stream_elements_synthesizer.py b/vocode/streaming/synthesizer/stream_elements_synthesizer.py
@@ -64,6 +64,7 @@ async def create_speech(
             audio_segment.export(output_bytes_io, format="wav")  # type: ignore
 
             result = self.create_synthesis_result_from_wav(
+                synthesizer_config=self.synthesizer_config,
                 file=output_bytes_io,
                 message=message,
                 chunk_size=chunk_size,