Skip to content

Commit

Permalink
Add speech disfluencies (#7)
Browse files Browse the repository at this point in the history
* hipaa compliance docs (vocodedev#390)

* adds hipaa compliance docs

* update hipaa doc

* makes synthesizer methods static (vocodedev#394)

* Update ivr-navigation.mdx (vocodedev#395)

* simpler and less risky FILLER_PHRASES

* add simple uh disfluency

* Update autogenerated API reference (vocodedev#400)

* improved make_disfluency

---------

Co-authored-by: Ajay Raj <[email protected]>
Co-authored-by: Diogo Braganca <[email protected]>
Co-authored-by: Eliot Hsu <[email protected]>
  • Loading branch information
4 people authored Sep 20, 2023
1 parent e07bcb1 commit d78004d
Show file tree
Hide file tree
Showing 18 changed files with 165 additions and 29 deletions.
23 changes: 23 additions & 0 deletions docs/hipaa-compliance.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
---
title: "HIPAA Compliance"
---

The `hipaa_compliant` flag in the Vocode outbound calls API configures the system not to
persist any potentially sensitive information. Turning on the flag makes the API suitable for use cases that require HIPAA compliance.

In particular, turning on the flag:

- Redacts the content of the prompt on the `calls` resource
- Prevents the transcript from being written to the `calls` resource
- Configures the call not to be recorded

# Turning on the flag

```python
vocode_client.calls.create_call(
from_number="<YOUR VOCODE NUMBER>",
to_number="15555555555",
agent=...,
hipaa_compliant=True
)
```
2 changes: 1 addition & 1 deletion docs/ivr-navigation.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ description: "Navigate phone trees"
---

Note: this feature is only useful on outbound calls, which are disabled by default. Please contact us through Discord or at [email protected]
to discuss enabling this for your account!
to discuss enabling outbound calls for your account!

The Vocode API has many tools for agents to interact with IVRs (interactive voice response systems); navigating phone trees
is one of the primary use cases of the API.
Expand Down
3 changes: 2 additions & 1 deletion docs/mint.json
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@
"multilingual",
"injecting-context",
"machine-detection",
"ivr-navigation"
"ivr-navigation",
"hipaa-compliance"
]
}
],
Expand Down
49 changes: 49 additions & 0 deletions docs/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@
"tags": ["numbers"],
"summary": "Buy Number",
"operationId": "buy_number",
"requestBody": {
"content": {
"application/json": {
"schema": { "$ref": "#/components/schemas/BuyPhoneNumberRequest" }
}
}
},
"responses": {
"200": {
"description": "Successful Response",
Expand All @@ -102,6 +109,14 @@
"schema": { "$ref": "#/components/schemas/PhoneNumber" }
}
}
},
"422": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": { "$ref": "#/components/schemas/HTTPValidationError" }
}
}
}
},
"security": [{ "HTTPBearer": [] }]
Expand Down Expand Up @@ -1439,6 +1454,11 @@
"enum": ["default", "off"],
"title": "Ivr Navigation Mode",
"default": "off"
},
"conversation_speed": {
"type": "number",
"title": "Conversation Speed",
"default": 1.0
}
},
"type": "object",
Expand Down Expand Up @@ -1542,6 +1562,11 @@
"enum": ["default", "off"],
"title": "Ivr Navigation Mode",
"default": "off"
},
"conversation_speed": {
"type": "number",
"title": "Conversation Speed",
"default": 1.0
}
},
"type": "object",
Expand Down Expand Up @@ -1668,6 +1693,13 @@
{ "$ref": "#/components/schemas/Undefined" }
],
"title": "Ivr Navigation Mode"
},
"conversation_speed": {
"anyOf": [
{ "type": "number" },
{ "$ref": "#/components/schemas/Undefined" }
],
"title": "Conversation Speed"
}
},
"type": "object",
Expand Down Expand Up @@ -1738,6 +1770,13 @@
"required": ["type"],
"title": "AzureVoiceUpdateParams"
},
"BuyPhoneNumberRequest": {
"properties": {
"area_code": { "type": "string", "title": "Area Code" }
},
"type": "object",
"title": "BuyPhoneNumberRequest"
},
"Call": {
"properties": {
"id": { "type": "string", "format": "uuid", "title": "Id" },
Expand Down Expand Up @@ -1910,6 +1949,11 @@
"enum": ["default", "off"],
"title": "Ivr Navigation Mode",
"default": "off"
},
"conversation_speed": {
"type": "number",
"title": "Conversation Speed",
"default": 1.0
}
},
"type": "object",
Expand Down Expand Up @@ -2210,6 +2254,11 @@
"enum": ["default", "off"],
"title": "Ivr Navigation Mode",
"default": "off"
},
"conversation_speed": {
"type": "number",
"title": "Conversation Speed",
"default": 1.0
}
},
"type": "object",
Expand Down
4 changes: 2 additions & 2 deletions vocode/streaming/agent/chat_gpt_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pydantic import BaseModel

from vocode import getenv
from vocode.streaming.utils.make_disfluencies import make_disfluency
from vocode.streaming.action.factory import ActionFactory
from vocode.streaming.agent.base_agent import RespondAgent
from vocode.streaming.models.actions import FunctionCall, FunctionFragment
Expand Down Expand Up @@ -180,7 +181,6 @@ async def generate_response(
):
if self.agent_config.remove_exclamation:
# Replace '!' with '.' because it sounds better when spoken.
self.logger.info(f"Message before: {message}")
message = message.replace('!','.')
self.logger.info(f"Message after: {message}")
message = make_disfluency(message)
yield message, True
2 changes: 1 addition & 1 deletion vocode/streaming/synthesizer/azure_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def get_message_up_to(
self,
message: str,
ssml: str,
seconds: int,
seconds: float,
word_boundary_event_pool: WordBoundaryEventPool,
) -> str:
events = word_boundary_event_pool.get_events_sorted()
Expand Down
1 change: 1 addition & 0 deletions vocode/streaming/synthesizer/bark_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ async def create_speech(
write_wav(output_bytes_io, self.SAMPLE_RATE, int_audio_arr)

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=output_bytes_io,
message=message,
chunk_size=chunk_size,
Expand Down
48 changes: 27 additions & 21 deletions vocode/streaming/synthesizer/base_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,13 @@
FILLER_PHRASES = [
BaseMessage(text="Um - "),
BaseMessage(text="Uh - "),
BaseMessage(text="Uh-huh - "),
BaseMessage(text="Mm-hmm - "),
BaseMessage(text="Hmm - "),
BaseMessage(text="Okay - "),
BaseMessage(text="Right - "),
BaseMessage(text="Let me see - "),
# BaseMessage(text="Uh-huh - "),
# BaseMessage(text="Mmhmm - "),
# BaseMessage(text="Hmm - "),
BaseMessage(text="Yeah - "),
# BaseMessage(text="Okay - "),
# BaseMessage(text="Right - "),
# BaseMessage(text="Let me see - "),
]
FILLER_AUDIO_PATH = os.path.join(os.path.dirname(__file__), "filler_audio")
TYPING_NOISE_PATH = "%s/typing-noise.wav" % FILLER_AUDIO_PATH
Expand Down Expand Up @@ -67,7 +68,7 @@ def __init__(self, chunk: bytes, is_last_chunk: bool):
def __init__(
self,
chunk_generator: AsyncGenerator[ChunkResult, None],
get_message_up_to: Callable[[int], str],
get_message_up_to: Callable[[float], str],
):
self.chunk_generator = chunk_generator
self.get_message_up_to = get_message_up_to
Expand Down Expand Up @@ -172,20 +173,23 @@ def ready_synthesizer(self):
pass

# Given the number of seconds the message was allowed to run, estimate how far into the message text we got.
@staticmethod
def get_message_cutoff_from_total_response_length(
self, message: BaseMessage, seconds: int, size_of_output: int
synthesizer_config: SynthesizerConfig,
message: BaseMessage,
seconds: float,
size_of_output: int,
) -> str:
estimated_output_seconds = (
size_of_output / self.synthesizer_config.sampling_rate
)
estimated_output_seconds = size_of_output / synthesizer_config.sampling_rate
if not message.text:
return message.text

estimated_output_seconds_per_char = estimated_output_seconds / len(message.text)
return message.text[: int(seconds / estimated_output_seconds_per_char)]

@staticmethod
def get_message_cutoff_from_voice_speed(
self, message: BaseMessage, seconds: int, words_per_minute: int
message: BaseMessage, seconds: float, words_per_minute: int
) -> str:
words_per_second = words_per_minute / 60
estimated_words_spoken = math.floor(words_per_second * seconds)
Expand All @@ -203,19 +207,21 @@ async def create_speech(
raise NotImplementedError

# @param file - a file-like object in wav format
@staticmethod
def create_synthesis_result_from_wav(
self, file: Any, message: BaseMessage, chunk_size: int
synthesizer_config: SynthesizerConfig,
file: Any,
message: BaseMessage,
chunk_size: int,
) -> SynthesisResult:
output_bytes = convert_wav(
file,
output_sample_rate=self.synthesizer_config.sampling_rate,
output_encoding=self.synthesizer_config.audio_encoding,
output_sample_rate=synthesizer_config.sampling_rate,
output_encoding=synthesizer_config.audio_encoding,
)

if self.synthesizer_config.should_encode_as_wav:
chunk_transform = lambda chunk: encode_as_wav(
chunk, self.synthesizer_config
)
if synthesizer_config.should_encode_as_wav:
chunk_transform = lambda chunk: encode_as_wav(chunk, synthesizer_config)
else:
chunk_transform = lambda chunk: chunk

Expand All @@ -232,8 +238,8 @@ async def chunk_generator(output_bytes):

return SynthesisResult(
chunk_generator(output_bytes),
lambda seconds: self.get_message_cutoff_from_total_response_length(
message, seconds, len(output_bytes)
lambda seconds: BaseSynthesizer.get_message_cutoff_from_total_response_length(
synthesizer_config, message, seconds, len(output_bytes)
),
)

Expand Down
1 change: 1 addition & 0 deletions vocode/streaming/synthesizer/coqui_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ async def create_speech(
)

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=io.BytesIO(read_response),
message=message,
chunk_size=chunk_size,
Expand Down
5 changes: 4 additions & 1 deletion vocode/streaming/synthesizer/coqui_tts_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ async def create_speech(
audio_segment.export(output_bytes_io, format="wav") # type: ignore

result = self.create_synthesis_result_from_wav(
file=output_bytes_io, message=message, chunk_size=chunk_size
synthesizer_config=self.synthesizer_config,
file=output_bytes_io,
message=message,
chunk_size=chunk_size,
)

convert_span.end()
Expand Down
1 change: 1 addition & 0 deletions vocode/streaming/synthesizer/eleven_labs_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ async def create_speech(
output_bytes_io = decode_mp3(audio_data)

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=output_bytes_io,
message=message,
chunk_size=chunk_size,
Expand Down
1 change: 1 addition & 0 deletions vocode/streaming/synthesizer/google_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ async def create_speech(
output_bytes_io.seek(0)

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=output_bytes_io,
message=message,
chunk_size=chunk_size,
Expand Down
1 change: 1 addition & 0 deletions vocode/streaming/synthesizer/gtts_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def thread():
audio_segment.export(output_bytes_io, format="wav") # type: ignore

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=output_bytes_io,
message=message,
chunk_size=chunk_size,
Expand Down
1 change: 1 addition & 0 deletions vocode/streaming/synthesizer/play_ht_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ async def create_speech(
output_bytes_io = decode_mp3(read_response)

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=output_bytes_io,
message=message,
chunk_size=chunk_size,
Expand Down
2 changes: 1 addition & 1 deletion vocode/streaming/synthesizer/polly_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def get_speech_marks(self, message: str) -> Any:
def get_message_up_to(
self,
message: str,
seconds: int,
seconds: float,
word_events,
) -> str:
for event in word_events:
Expand Down
5 changes: 4 additions & 1 deletion vocode/streaming/synthesizer/rime_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ async def create_speech(
audio_file = io.BytesIO(base64.b64decode(data.get("audioContent")))

result = self.create_synthesis_result_from_wav(
file=audio_file, message=message, chunk_size=chunk_size
synthesizer_config=self.synthesizer_config,
file=audio_file,
message=message,
chunk_size=chunk_size,
)
convert_span.end()
return result
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ async def create_speech(
audio_segment.export(output_bytes_io, format="wav") # type: ignore

result = self.create_synthesis_result_from_wav(
synthesizer_config=self.synthesizer_config,
file=output_bytes_io,
message=message,
chunk_size=chunk_size,
Expand Down
Loading

0 comments on commit d78004d

Please sign in to comment.