Skip to content

Commit

Permalink
Merge pull request #6 from m5a0r7/better-interruptions
Browse files Browse the repository at this point in the history
Better interruptions
  • Loading branch information
robotaref authored Nov 16, 2023
2 parents 7517fd3 + 5957415 commit f9e28d3
Show file tree
Hide file tree
Showing 24 changed files with 416 additions and 102 deletions.
55 changes: 46 additions & 9 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ langchain = "^0.0.198"
google-cloud-aiplatform = {version = "^1.26.0", optional = true}
miniaudio = "^1.59"
boto3 = "^1.28.28"
webrtcvad = "^2.0.10"


[tool.poetry.group.lint.dependencies]
Expand Down
2 changes: 2 additions & 0 deletions vocode/streaming/models/telephony.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
PunctuationEndpointingConfig,
TranscriberConfig,
)
from vocode.streaming.report.base_call_report import CallReporterConfig
from vocode.streaming.telephony.constants import (
DEFAULT_AUDIO_ENCODING,
DEFAULT_CHUNK_SIZE,
Expand Down Expand Up @@ -97,6 +98,7 @@ class BaseCallConfig(TypedModel, type=CallConfigType.BASE.value):
transcriber_config: TranscriberConfig
agent_config: AgentConfig
synthesizer_config: SynthesizerConfig
call_reporter_config: Optional[CallReporterConfig] = None
from_phone: str
to_phone: str

Expand Down
8 changes: 5 additions & 3 deletions vocode/streaming/models/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .audio_encoding import AudioEncoding
from .model import TypedModel
from vocode.utils.context_tracker import BaseContextTrackerConfig
from ...utils.voice_activity_detection import BaseVoiceActivityDetectorConfig

AZURE_DEFAULT_LANGUAGE = "en-US"

Expand Down Expand Up @@ -44,8 +45,8 @@ class TimeEndpointingConfig(EndpointingConfig, type=EndpointingType.TIME_BASED):

class PunctuationEndpointingConfig(
EndpointingConfig, type=EndpointingType.PUNCTUATION_BASED
):
time_cutoff_seconds: float = 0.4
):
time_cutoff_seconds: float = 0.4


class TranscriberConfig(TypedModel, type=TranscriberType.BASE.value):
Expand All @@ -57,9 +58,10 @@ class TranscriberConfig(TypedModel, type=TranscriberType.BASE.value):
min_interrupt_confidence: Optional[float] = None
mute_during_speech: bool = False
context_tracker_config: Optional[BaseContextTrackerConfig] = None
voice_activity_detector_config: Optional[BaseVoiceActivityDetectorConfig] = None
interrupt_on_blockers: bool = False
skip_on_back_track_audio: bool = False
minimum_speaking_duration_to_interupt: float = 0
minimum_speaking_duration_to_interrupt: float = 0

@validator("min_interrupt_confidence")
def min_interrupt_confidence_must_be_between_0_and_1(cls, v):
Expand Down
6 changes: 3 additions & 3 deletions vocode/streaming/models/transcript.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import time
from typing import Any, Dict, List, Optional, Union
from typing import List, Optional

from pydantic import BaseModel, Field
from enum import Enum

from vocode.streaming.models.actions import ActionInput, ActionOutput
from vocode.streaming.models.events import ActionEvent, Sender, Event, EventType

from vocode.streaming.utils.events_manager import EventsManager


Expand Down
File renamed without changes.
27 changes: 27 additions & 0 deletions vocode/streaming/report/api_call_reporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
from typing import Optional

import requests

from vocode.streaming.models.transcript import Transcript
from vocode.streaming.report.base_call_report import BaseCallReporter, CallReporterType, CallReporterConfig


# Configuration for ApiCallReporter; declared with the
# CallReporterType.API type value.
class ApiCallReporterConfig(CallReporterConfig, type=CallReporterType.API.value):
# Endpoint that ApiCallReporter.report sends its HTTP POST to.
url: str


class ApiCallReporter(BaseCallReporter[ApiCallReporterConfig]):
    """Call reporter that POSTs a conversation's message log to ``config.url``."""

    # Upper bound, in seconds, on how long the reporting request may take.
    # Without it ``requests.post`` can wait indefinitely on an unresponsive
    # endpoint, stalling whoever invoked ``report``.
    REQUEST_TIMEOUT_SECONDS = 10

    def __init__(self, config: ApiCallReporterConfig, logger: Optional[logging.Logger] = None):
        super().__init__(config, logger)

    def report(self, conversation_id: str, transcript: Transcript):
        """Send the conversation id, message logs and start time as JSON.

        Reporting is best-effort: transport failures (connection errors,
        timeouts, ...) are logged with their traceback and swallowed so that
        a broken reporting endpoint cannot abort the caller.

        Args:
            conversation_id: Identifier of the conversation being reported.
            transcript: Transcript whose ``event_logs`` and ``start_time``
                are included in the payload.
        """
        data = {
            "conversation_id": conversation_id,
            "logs": self.get_event_logs(transcript),
            "start_time": transcript.start_time,
        }
        self.logger.debug(f"Data to call reporter: {data}")
        try:
            response = requests.post(
                self.config.url,
                json=data,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
        except requests.RequestException:
            self.logger.exception(
                f"Failed to send call report for conversation {conversation_id}"
            )
            return
        self.logger.debug(f"Response from call reporter: {response}")
        if not response.ok:
            # The request went through but the endpoint rejected it; surface
            # that above debug level so dropped reports are noticeable.
            self.logger.warning(
                f"Call reporter responded with HTTP {response.status_code}"
            )
44 changes: 44 additions & 0 deletions vocode/streaming/report/base_call_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import logging
from enum import Enum
from typing import TypeVar, Generic, Optional

from vocode.streaming.models.model import TypedModel
from vocode.streaming.models.transcript import Transcript, Message


# String-valued identifiers for the available call reporter kinds; the
# values are passed as the `type=` argument of the reporter config classes.
class CallReporterType(str, Enum):
BASE = "base_call_report"
API = "api_call_report"


# Base configuration for call reporters. It declares no fields of its own;
# concrete reporter configs subclass it with their own `type` value.
class CallReporterConfig(TypedModel, type=CallReporterType.BASE.value):
pass


# Type variable used to parameterize BaseCallReporter by its config class;
# bound to CallReporterConfig so only reporter configs are accepted.
CallReporterConfigType = TypeVar("CallReporterConfigType", bound=CallReporterConfig)


class BaseCallReporter(Generic[CallReporterConfigType]):
    """Base class for objects that report a conversation's transcript.

    Subclasses implement ``report``; this class stores the config and logger
    and provides ``get_event_logs`` to flatten a transcript's messages.
    """

    def __init__(self, config: CallReporterConfigType, logger: Optional[logging.Logger] = None):
        """Store ``config`` and ``logger`` (defaults to this module's logger)."""
        self.logger: logging.Logger = logger or logging.getLogger(__name__)
        self.config = config

    def get_config(self) -> CallReporterConfigType:
        """Return the config this reporter was constructed with.

        Annotated with the class's type parameter so the concrete config
        type is preserved for subclasses such as ``BaseCallReporter[X]``.
        """
        return self.config

    def report(self, conversation_id: str, transcript: Transcript):
        """Report the given conversation; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError

    @staticmethod
    def get_event_logs(transcript: Transcript) -> list:
        """Return one dict per ``Message`` in ``transcript.event_logs``.

        Entries that are not ``Message`` instances are skipped. Each dict
        holds the message's ``text``, ``sender`` (its ``.value``),
        ``timestamp`` and ``confidence``, in transcript order.
        """
        return [
            {
                'text': event_log.text,
                'sender': event_log.sender.value,
                'timestamp': event_log.timestamp,
                'confidence': event_log.confidence,
            }
            for event_log in transcript.event_logs
            if isinstance(event_log, Message)
        ]
12 changes: 12 additions & 0 deletions vocode/streaming/report/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import logging
from typing import Optional

from vocode.streaming.report.api_call_reporter import ApiCallReporterConfig, ApiCallReporter
from vocode.streaming.report.base_call_report import CallReporterConfig


class CallReporterFactory:
    """Builds the call reporter that corresponds to a call-reporter config."""

    @staticmethod
    def create_call_reporter(
        config: Optional[CallReporterConfig],
        logger: Optional[logging.Logger] = None,
    ) -> Optional[ApiCallReporter]:
        """Create a reporter for ``config``, or return ``None``.

        Args:
            config: Reporter configuration. ``None`` means reporting is not
                configured and yields ``None``.
            logger: Logger handed to the created reporter.

        Returns:
            An ``ApiCallReporter`` for an ``ApiCallReporterConfig``;
            ``None`` when ``config`` is ``None`` or of an unsupported type.
        """
        if config is None:
            return None
        if isinstance(config, ApiCallReporterConfig):
            return ApiCallReporter(config, logger)
        # Keep returning None for unsupported configs, but make the
        # misconfiguration visible instead of silently disabling reporting.
        (logger or logging.getLogger(__name__)).warning(
            "No call reporter available for config type %s; call reporting disabled",
            type(config).__name__,
        )
        return None
24 changes: 18 additions & 6 deletions vocode/streaming/streaming_conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
TranscriptCompleteEvent,
)
from vocode.streaming.output_device.base_output_device import BaseOutputDevice
from vocode.streaming.report.base_call_report import CallReporterConfig, BaseCallReporter
from vocode.streaming.response_worker.random_response import RandomAudioManager
from vocode.streaming.synthesizer.base_synthesizer import (
BaseSynthesizer,
Expand All @@ -47,7 +48,7 @@
from vocode.streaming.telephony.noise_canceler.base_noise_canceler import BaseNoiseCanceler
from vocode.streaming.transcriber.base_transcriber import (
Transcription,
BaseTranscriber,
BaseTranscriber, HUMAN_ACTIVITY_DETECTED,
)
from vocode.streaming.utils import create_conversation_id, get_chunk_size_per_second
from vocode.streaming.utils.conversation_logger_adapter import wrap_logger
Expand Down Expand Up @@ -138,7 +139,9 @@ async def process(self, transcription: Transcription):
)
self.conversation.is_human_speaking = not transcription.is_final
if transcription.is_final:
# we use getattr here to avoid the dependency cycle between VonageCall and StreamingConversation
if self.conversation.transcriber.transcriber_config.voice_activity_detector_config and \
transcription.message == HUMAN_ACTIVITY_DETECTED:
return
event = self.interruptable_event_factory.create_interruptable_event(
TranscriptionAgentInput(
transcription=transcription,
Expand Down Expand Up @@ -185,7 +188,6 @@ async def process(self, item: InterruptableAgentResponseEvent[AgentResponse]):
return
try:
agent_response = item.payload
self.conversation.logger.debug("Got agent response: {}".format(agent_response))
if isinstance(agent_response, AgentResponseFillerAudio):
self.conversation.random_audio_manager.sync_send_filler_audio(item.agent_response_tracker)
return
Expand Down Expand Up @@ -296,6 +298,7 @@ def __init__(
agent: BaseAgent,
synthesizer: BaseSynthesizer,
noise_canceler: Optional[BaseNoiseCanceler] = None,
call_reporter: Optional[BaseCallReporter] = None,
conversation_id: Optional[str] = None,
per_chunk_allowance_seconds: float = PER_CHUNK_ALLOWANCE_SECONDS,
events_manager: Optional[EventsManager] = None,
Expand All @@ -307,6 +310,7 @@ def __init__(
logger or logging.getLogger(__name__),
conversation_id=self.id,
)
self.call_reporter = call_reporter
self.output_device = output_device
self.transcriber = transcriber
self.agent = agent
Expand Down Expand Up @@ -494,9 +498,12 @@ def broadcast_interrupt(self):
return num_interrupts > 0

def is_interrupt(self, transcription: Transcription):
return transcription.confidence >= (
self.transcriber.get_transcriber_config().min_interrupt_confidence or 0
)
interrupt_by_confidence = transcription.confidence >= (
self.transcriber.get_transcriber_config().min_interrupt_confidence or 0)
interrupt_by_vad = (
self.transcriber.transcriber_config.voice_activity_detector_config and
transcription.message == HUMAN_ACTIVITY_DETECTED)
return interrupt_by_confidence or interrupt_by_vad

async def send_speech_to_output(
self,
Expand Down Expand Up @@ -579,6 +586,10 @@ async def send_speech_to_output(
def mark_terminated(self):
# Flag this conversation as no longer active by clearing `self.active`.
self.active = False

def report_call(self):
    """Hand this conversation's id and transcript to the call reporter.

    Does nothing when no call reporter is set.
    """
    reporter = self.call_reporter
    if not reporter:
        return
    reporter.report(self.id, self.transcript)

async def terminate(self):
self.mark_terminated()
self.broadcast_interrupt()
Expand Down Expand Up @@ -612,6 +623,7 @@ async def terminate(self):
self.logger.debug("Terminating speech transcriber")
self.transcriber.terminate()
self.logger.debug("Terminating transcriptions worker")
self.report_call()
self.transcriptions_worker.terminate()
self.logger.debug("Terminating final transcriptions worker")
self.agent_responses_worker.terminate()
Expand Down
3 changes: 0 additions & 3 deletions vocode/streaming/synthesizer/eleven_labs_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,6 @@ async def get_audio_data_from_cache_or_download(self, phrase: BaseMessage, base_
)
filler_audio_path = os.path.join(base_path, f"{cache_key}.wav")
if not os.path.exists(filler_audio_path):
print('$#@$!#@$!@#'*10)
print(filler_audio_path)
print('$#@$!#@$!@#'*10)
self.logger.debug(f"Generating cached audio for {phrase.text}")
audio_data = await self.download_filler_audio_data(phrase)

Expand Down
5 changes: 5 additions & 0 deletions vocode/streaming/telephony/conversation/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
)
from vocode.streaming.output_device.twilio_output_device import TwilioOutputDevice
from vocode.streaming.output_device.vonage_output_device import VonageOutputDevice
from vocode.streaming.report.base_call_report import CallReporterConfig
from vocode.streaming.report.factory import CallReporterFactory
from vocode.streaming.streaming_conversation import StreamingConversation
from vocode.streaming.synthesizer.factory import SynthesizerFactory
from vocode.streaming.telephony.config_manager.base_config_manager import (
Expand Down Expand Up @@ -47,9 +49,11 @@ def __init__(
agent_factory: AgentFactory = AgentFactory(),
synthesizer_factory: SynthesizerFactory = SynthesizerFactory(),
noise_canceler_factory: NoiseCancelerFactory = NoiseCancelerFactory(),
call_reporter_factory: CallReporterFactory = CallReporterFactory(),
events_manager: Optional[EventsManager] = None,
logger: Optional[logging.Logger] = None,
noise_canceling_config: Optional[NoiseCancelingConfig] = None,
call_reporter_config: Optional[CallReporterConfig] = None,
):
conversation_id = conversation_id or create_conversation_id()
logger = wrap_logger(
Expand All @@ -67,6 +71,7 @@ def __init__(
agent_factory.create_agent(agent_config, logger=logger),
synthesizer_factory.create_synthesizer(synthesizer_config, logger=logger),
noise_canceler_factory.create_noise_canceler(noise_canceling_config, logger=logger),
call_reporter_factory.create_call_reporter(call_reporter_config, logger=logger),
conversation_id=conversation_id,
per_chunk_allowance_seconds=0.01,
events_manager=events_manager,
Expand Down
45 changes: 25 additions & 20 deletions vocode/streaming/telephony/conversation/outbound_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from vocode.streaming.models.transcriber import (
TranscriberConfig,
)
from vocode.streaming.report.base_call_report import CallReporterConfig
from vocode.streaming.telephony.client.base_telephony_client import BaseTelephonyClient
from vocode.streaming.telephony.client.twilio_client import TwilioClient
from vocode.streaming.telephony.client.vonage_client import VonageClient
Expand All @@ -27,24 +28,25 @@

class OutboundCall:
def __init__(
self,
base_url: str,
to_phone: str,
from_phone: str,
config_manager: BaseConfigManager,
agent_config: AgentConfig,
twilio_config: Optional[TwilioConfig] = None,
vonage_config: Optional[VonageConfig] = None,
transcriber_config: Optional[TranscriberConfig] = None,
synthesizer_config: Optional[SynthesizerConfig] = None,
conversation_id: Optional[str] = None,
logger: Optional[logging.Logger] = None,
mobile_only: bool = True,
digits: Optional[
str
] = None, # Keys to press when the call connects, see send_digits https://www.twilio.com/docs/voice/api/call-resource#create-a-call-resource
output_to_speaker: bool = False,
):
self,
base_url: str,
to_phone: str,
from_phone: str,
config_manager: BaseConfigManager,
agent_config: AgentConfig,
twilio_config: Optional[TwilioConfig] = None,
vonage_config: Optional[VonageConfig] = None,
transcriber_config: Optional[TranscriberConfig] = None,
synthesizer_config: Optional[SynthesizerConfig] = None,
conversation_id: Optional[str] = None,
logger: Optional[logging.Logger] = None,
mobile_only: bool = True,
digits: Optional[
str
] = None,
# Keys to press when the call connects, see send_digits https://www.twilio.com/docs/voice/api/call-resource#create-a-call-resource
output_to_speaker: bool = False,
call_reporter_config: [CallReporterConfig] = None):
self.base_url = base_url
self.to_phone = to_phone
self.digits = digits
Expand Down Expand Up @@ -72,6 +74,7 @@ def __init__(
self.synthesizer_config = self.create_synthesizer_config(synthesizer_config)
self.telephony_id = None
self.output_to_speaker = output_to_speaker
self.call_reporter_config = call_reporter_config

def create_telephony_client(self) -> BaseTelephonyClient:
if self.twilio_config is not None:
Expand All @@ -86,7 +89,7 @@ def create_telephony_client(self) -> BaseTelephonyClient:
raise ValueError("No telephony config provided")

def create_transcriber_config(
self, transcriber_config_override: Optional[TranscriberConfig]
self, transcriber_config_override: Optional[TranscriberConfig]
) -> TranscriberConfig:
if transcriber_config_override is not None:
return transcriber_config_override
Expand All @@ -98,7 +101,7 @@ def create_transcriber_config(
raise ValueError("No telephony config provided")

def create_synthesizer_config(
self, synthesizer_config_override: Optional[SynthesizerConfig]
self, synthesizer_config_override: Optional[SynthesizerConfig]
) -> SynthesizerConfig:
if synthesizer_config_override is not None:
return synthesizer_config_override
Expand Down Expand Up @@ -132,6 +135,7 @@ async def start(self):
twilio_sid=self.telephony_id,
from_phone=self.from_phone,
to_phone=self.to_phone,
call_reporter_config=self.call_reporter_config,
)
elif isinstance(self.telephony_client, VonageClient):
call_config = VonageCallConfig(
Expand All @@ -143,6 +147,7 @@ async def start(self):
from_phone=self.from_phone,
to_phone=self.to_phone,
output_to_speaker=self.output_to_speaker,
call_reporter_config=self.call_reporter_config,
)
else:
raise ValueError("Unknown telephony client")
Expand Down
3 changes: 3 additions & 0 deletions vocode/streaming/telephony/conversation/twilio_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from vocode.streaming.models.transcriber import (
TranscriberConfig,
)
from vocode.streaming.report.base_call_report import CallReporterConfig
from vocode.streaming.synthesizer.factory import SynthesizerFactory
from vocode.streaming.telephony.client.twilio_client import TwilioClient
from vocode.streaming.telephony.config_manager.base_config_manager import (
Expand Down Expand Up @@ -51,6 +52,7 @@ def __init__(
synthesizer_factory: SynthesizerFactory = SynthesizerFactory(),
events_manager: Optional[EventsManager] = None,
logger: Optional[logging.Logger] = None,
call_reporter_config: Optional[CallReporterConfig] = None,
):
noise_canceling_config = twilio_config.noise_canceling_config if twilio_config else None
super().__init__(
Expand All @@ -69,6 +71,7 @@ def __init__(
synthesizer_factory=synthesizer_factory,
logger=logger,
noise_canceling_config=noise_canceling_config,
call_reporter_config=call_reporter_config,
)
self.base_url = base_url
self.config_manager = config_manager
Expand Down
Loading

0 comments on commit f9e28d3

Please sign in to comment.