Merge pull request #6 from m5a0r7/better-interruptions

Better interruptions
m5a0r7 · Nov 16, 2023 · f9e28d3 · f9e28d3
2 parents 7517fd3 + 5957415
commit f9e28d3
Show file tree

Hide file tree

Showing 24 changed files with 416 additions and 102 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,7 @@ langchain = "^0.0.198"
 google-cloud-aiplatform = {version = "^1.26.0", optional = true}
 miniaudio = "^1.59"
 boto3 = "^1.28.28"
+webrtcvad = "^2.0.10"
 
 
 [tool.poetry.group.lint.dependencies]

diff --git a/vocode/streaming/models/telephony.py b/vocode/streaming/models/telephony.py
@@ -12,6 +12,7 @@
     PunctuationEndpointingConfig,
     TranscriberConfig,
 )
+from vocode.streaming.report.base_call_report import CallReporterConfig
 from vocode.streaming.telephony.constants import (
     DEFAULT_AUDIO_ENCODING,
     DEFAULT_CHUNK_SIZE,
@@ -97,6 +98,7 @@ class BaseCallConfig(TypedModel, type=CallConfigType.BASE.value):
     transcriber_config: TranscriberConfig
     agent_config: AgentConfig
     synthesizer_config: SynthesizerConfig
+    call_reporter_config: Optional[CallReporterConfig] = None
     from_phone: str
     to_phone: str
 

diff --git a/vocode/streaming/models/transcriber.py b/vocode/streaming/models/transcriber.py
@@ -13,6 +13,7 @@
 from .audio_encoding import AudioEncoding
 from .model import TypedModel
 from vocode.utils.context_tracker import BaseContextTrackerConfig
+from ...utils.voice_activity_detection import BaseVoiceActivityDetectorConfig
 
 AZURE_DEFAULT_LANGUAGE = "en-US"
 
@@ -44,8 +45,8 @@ class TimeEndpointingConfig(EndpointingConfig, type=EndpointingType.TIME_BASED):
 
 class PunctuationEndpointingConfig(
     EndpointingConfig, type=EndpointingType.PUNCTUATION_BASED
-): 
-   time_cutoff_seconds: float = 0.4
+):
+    time_cutoff_seconds: float = 0.4
 
 
 class TranscriberConfig(TypedModel, type=TranscriberType.BASE.value):
@@ -57,9 +58,10 @@ class TranscriberConfig(TypedModel, type=TranscriberType.BASE.value):
     min_interrupt_confidence: Optional[float] = None
     mute_during_speech: bool = False
     context_tracker_config: Optional[BaseContextTrackerConfig] = None
+    voice_activity_detector_config: Optional[BaseVoiceActivityDetectorConfig] = None
     interrupt_on_blockers: bool = False
     skip_on_back_track_audio: bool = False
-    minimum_speaking_duration_to_interupt: float = 0
+    minimum_speaking_duration_to_interrupt: float = 0
 
     @validator("min_interrupt_confidence")
     def min_interrupt_confidence_must_be_between_0_and_1(cls, v):

diff --git a/vocode/streaming/models/transcript.py b/vocode/streaming/models/transcript.py
@@ -1,10 +1,10 @@
 import time
-from typing import Any, Dict, List, Optional, Union
+from typing import List, Optional
+
 from pydantic import BaseModel, Field
-from enum import Enum
+
 from vocode.streaming.models.actions import ActionInput, ActionOutput
 from vocode.streaming.models.events import ActionEvent, Sender, Event, EventType
-
 from vocode.streaming.utils.events_manager import EventsManager
 
 

diff --git a/...s/context_tracker/base_context_tracker.py → vocode/streaming/report/__init__.py b/...s/context_tracker/base_context_tracker.py → vocode/streaming/report/__init__.py
diff --git a/vocode/streaming/report/api_call_reporter.py b/vocode/streaming/report/api_call_reporter.py
@@ -0,0 +1,46 @@
+import logging
+from typing import Optional
+
+import requests
+
+from vocode.streaming.models.transcript import Transcript
+from vocode.streaming.report.base_call_report import BaseCallReporter, CallReporterType, CallReporterConfig
+
+
+class ApiCallReporterConfig(CallReporterConfig, type=CallReporterType.API.value):
+    """Configuration for a reporter that POSTs call reports to an HTTP endpoint."""
+
+    # URL that receives the JSON report.
+    url: str
+
+
+class ApiCallReporter(BaseCallReporter[ApiCallReporterConfig]):
+    """Reporter that sends a conversation's message log to ``config.url``."""
+
+    # Upper bound, in seconds, on how long the report request may block.
+    REQUEST_TIMEOUT_SECONDS = 10
+
+    def __init__(self, config: ApiCallReporterConfig, logger: Optional[logging.Logger] = None):
+        super().__init__(config, logger)
+
+    def report(self, conversation_id: str, transcript: Transcript):
+        """POST the conversation id, message logs and transcript start time as JSON.
+
+        Request failures and timeouts are logged instead of raised, so a
+        broken or slow endpoint cannot raise into (or hang) the caller.
+        """
+        logs = self.get_event_logs(transcript)
+        data = {
+            "conversation_id": conversation_id,
+            "logs": logs,
+            "start_time": transcript.start_time,
+        }
+        self.logger.debug(f"Data to call reporter: {data}")
+        try:
+            response = requests.post(
+                self.config.url, json=data, timeout=self.REQUEST_TIMEOUT_SECONDS
+            )
+        except requests.RequestException:
+            self.logger.exception("Failed to send call report")
+            return
+        self.logger.debug(f"Response from call reporter: {response}")
diff --git a/vocode/streaming/report/base_call_report.py b/vocode/streaming/report/base_call_report.py
@@ -0,0 +1,53 @@
+import logging
+from enum import Enum
+from typing import TypeVar, Generic, Optional
+
+from vocode.streaming.models.model import TypedModel
+from vocode.streaming.models.transcript import Transcript, Message
+
+
+class CallReporterType(str, Enum):
+    """Identifiers for the available call reporter configurations."""
+
+    BASE = "base_call_report"
+    API = "api_call_report"
+
+
+class CallReporterConfig(TypedModel, type=CallReporterType.BASE.value):
+    """Base configuration for call reporters; declares no fields of its own."""
+
+
+CallReporterConfigType = TypeVar("CallReporterConfigType", bound=CallReporterConfig)
+
+
+class BaseCallReporter(Generic[CallReporterConfigType]):
+    """Base class for reporters that publish a conversation's transcript."""
+
+    def __init__(self, config: CallReporterConfigType, logger: Optional[logging.Logger] = None):
+        self.config = config
+        self.logger: logging.Logger = logger or logging.getLogger(__name__)
+
+    def get_config(self) -> CallReporterConfigType:
+        """Return the configuration this reporter was created with."""
+        return self.config
+
+    def report(self, conversation_id: str, transcript: Transcript):
+        """Publish the transcript of ``conversation_id``; subclasses must override."""
+        raise NotImplementedError
+
+    @staticmethod
+    def get_event_logs(transcript):
+        """Return one dict per ``Message`` in ``transcript.event_logs``, in order.
+
+        Entries that are not ``Message`` instances are skipped.
+        """
+        return [
+            {
+                'text': message.text,
+                'sender': message.sender.value,
+                'timestamp': message.timestamp,
+                'confidence': message.confidence,
+            }
+            for message in transcript.event_logs
+            if isinstance(message, Message)
+        ]
diff --git a/vocode/streaming/report/factory.py b/vocode/streaming/report/factory.py
@@ -0,0 +1,22 @@
+import logging
+from typing import Optional
+
+from vocode.streaming.report.api_call_reporter import ApiCallReporterConfig, ApiCallReporter
+from vocode.streaming.report.base_call_report import CallReporterConfig
+
+
+class CallReporterFactory:
+    """Builds the call reporter that matches a reporter configuration."""
+
+    @staticmethod
+    def create_call_reporter(
+            config: Optional[CallReporterConfig],
+            logger: Optional[logging.Logger] = None,
+    ) -> Optional[ApiCallReporter]:
+        """Return an ``ApiCallReporter`` for an ``ApiCallReporterConfig``.
+
+        Any other value, including ``None``, yields ``None`` (no reporter).
+        """
+        if not isinstance(config, ApiCallReporterConfig):
+            return None
+        return ApiCallReporter(config, logger)
diff --git a/vocode/streaming/streaming_conversation.py b/vocode/streaming/streaming_conversation.py
@@ -39,6 +39,7 @@
     TranscriptCompleteEvent,
 )
 from vocode.streaming.output_device.base_output_device import BaseOutputDevice
+from vocode.streaming.report.base_call_report import CallReporterConfig, BaseCallReporter
 from vocode.streaming.response_worker.random_response import RandomAudioManager
 from vocode.streaming.synthesizer.base_synthesizer import (
     BaseSynthesizer,
@@ -47,7 +48,7 @@
 from vocode.streaming.telephony.noise_canceler.base_noise_canceler import BaseNoiseCanceler
 from vocode.streaming.transcriber.base_transcriber import (
     Transcription,
-    BaseTranscriber,
+    BaseTranscriber, HUMAN_ACTIVITY_DETECTED,
 )
 from vocode.streaming.utils import create_conversation_id, get_chunk_size_per_second
 from vocode.streaming.utils.conversation_logger_adapter import wrap_logger
@@ -138,7 +139,9 @@ async def process(self, transcription: Transcription):
             )
             self.conversation.is_human_speaking = not transcription.is_final
             if transcription.is_final:
-                # we use getattr here to avoid the dependency cycle between VonageCall and StreamingConversation
+                if self.conversation.transcriber.transcriber_config.voice_activity_detector_config and \
+                        transcription.message == HUMAN_ACTIVITY_DETECTED:
+                    return
                 event = self.interruptable_event_factory.create_interruptable_event(
                     TranscriptionAgentInput(
                         transcription=transcription,
@@ -185,7 +188,6 @@ async def process(self, item: InterruptableAgentResponseEvent[AgentResponse]):
                 return
             try:
                 agent_response = item.payload
-                self.conversation.logger.debug("Got agent response: {}".format(agent_response))
                 if isinstance(agent_response, AgentResponseFillerAudio):
                     self.conversation.random_audio_manager.sync_send_filler_audio(item.agent_response_tracker)
                     return
@@ -296,6 +298,7 @@ def __init__(
             agent: BaseAgent,
             synthesizer: BaseSynthesizer,
             noise_canceler: Optional[BaseNoiseCanceler] = None,
+            call_reporter: Optional[BaseCallReporter] = None,
             conversation_id: Optional[str] = None,
             per_chunk_allowance_seconds: float = PER_CHUNK_ALLOWANCE_SECONDS,
             events_manager: Optional[EventsManager] = None,
@@ -307,6 +310,7 @@ def __init__(
             logger or logging.getLogger(__name__),
             conversation_id=self.id,
         )
+        self.call_reporter = call_reporter
         self.output_device = output_device
         self.transcriber = transcriber
         self.agent = agent
@@ -494,9 +498,12 @@ def broadcast_interrupt(self):
         return num_interrupts > 0
 
     def is_interrupt(self, transcription: Transcription):
-        return transcription.confidence >= (
-                self.transcriber.get_transcriber_config().min_interrupt_confidence or 0
-        )
+        interrupt_by_confidence = transcription.confidence >= (
+                self.transcriber.get_transcriber_config().min_interrupt_confidence or 0)
+        interrupt_by_vad = (
+                self.transcriber.transcriber_config.voice_activity_detector_config and
+                transcription.message == HUMAN_ACTIVITY_DETECTED)
+        return interrupt_by_confidence or interrupt_by_vad
 
     async def send_speech_to_output(
             self,
@@ -579,6 +586,10 @@ async def send_speech_to_output(
     def mark_terminated(self):
         self.active = False
 
+    def report_call(self):
+        if self.call_reporter:
+            self.call_reporter.report(self.id, self.transcript)
+
     async def terminate(self):
         self.mark_terminated()
         self.broadcast_interrupt()
@@ -612,6 +623,7 @@ async def terminate(self):
         self.logger.debug("Terminating speech transcriber")
         self.transcriber.terminate()
         self.logger.debug("Terminating transcriptions worker")
+        self.report_call()
         self.transcriptions_worker.terminate()
         self.logger.debug("Terminating final transcriptions worker")
         self.agent_responses_worker.terminate()

diff --git a/vocode/streaming/synthesizer/eleven_labs_synthesizer.py b/vocode/streaming/synthesizer/eleven_labs_synthesizer.py
@@ -174,9 +174,6 @@ async def get_audio_data_from_cache_or_download(self, phrase: BaseMessage, base_
         )
         filler_audio_path = os.path.join(base_path, f"{cache_key}.wav")
         if not os.path.exists(filler_audio_path):
-            print('$#@$!#@$!@#'*10)
-            print(filler_audio_path)
-            print('$#@$!#@$!@#'*10)       
             self.logger.debug(f"Generating cached audio for {phrase.text}")
             audio_data = await self.download_filler_audio_data(phrase)
 

diff --git a/vocode/streaming/telephony/conversation/call.py b/vocode/streaming/telephony/conversation/call.py
@@ -14,6 +14,8 @@
 )
 from vocode.streaming.output_device.twilio_output_device import TwilioOutputDevice
 from vocode.streaming.output_device.vonage_output_device import VonageOutputDevice
+from vocode.streaming.report.base_call_report import CallReporterConfig
+from vocode.streaming.report.factory import CallReporterFactory
 from vocode.streaming.streaming_conversation import StreamingConversation
 from vocode.streaming.synthesizer.factory import SynthesizerFactory
 from vocode.streaming.telephony.config_manager.base_config_manager import (
@@ -47,9 +49,11 @@ def __init__(
             agent_factory: AgentFactory = AgentFactory(),
             synthesizer_factory: SynthesizerFactory = SynthesizerFactory(),
             noise_canceler_factory: NoiseCancelerFactory = NoiseCancelerFactory(),
+            call_reporter_factory: CallReporterFactory = CallReporterFactory(),
             events_manager: Optional[EventsManager] = None,
             logger: Optional[logging.Logger] = None,
             noise_canceling_config: Optional[NoiseCancelingConfig] = None,
+            call_reporter_config: Optional[CallReporterConfig] = None,
     ):
         conversation_id = conversation_id or create_conversation_id()
         logger = wrap_logger(
@@ -67,6 +71,7 @@ def __init__(
             agent_factory.create_agent(agent_config, logger=logger),
             synthesizer_factory.create_synthesizer(synthesizer_config, logger=logger),
             noise_canceler_factory.create_noise_canceler(noise_canceling_config, logger=logger),
+            call_reporter_factory.create_call_reporter(call_reporter_config, logger=logger),
             conversation_id=conversation_id,
             per_chunk_allowance_seconds=0.01,
             events_manager=events_manager,

diff --git a/vocode/streaming/telephony/conversation/outbound_call.py b/vocode/streaming/telephony/conversation/outbound_call.py
@@ -16,6 +16,7 @@
 from vocode.streaming.models.transcriber import (
     TranscriberConfig,
 )
+from vocode.streaming.report.base_call_report import CallReporterConfig
 from vocode.streaming.telephony.client.base_telephony_client import BaseTelephonyClient
 from vocode.streaming.telephony.client.twilio_client import TwilioClient
 from vocode.streaming.telephony.client.vonage_client import VonageClient
@@ -27,24 +28,25 @@
 
 class OutboundCall:
     def __init__(
-        self,
-        base_url: str,
-        to_phone: str,
-        from_phone: str,
-        config_manager: BaseConfigManager,
-        agent_config: AgentConfig,
-        twilio_config: Optional[TwilioConfig] = None,
-        vonage_config: Optional[VonageConfig] = None,
-        transcriber_config: Optional[TranscriberConfig] = None,
-        synthesizer_config: Optional[SynthesizerConfig] = None,
-        conversation_id: Optional[str] = None,
-        logger: Optional[logging.Logger] = None,
-        mobile_only: bool = True,
-        digits: Optional[
-            str
-        ] = None,  # Keys to press when the call connects, see send_digits https://www.twilio.com/docs/voice/api/call-resource#create-a-call-resource
-        output_to_speaker: bool = False,
-    ):
+            self,
+            base_url: str,
+            to_phone: str,
+            from_phone: str,
+            config_manager: BaseConfigManager,
+            agent_config: AgentConfig,
+            twilio_config: Optional[TwilioConfig] = None,
+            vonage_config: Optional[VonageConfig] = None,
+            transcriber_config: Optional[TranscriberConfig] = None,
+            synthesizer_config: Optional[SynthesizerConfig] = None,
+            conversation_id: Optional[str] = None,
+            logger: Optional[logging.Logger] = None,
+            mobile_only: bool = True,
+            # Keys to press when the call connects, see send_digits https://www.twilio.com/docs/voice/api/call-resource#create-a-call-resource
+            digits: Optional[
+                str
+            ] = None,
+            output_to_speaker: bool = False,
+            call_reporter_config: Optional[CallReporterConfig] = None):
         self.base_url = base_url
         self.to_phone = to_phone
         self.digits = digits
@@ -72,6 +74,7 @@ def __init__(
         self.synthesizer_config = self.create_synthesizer_config(synthesizer_config)
         self.telephony_id = None
         self.output_to_speaker = output_to_speaker
+        self.call_reporter_config = call_reporter_config
 
     def create_telephony_client(self) -> BaseTelephonyClient:
         if self.twilio_config is not None:
@@ -86,7 +89,7 @@ def create_telephony_client(self) -> BaseTelephonyClient:
             raise ValueError("No telephony config provided")
 
     def create_transcriber_config(
-        self, transcriber_config_override: Optional[TranscriberConfig]
+            self, transcriber_config_override: Optional[TranscriberConfig]
     ) -> TranscriberConfig:
         if transcriber_config_override is not None:
             return transcriber_config_override
@@ -98,7 +101,7 @@ def create_transcriber_config(
             raise ValueError("No telephony config provided")
 
     def create_synthesizer_config(
-        self, synthesizer_config_override: Optional[SynthesizerConfig]
+            self, synthesizer_config_override: Optional[SynthesizerConfig]
     ) -> SynthesizerConfig:
         if synthesizer_config_override is not None:
             return synthesizer_config_override
@@ -132,6 +135,7 @@ async def start(self):
                 twilio_sid=self.telephony_id,
                 from_phone=self.from_phone,
                 to_phone=self.to_phone,
+                call_reporter_config=self.call_reporter_config,
             )
         elif isinstance(self.telephony_client, VonageClient):
             call_config = VonageCallConfig(
@@ -143,6 +147,7 @@ async def start(self):
                 from_phone=self.from_phone,
                 to_phone=self.to_phone,
                 output_to_speaker=self.output_to_speaker,
+                call_reporter_config=self.call_reporter_config,
             )
         else:
             raise ValueError("Unknown telephony client")

diff --git a/vocode/streaming/telephony/conversation/twilio_call.py b/vocode/streaming/telephony/conversation/twilio_call.py
@@ -18,6 +18,7 @@
 from vocode.streaming.models.transcriber import (
     TranscriberConfig,
 )
+from vocode.streaming.report.base_call_report import CallReporterConfig
 from vocode.streaming.synthesizer.factory import SynthesizerFactory
 from vocode.streaming.telephony.client.twilio_client import TwilioClient
 from vocode.streaming.telephony.config_manager.base_config_manager import (
@@ -51,6 +52,7 @@ def __init__(
             synthesizer_factory: SynthesizerFactory = SynthesizerFactory(),
             events_manager: Optional[EventsManager] = None,
             logger: Optional[logging.Logger] = None,
+            call_reporter_config: Optional[CallReporterConfig] = None,
     ):
         noise_canceling_config = twilio_config.noise_canceling_config if twilio_config else None
         super().__init__(
@@ -69,6 +71,7 @@ def __init__(
             synthesizer_factory=synthesizer_factory,
             logger=logger,
             noise_canceling_config=noise_canceling_config,
+            call_reporter_config=call_reporter_config,
         )
         self.base_url = base_url
         self.config_manager = config_manager