Skip to content

Commit

Permalink
feat/extract_speech (#139)
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl authored Sep 8, 2023
1 parent 53923c7 commit 16f19d4
Showing 1 changed file with 76 additions and 0 deletions.
76 changes: 76 additions & 0 deletions ovos_plugin_manager/templates/vad.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,32 @@
import abc
import collections

from ovos_config import Configuration
from ovos_utils import classproperty
from ovos_utils.process_utils import RuntimeRequirements


class AudioFrame:
    """A single fixed-length "frame" of raw audio data.

    Attributes:
        bytes: the raw audio bytes that make up this frame.
        timestamp: offset of the frame from the start of the audio it was
            cut from, in the same unit as ``duration``.
        duration: length of the frame. ``VADEngine._frame_generator``
            fills this with a float number of seconds (samples divided by
            sample rate), hence the ``float`` annotation.
    """

    def __init__(self, audio: bytes, timestamp: float, duration: float):
        # NOTE: the attribute is called ``bytes`` (not ``audio``); callers
        # read ``frame.bytes``, so the name must stay as is.
        self.bytes = audio
        self.timestamp = timestamp
        self.duration = duration


class VADEngine:
def __init__(self, config=None, sample_rate=None):
    """Set up the engine from plugin config and the core configuration.

    Args:
        config: plugin-specific settings; an empty dict is used when
            nothing (or something falsy) is passed.
        sample_rate: audio sample rate; when falsy it is read from the
            ``listener`` section of the core configuration, falling back
            to 16000.
    """
    self.config_core = Configuration()
    self.config = config or {}

    # Only consult the core configuration when no explicit rate was given.
    if not sample_rate:
        listener_config = self.config_core.get("listener", {})
        sample_rate = listener_config.get("sample_rate", 16000)
    self.sample_rate = sample_rate

    settings = self.config
    self.padding_duration_ms = settings.get("padding_duration_ms", 300)
    self.frame_duration_ms = settings.get("frame_duration_ms", 30)
    self.thresh = settings.get("thresh", 0.8)

    # Size of the sliding window used by extract_speech, in frames.
    self.num_padding_frames = int(
        self.padding_duration_ms / self.frame_duration_ms
    )

@classproperty
def runtime_requirements(self):
""" skill developers should override this if they do not require connectivity
Expand Down Expand Up @@ -45,6 +62,65 @@ def runtime_requirements(self):
no_internet_fallback=True,
no_network_fallback=True)

def _frame_generator(self, audio: bytes):
    """Split raw PCM audio into consecutive fixed-length frames.

    The frame length is derived from ``self.frame_duration_ms`` and
    ``self.sample_rate``. Samples are taken to be 2 bytes wide (the
    original hard-coded ``* 2`` factor), i.e. presumably 16-bit mono PCM.

    Args:
        audio: raw PCM bytes to split.

    Yields:
        AudioFrame: successive frames, each carrying its start
        ``timestamp`` and ``duration`` in seconds. Trailing bytes that do
        not fill a whole frame are dropped.

    Raises:
        ValueError: when the configured frame length is shorter than one
            sample. Because this is a generator, the error surfaces on the
            first iteration rather than at call time.
    """
    bytes_per_sample = 2
    # Truncate to whole samples BEFORE converting to bytes so a frame can
    # never end in the middle of a 2-byte sample.
    samples_per_frame = int(
        self.sample_rate * self.frame_duration_ms / 1000.0
    )
    if samples_per_frame <= 0:
        # A zero-length frame would otherwise loop forever yielding
        # empty frames.
        raise ValueError(
            "frame_duration_ms and sample_rate must yield at least one "
            "sample per frame"
        )

    n = samples_per_frame * bytes_per_sample
    duration = samples_per_frame / float(self.sample_rate)
    timestamp = 0.0

    # range() stops before any offset that cannot hold a full frame.
    for offset in range(0, len(audio) - n + 1, n):
        yield AudioFrame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration

def extract_speech(self, audio: bytes):
    """Return only the speech portion of ``audio``.

    The audio is cut into frames and each frame is classified with
    ``self.is_silence``. A sliding window of ``self.num_padding_frames``
    frames decides when speech starts and ends:

    * speech starts once the number of voiced frames in the window
      exceeds ``self.thresh * window size``; the frames already in the
      window are kept as leading padding.
    * speech ends once the number of unvoiced frames in the window
      exceeds the same limit.

    Args:
        audio: raw PCM bytes to scan.

    Returns:
        The bytes from the start of the detected speech up to the frame
        at which the end of speech was detected. If the audio runs out
        while speech is still ongoing, everything collected so far is
        returned instead of being discarded. ``None`` when no speech was
        detected at all.
    """
    # A bounded deque acts as the sliding window / ring buffer.
    ring_buffer = collections.deque(maxlen=self.num_padding_frames)
    # Count the window must exceed to flip state; constant for the call.
    state_change_count = self.thresh * ring_buffer.maxlen
    triggered = False
    voiced_frames = []

    for frame in self._frame_generator(audio):
        is_speech = not self.is_silence(frame.bytes)
        ring_buffer.append((frame, is_speech))

        if not triggered:
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > state_change_count:
                # Enough voiced frames in the window: speech has begun.
                # Keep what is already buffered so the onset is not
                # clipped, then start the window afresh.
                triggered = True
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            # Inside speech: keep every frame while watching for the
            # window to fill up with silence.
            voiced_frames.append(frame)
            num_unvoiced = sum(
                1 for _, speech in ring_buffer if not speech
            )
            if num_unvoiced > state_change_count:
                return b''.join(f.bytes for f in voiced_frames)

    if triggered:
        # Audio ended before trailing silence was seen; the collected
        # speech is still valid, so return it rather than dropping it.
        return b''.join(f.bytes for f in voiced_frames)
    return None

@abc.abstractmethod
def is_silence(self, chunk):
    """Tell whether ``chunk`` holds no speech.

    Concrete engines are expected to override this. The default
    implementation never reports silence.

    Args:
        chunk: the audio data to classify.

    Returns:
        ``True`` for silence, ``False`` for speech; always ``False`` here.
    """
    return False

def reset(self):
    """Reset hook; the base implementation does nothing."""
    return None

0 comments on commit 16f19d4

Please sign in to comment.