Skip to content

Commit

Permalink
Speed up Feature.process_*() (#103)
Browse files Browse the repository at this point in the history
* avoid use of pd.concat()

* TST: test sliding window on file
  • Loading branch information
frankenjoe authored Mar 13, 2023
1 parent 6722003 commit f4687e1
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 69 deletions.
124 changes: 81 additions & 43 deletions audinterface/core/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,36 +767,97 @@ def _reshape_3d(

def _series_to_frame(
self,
series: pd.Series,
y: pd.Series,
) -> pd.DataFrame:

if series.empty:
if y.empty:
return pd.DataFrame(
columns=self.column_names,
dtype=object,
)

frames = [None] * len(series)
if len(series.index.levels) == 3:
for idx, ((file, start, end), values) in enumerate(series.items()):
frames[idx] = self._values_to_frame(
values, file=file, start=start, end=end,
)
num = len(y)

if (
self.win_dur is not None and
self.process_func_applies_sliding_window
):

win_dur = utils.to_timedelta(
self.win_dur,
self.process.sampling_rate,
)
hop_dur = utils.to_timedelta(
self.hop_dur,
self.process.sampling_rate,
)

starts = []
ends = []
data = []

if len(y.index.levels) == 3:

files = []

for idx, ((file, start, end), values) in enumerate(y.items()):

frames = self._values_to_frame(values)
data.append(frames)

times = pd.timedelta_range(
start,
freq=hop_dur,
periods=frames.shape[0],
)

starts.extend(times.to_list())
ends.extend((times + win_dur).to_list())
files.extend([file] * len(times))

index = audformat.segmented_index(files, starts, ends)

else:

for idx, ((start, end), values) in enumerate(y.items()):
frames = self._values_to_frame(values)
data.append(frames)

times = pd.timedelta_range(
start,
freq=hop_dur,
periods=frames.shape[0],
)

starts.extend(times.to_list())
ends.extend((times + win_dur).to_list())

index = utils.signal_index(starts, ends)

data = np.concatenate(data)

else:
for idx, ((start, end), values) in enumerate(series.items()):
frames[idx] = self._values_to_frame(
values, start=start, end=end,
)
return pd.concat(frames, axis='index')

index = y.index
dtype = self._values_to_frame(y[0]).dtype
shape = (num, len(self.column_names))
data = np.empty(shape, dtype)

for idx, values in enumerate(y):
data[idx, :] = self._values_to_frame(values)

df = pd.DataFrame(
data,
index=index,
columns=self.column_names,
)

return df

def _values_to_frame(
self,
features: np.ndarray,
start: pd.Timedelta,
end: pd.Timedelta,
*,
file: str = None,
) -> pd.DataFrame:
) -> np.ndarray:

# Convert features to an np.ndarray
# Assumed formats are:
Expand All @@ -805,12 +866,6 @@ def _values_to_frame(
# [n_features, n_frames]
# [n_features]

win_dur = self.win_dur
hop_dur = self.hop_dur
if win_dur is not None:
win_dur = utils.to_timedelta(win_dur, self.process.sampling_rate)
hop_dur = utils.to_timedelta(hop_dur, self.process.sampling_rate)

features = self._reshape_3d(features)
n_channels, n_features, n_frames = features.shape

Expand All @@ -819,31 +874,14 @@ def _values_to_frame(
new_shape = (n_channels * n_features, n_frames)
features = features.reshape(new_shape).T

if n_frames > 1 and win_dur is None:
if n_frames > 1 and self.win_dur is None:
raise RuntimeError(
f"Got "
f"{n_frames} "
f"frames, but 'win_dur' is not set."
)

if win_dur is not None:
starts = pd.timedelta_range(
start,
freq=hop_dur,
periods=n_frames,
)
ends = starts + win_dur
else:
starts = [start]
ends = [end]

if file is None:
index = utils.signal_index(starts, ends)
else:
files = [file] * len(starts)
index = audformat.segmented_index(files, starts, ends)

return pd.DataFrame(features, index, columns=self.column_names)
return features

def __call__(
self,
Expand Down
78 changes: 52 additions & 26 deletions tests/test_feature.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

import audiofile
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -968,6 +969,13 @@ def test_process_signal_from_index(index, expected_features):
np.testing.assert_array_equal(features.values, expected_features)


@pytest.mark.parametrize(
'signal, num_channels, sampling_rate',
[
(SIGNAL_1D, 1, SAMPLING_RATE),
(SIGNAL_2D, 2, SAMPLING_RATE),
]
)
@pytest.mark.parametrize(
'process_func, is_mono, applies_sliding_window, feature_names',
[
Expand All @@ -991,47 +999,65 @@ def test_process_signal_from_index(index, expected_features):
(f'{SAMPLING_RATE // 2}', f'{SAMPLING_RATE // 4}'),
],
)
def test_signal_sliding_window(process_func, is_mono, applies_sliding_window,
def test_signal_sliding_window(tmpdir, signal, num_channels,
sampling_rate, process_func, is_mono,
applies_sliding_window,
feature_names, win_dur, hop_dur):

interface = audinterface.Feature(
feature_names=feature_names,
process_func=process_func,
process_func_is_mono=is_mono,
process_func_applies_sliding_window=applies_sliding_window,
channels=range(NUM_CHANNELS),
channels=range(num_channels),
win_dur=win_dur,
hop_dur=hop_dur,
sampling_rate=SAMPLING_RATE,
sampling_rate=sampling_rate,
)

for signal in [SIGNAL_1D, SIGNAL_2D]:
# signal

df = interface.process_signal(
SIGNAL_2D,
SAMPLING_RATE,
)
n_time_steps = len(df)
df = interface.process_signal(
signal,
sampling_rate,
)
n_time_steps = len(df)

win_dur = audinterface.utils.to_timedelta(win_dur, SAMPLING_RATE)
if hop_dur is None:
hop_dur = win_dur / 2
hop_dur = audinterface.utils.to_timedelta(hop_dur, SAMPLING_RATE)
win_dur = audinterface.utils.to_timedelta(win_dur, sampling_rate)
if hop_dur is None:
hop_dur = win_dur / 2
hop_dur = audinterface.utils.to_timedelta(hop_dur, sampling_rate)

starts = pd.timedelta_range(
pd.to_timedelta(0),
freq=hop_dur,
periods=n_time_steps,
)
ends = starts + win_dur
starts = pd.timedelta_range(
pd.to_timedelta(0),
freq=hop_dur,
periods=n_time_steps,
)
ends = starts + win_dur

index = audinterface.utils.signal_index(starts, ends)
expected = pd.DataFrame(
np.ones((n_time_steps, len(interface.column_names))),
index=index,
columns=interface.column_names,
)
pd.testing.assert_frame_equal(df, expected)
index = audinterface.utils.signal_index(starts, ends)
expected = pd.DataFrame(
np.ones((n_time_steps, len(interface.column_names))),
index=index,
columns=interface.column_names,
)
pd.testing.assert_frame_equal(df, expected)

# file

file = audeer.path(tmpdir, 'tmp.wav')
audiofile.write(file, signal, sampling_rate, bit_depth=32)

df = interface.process_file(file)

index = audformat.segmented_index([file] * len(starts), starts, ends)
expected = pd.DataFrame(
np.ones((n_time_steps, len(interface.column_names))),
index=index,
columns=interface.column_names,
dtype=np.float32,
)
pd.testing.assert_frame_equal(df, expected)


def test_signal_sliding_window_error():
Expand Down

0 comments on commit f4687e1

Please sign in to comment.