From 52551b14ff8d1c3f5906b73dbaab4b792053cd96 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 24 Jun 2022 10:28:17 +0200 Subject: [PATCH] Deprecate unit, specify samples as string (#53) --- audinterface/core/feature.py | 120 ++++++++++++++++++++++++----------- audinterface/core/process.py | 43 +++++++++---- audinterface/core/segment.py | 32 ++++++++-- audinterface/core/utils.py | 40 +++++++++++- tests/test_feature.py | 80 ++++++++++++++++------- 5 files changed, 238 insertions(+), 77 deletions(-) diff --git a/audinterface/core/feature.py b/audinterface/core/feature.py index 2755196..9539248 100644 --- a/audinterface/core/feature.py +++ b/audinterface/core/feature.py @@ -60,15 +60,24 @@ class Feature: sampling_rate: sampling rate in Hz. If ``None`` it will call ``process_func`` with the actual sampling rate of the signal - win_dur: window size in ``unit``, - if features are extracted with a sliding window - hop_dur: hop size in ``unit``, + win_dur: window size, + if features are extracted with a sliding window. + If value is a float or integer + it is treated as seconds. + To specify a unit provide it as a string, + e.g. ``'2ms'``. + To specify samples provide it as a string without unit, + e.g. ``'2000'`` + hop_dur: hop size, if features are extracted with a sliding window. This defines the shift between two windows. - Defaults to ``win_dur / 2``. - unit: unit of ``win_dur`` and ``hop_dur``. - Can be ``'samples'``, - or any unit supported by :func:`pandas.to_timedelta` + If value is a float or integer + it is treated as seconds. + To specify a unit provide it as a string, + e.g. ``'2ms'``. + To specify samples provide it as a string without unit, + e.g. ``'2000'``. 
+ Defaults to ``win_dur / 2`` resample: if ``True`` enforces given sampling rate by resampling channels: channel selection, see :func:`audresample.remix` mixdown: apply mono mix-down on selection @@ -86,8 +95,8 @@ class Feature: verbose: show debug messages Raises: - ValueError: if ``unit == 'samples'``, ``sampling_rate is None`` - and ``win_dur is not None`` + ValueError: if ``win_dur`` or ``hop_dur`` are given in samples + and ``sampling_rate is None`` ValueError: if ``hop_dur`` is specified, but not ``win_dur`` Example: @@ -127,9 +136,8 @@ def __init__( process_func_args: typing.Dict[str, typing.Any] = None, process_func_is_mono: bool = False, sampling_rate: int = None, - win_dur: typing.Union[int, float] = None, - hop_dur: typing.Union[int, float] = None, - unit: str = 'seconds', + win_dur: Timestamp = None, + hop_dur: Timestamp = None, resample: bool = False, channels: typing.Union[int, typing.Sequence[int]] = 0, mixdown: bool = False, @@ -142,6 +150,33 @@ def __init__( ): feature_names = audeer.to_list(feature_names) + # ------ + # Handle deprecated 'unit' keyword argument + def add_unit(dur, unit): + if unit == 'samples': + return str(dur) + else: + return f'{dur}{unit}' + + if 'unit' in kwargs: + message = ( + "'unit' argument is deprecated " + "and will be removed with version '1.2.0'." + "The unit can now directly specified " + "within the 'win_dur' and 'hop_dur' arguments." + ) + warnings.warn( + message, + category=UserWarning, + stacklevel=2, + ) + unit = kwargs.pop('unit') + if win_dur is not None: + win_dur = add_unit(win_dur, unit) + if hop_dur is not None: + hop_dur = add_unit(hop_dur, unit) + # ------ + process_func_args = process_func_args or {} if kwargs: warnings.warn( @@ -156,11 +191,6 @@ def __init__( raise ValueError( "You have to specify 'win_dur' if 'hop_dur' is given." 
) - if unit == 'samples' and sampling_rate is None and win_dur is not None: - raise ValueError( - "You have specified 'samples' as unit, " - "but haven't provided a sampling rate." - ) if process_func is None: def process_func(signal, _): @@ -214,9 +244,8 @@ def process_func(signal, _): self.hop_dur = hop_dur r"""Hop duration.""" if win_dur is not None and hop_dur is None: - self.hop_dur = win_dur // 2 if unit == 'samples' else win_dur / 2 - self.unit = unit - r"""Unit of ``win_dur`` and ``hop dur``""" + win_dur = utils.to_timedelta(win_dur, sampling_rate) + self.hop_dur = win_dur / 2 self.verbose = verbose r"""Show debug messages.""" @@ -233,9 +262,17 @@ def process_file( Args: file: file path start: start processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` end: end processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` root: root folder to expand relative file path Raises: @@ -267,9 +304,17 @@ def process_files( files: list of file paths starts: segment start positions. Time values given as float or integers are treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'``. If a scalar is given, it is applied to all files ends: segment end positions. Time values given as float or integers are treated as seconds + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'``. 
If a scalar is given, it is applied to all files root: root folder to expand relative file paths @@ -395,9 +440,17 @@ def process_signal( sampling_rate: sampling rate in Hz file: file path start: start processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` end: end processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` Raises: RuntimeError: if sampling rates do not match @@ -585,20 +638,11 @@ def _values_to_frame( # [n_features, n_frames] # [n_features] - if self.win_dur is not None: - if self.unit == 'samples': - win_dur = pd.to_timedelta( - self.win_dur / self.process.sampling_rate, unit='seconds', - ) - hop_dur = pd.to_timedelta( - self.hop_dur / self.process.sampling_rate, unit='seconds', - ) - else: - win_dur = pd.to_timedelta(self.win_dur, unit=self.unit) - hop_dur = pd.to_timedelta(self.hop_dur, unit=self.unit) - else: - win_dur = None - hop_dur = None + win_dur = self.win_dur + hop_dur = self.hop_dur + if win_dur is not None: + win_dur = utils.to_timedelta(win_dur, self.process.sampling_rate) + hop_dur = utils.to_timedelta(hop_dur, self.process.sampling_rate) features = self._reshape_3d(features) n_channels, n_features, n_frames = features.shape diff --git a/audinterface/core/process.py b/audinterface/core/process.py index 2931f91..c5fbbd2 100644 --- a/audinterface/core/process.py +++ b/audinterface/core/process.py @@ -183,9 +183,17 @@ def process_file( Args: file: file path start: start processing at this position. 
- If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` end: end processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` root: root folder to expand relative file path Returns: @@ -198,8 +206,6 @@ def process_file( .. _audformat: https://audeering.github.io/audformat/data-format.html """ - start = utils.to_timedelta(start) - end = utils.to_timedelta(end) if self.segment is not None: index = self.segment.process_file( file, @@ -209,6 +215,8 @@ def process_file( ) return self._process_index_wo_segment(index, root) else: + start = utils.to_timedelta(start, self.sampling_rate) + end = utils.to_timedelta(end, self.sampling_rate) return self._process_file(file, start=start, end=end, root=root) def process_files( @@ -225,9 +233,17 @@ def process_files( files: list of file paths starts: segment start positions. Time values given as float or integers are treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'``. If a scalar is given, it is applied to all files ends: segment end positions. Time values given as float or integers are treated as seconds + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'``. 
If a scalar is given, it is applied to all files root: root folder to expand relative file paths @@ -249,9 +265,6 @@ def process_files( if isinstance(ends, (type(None), float, int, str, pd.Timedelta)): ends = [ends] * len(files) - starts = utils.to_timedelta(starts) - ends = utils.to_timedelta(ends) - params = [ ( (file, ), @@ -449,9 +462,17 @@ def process_signal( sampling_rate: sampling rate in Hz file: file path start: start processing at this position. - If value is as a float or integer it is treated as seconds + If value is a float or integer it is treated as seconds. + To specify a unit provide it as a string, + e.g. ``'2ms'``. + To specify samples provide it as a string without unit, + e.g. ``'2000'`` end: end processing at this position. - If value is as a float or integer it is treated as seconds + If value is a float or integer it is treated as seconds. + To specify a unit provide it as a string, + e.g. ``'2ms'``. + To specify samples provide it as a string without unit, + e.g. ``'2000'`` Returns: Series with processed signal conform to audformat_ @@ -463,8 +484,6 @@ def process_signal( .. _audformat: https://audeering.github.io/audformat/data-format.html """ - start = utils.to_timedelta(start) - end = utils.to_timedelta(end) if self.segment is not None: index = self.segment.process_signal( signal, @@ -479,6 +498,8 @@ def process_signal( index, ) else: + start = utils.to_timedelta(start, sampling_rate) + end = utils.to_timedelta(end, sampling_rate) return self._process_signal( signal, sampling_rate, diff --git a/audinterface/core/segment.py b/audinterface/core/segment.py index 0ab7d72..1e79fb0 100644 --- a/audinterface/core/segment.py +++ b/audinterface/core/segment.py @@ -229,9 +229,17 @@ def process_file( Args: file: file path start: start processing at this position. - If value is as a float or integer it is treated as seconds + If value is a float or integer it is treated as seconds. + To specify a unit provide it as a string, + e.g. ``'2ms'``. 
+ To specify in samples provide as string without unit, + e.g. ``'2000'`` end: end processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` root: root folder to expand relative file path Returns: @@ -280,9 +288,17 @@ def process_files( files: list of file paths starts: segment start positions. Time values given as float or integers are treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'``. If a scalar is given, it is applied to all files ends: segment end positions. Time values given as float or integers are treated as seconds + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'``. If a scalar is given, it is applied to all files root: root folder to expand relative file paths @@ -413,9 +429,17 @@ def process_signal( sampling_rate: sampling rate in Hz file: file path start: start processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. ``'2000'`` end: end processing at this position. - If value is as a float or integer it is treated as seconds + If value is as a float or integer it is treated as seconds. + To specify a unit provide as string, + e.g. ``'2ms'``. + To specify in samples provide as string without unit, + e.g. 
``'2000'`` Returns: Segmented index conform to audformat_ diff --git a/audinterface/core/utils.py b/audinterface/core/utils.py index c278418..576c8a4 100644 --- a/audinterface/core/utils.py +++ b/audinterface/core/utils.py @@ -1,3 +1,4 @@ +import collections import os import typing @@ -289,8 +290,43 @@ def to_array(value: typing.Any) -> np.ndarray: return value -def to_timedelta(times: Timestamps): - r"""Convert time value to pd.Timedelta.""" +def to_timedelta( + times: Timestamps, + sampling_rate: int = None, +) -> typing.Union[pd.Timedelta, typing.Sequence[pd.Timedelta]]: + r"""Convert time value to pd.Timedelta. + + If a time is given as a string without unit, + it is treated as samples + and requires that ``sampling_rate`` is not ``None``. + + """ + + def convert_samples_to_seconds(time): + if isinstance(time, str): + # ensure we have a str and not numpy.str_ + time = str(time) + # string without unit represents samples + if all(t.isdigit() for t in time): + if sampling_rate is None: + raise ValueError( + "You have to provide 'sampling_rate' " + "when specifying the duration in samples " + f"as you did with '{time}'." 
+ ) + time = int(time) / sampling_rate + return time + + if ( + not isinstance(times, str) + and isinstance(times, collections.abc.Iterable) + ): + # sequence of time entries + times = [convert_samples_to_seconds(t) for t in times] + else: + # single time entry + times = convert_samples_to_seconds(times) + try: return pd.to_timedelta(times, unit='s') except ValueError: # catches values like '1s' diff --git a/tests/test_feature.py b/tests/test_feature.py index 21a82fd..54546aa 100644 --- a/tests/test_feature.py +++ b/tests/test_feature.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +import audeer import audformat import audinterface import audiofile as af @@ -33,19 +34,43 @@ def features_extractor_sliding_window(signal, _, hop_size): return np.ones((NUM_CHANNELS, NUM_FEATURES, num_time_steps)) +def test_deprecated_unit_argument(): + if ( + audeer.LooseVersion(audinterface.__version__) + < audeer.LooseVersion('1.2.0') + ): + with pytest.warns(UserWarning, match='is deprecated'): + interface = audinterface.Feature( + ['a'], + win_dur=1000, + unit='samples', + sampling_rate=16000, + ) + assert interface.win_dur == '1000' + interface = audinterface.Feature( + ['a'], + win_dur=1000, + hop_dur=500, + unit='milliseconds', + ) + assert interface.win_dur == '1000milliseconds' + assert interface.hop_dur == '500milliseconds' + else: + with pytest.raises(TypeError, match='unexpected keyword argument'): + audinterface.Feature(['a'], unit='samples') + + def test_feature(): - # You have to specify sampling rate with unit == 'samples' and win_dur + # You have to specify sampling rate when win_dur is in samples with pytest.raises(ValueError): audinterface.Feature( feature_names=('o1', 'o2', 'o3'), sampling_rate=None, - unit='samples', - win_dur=2048, + win_dur='2048', ) # If no win_dur is given, no error should occur audinterface.Feature( feature_names=('o1', 'o2', 'o3'), - unit='samples', sampling_rate=None, ) # Only hop_dur is given @@ -56,8 +81,7 @@ def test_feature(): ) 
audinterface.Feature( feature_names=('o1', 'o2', 'o3'), - win_dur=2048, - unit='samples', + win_dur='2048', sampling_rate=8000, ) @@ -824,20 +848,21 @@ def test_process_index(tmpdir): @pytest.mark.parametrize( - 'win_dur, hop_dur, unit', + 'win_dur, hop_dur', [ - (1, 0.5, 'seconds'), - (1, None, 'seconds'), - (16000, None, 'samples'), - (1000, 500, 'milliseconds'), - (SAMPLING_RATE, SAMPLING_RATE // 2, 'samples'), + (1, 0.5), + (pd.to_timedelta(1, unit='s'), None), + ('16000', None), + ('1000ms', '500ms'), + ('1000milliseconds', '500milliseconds'), + (f'{SAMPLING_RATE}', f'{SAMPLING_RATE // 2}'), pytest.param( # multiple frames, but win_dur is None - None, None, 'seconds', + None, None, marks=pytest.mark.xfail(raises=RuntimeError), ), ], ) -def test_signal_sliding_window(win_dur, hop_dur, unit): +def test_signal_sliding_window(win_dur, hop_dur): # Test sliding window with two time steps expected_features = np.ones((NUM_CHANNELS, 2 * NUM_FEATURES)) extractor = audinterface.Feature( @@ -850,7 +875,6 @@ def test_signal_sliding_window(win_dur, hop_dur, unit): win_dur=win_dur, hop_dur=hop_dur, sampling_rate=SAMPLING_RATE, - unit=unit, ) features = extractor.process_signal( SIGNAL_2D, @@ -858,20 +882,32 @@ def test_signal_sliding_window(win_dur, hop_dur, unit): ) n_time_steps = len(features) - if unit == 'samples': - win_dur = win_dur / SAMPLING_RATE - if hop_dur is not None: - hop_dur /= SAMPLING_RATE - unit = 'seconds' + if isinstance(win_dur, str): + if all(s.isdigit() for s in win_dur): + # samples + win_dur = pd.to_timedelta(int(win_dur) / SAMPLING_RATE, unit='s') + else: + win_dur = pd.to_timedelta(win_dur) + else: + win_dur = pd.to_timedelta(win_dur, unit='s') + if hop_dur is None: hop_dur = win_dur / 2 + elif isinstance(hop_dur, str): + if all(s.isdigit() for s in hop_dur): + # samples + hop_dur = pd.to_timedelta(int(hop_dur) / SAMPLING_RATE, unit='s') + else: + hop_dur = pd.to_timedelta(hop_dur) + else: + hop_dur = pd.to_timedelta(hop_dur, unit='s') starts = 
pd.timedelta_range( pd.to_timedelta(0), - freq=pd.to_timedelta(hop_dur, unit=unit), + freq=hop_dur, periods=n_time_steps, ) - ends = starts + pd.to_timedelta(win_dur, unit=unit) + ends = starts + win_dur index = audinterface.utils.signal_index(starts, ends) pd.testing.assert_frame_equal(