From c2622228460132bb940ae59bc9e7486163a005cf Mon Sep 17 00:00:00 2001 From: Jesse Engel Date: Fri, 17 Jul 2020 20:23:15 -0700 Subject: [PATCH] Fix cumsum error accumulation in sinusoidal oscillators. This resulted in audio degradation after 10s of seconds of synthesis. Also was a blocker for lower precision javascript implementations. Needs to be set explicitly globally for the moment `oscillator_bank.chunk_size = 1000`. PiperOrigin-RevId: 321901658 --- ddsp/colab/demos/timbre_transfer.ipynb | 1 + ddsp/core.py | 99 +++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 3 deletions(-) diff --git a/ddsp/colab/demos/timbre_transfer.ipynb b/ddsp/colab/demos/timbre_transfer.ipynb index 4a9529d2..a2a53b8e 100644 --- a/ddsp/colab/demos/timbre_transfer.ipynb +++ b/ddsp/colab/demos/timbre_transfer.ipynb @@ -289,6 +289,7 @@ " 'Additive.n_samples = {}'.format(n_samples),\n", " 'FilteredNoise.n_samples = {}'.format(n_samples),\n", " 'DefaultPreprocessor.time_steps = {}'.format(time_steps),\n", + " 'oscillator_bank.chunk_size = 1000', # Avoids cumsum accumulation errors.\n", "]\n", "\n", "with gin.unlock_config():\n", diff --git a/ddsp/core.py b/ddsp/core.py index fe59c98e..90582b58 100644 --- a/ddsp/core.py +++ b/ddsp/core.py @@ -81,6 +81,24 @@ def nested_lookup(nested_key: Text, return value +def pad_axis(x, padding=(0, 0), axis=0, **pad_kwargs): + """Pads only one axis of a tensor. + + Args: + x: Input tensor. + padding: Tuple of number of samples to pad (before, after). + axis: Which axis to pad. + **pad_kwargs: Other kwargs to pass to tf.pad. + + Returns: + A tensor padded with padding along axis. + """ + n_end_dims = len(x.shape) - axis - 1 + n_end_dims *= n_end_dims > 0 + paddings = [[0, 0]] * axis + [list(padding)] + [[0, 0]] * n_end_dims + return tf.pad(x, paddings, **pad_kwargs) + + # Math ------------------------------------------------------------------------- def safe_divide(numerator, denominator, eps=1e-7): """Avoid dividing by zero by adding a small epsilon.""" @@ -599,6 +617,70 @@ def harmonic_to_sinusoidal(harm_amp, harm_dist, f0_hz, sample_rate=16000): # Additive Synthesizer --------------------------------------------------------- +def angular_cumsum(angular_frequency, chunk_size=1000): + """Get phase by cummulative sumation of angular frequency. + + Custom cumsum splits first axis into chunks to avoid accumulation error. + Just taking tf.sin(tf.cumsum(angular_frequency)) leads to accumulation of + phase errors that are audible for long segments or at high sample rates. Also, + in reduced precision settings, cumsum can overflow the threshold. + + Given that we are going to take the sin of the accumulated phase anyways, we + don't care about the phase modulo 2 pi. This code chops the incoming frequency + into chunks, applies cumsum to each chunk, takes mod 2pi, and then stitches + them back together by adding the cumulative values of the final step of each + chunk to the next chunk. + + Seems to be ~30% faster on CPU, but at least 40% slower on TPU. + + Args: + angular_frequency: Radians per a sample. Shape [batch, time, ...]. + If there is no batch dimension, one will be temporarily added. + chunk_size: Number of samples per a chunk. to avoid overflow at low + precision [chunk_size <= (accumulation_threshold / pi)]. + + Returns: + The accumulated phase in range [0, 2*pi], shape [batch, time, ...]. + """ + # Get tensor shapes. + n_batch = angular_frequency.shape[0] + n_time = angular_frequency.shape[1] + n_dims = len(angular_frequency.shape) + n_ch_dims = n_dims - 2 + + # Pad if needed. + remainder = n_time % chunk_size + if remainder: + pad = chunk_size - remainder + angular_frequency = pad_axis(angular_frequency, [0, pad], axis=1) + + # Split input into chunks. + length = angular_frequency.shape[1] + n_chunks = int(length / chunk_size) + chunks = tf.reshape(angular_frequency, + [n_batch, n_chunks, chunk_size] + [-1] * n_ch_dims) + phase = tf.cumsum(chunks, axis=2) + + # Add offsets. + # Offset of the next row is the last entry of the previous row. + offsets = phase[:, :, -1:, ...] % (2.0 * np.pi) + offsets = pad_axis(offsets, [1, 0], axis=1) + offsets = offsets[:, :-1, ...] + + # Offset is cumulative among the rows. + offsets = tf.cumsum(offsets, axis=1) % (2.0 * np.pi) + phase = phase + offsets + + # Put back in original shape. + phase = phase % (2.0 * np.pi) + phase = tf.reshape(phase, [n_batch, length] + [-1] * n_ch_dims) + + # Remove padding if added it. + if remainder: + phase = phase[:, :n_time] + return phase + + def remove_above_nyquist(frequency_envelopes: tf.Tensor, amplitude_envelopes: tf.Tensor, sample_rate: int = 16000) -> tf.Tensor: @@ -624,10 +706,13 @@ def remove_above_nyquist(frequency_envelopes: tf.Tensor, return amplitude_envelopes +# TODO(jesseengel): Remove reliance on global injection of chunk_size. +@gin.configurable def oscillator_bank(frequency_envelopes: tf.Tensor, amplitude_envelopes: tf.Tensor, sample_rate: int = 16000, - sum_sinusoids: bool = True) -> tf.Tensor: + sum_sinusoids: bool = True, + chunk_size: int = 0) -> tf.Tensor: """Generates audio from sample-wise frequencies for a bank of oscillators. Args: @@ -637,6 +722,8 @@ def oscillator_bank(frequency_envelopes: tf.Tensor, n_samples, n_sinusoids]. sample_rate: Sample rate in samples per a second. sum_sinusoids: Add up audio from all the sinusoids. + chunk_size: Perform cumsum in chunks using angular_cumsum if chunk_size > 0. + Avoids accumulation of errors during summation but slower on accelerators. Returns: wav: Sample-wise audio. Shape [batch_size, n_samples, n_sinusoids] if @@ -650,12 +737,18 @@ def oscillator_bank(frequency_envelopes: tf.Tensor, amplitude_envelopes, sample_rate) - # Change Hz to radians per sample. + # Angular frequency, Hz -> radians per sample. omegas = frequency_envelopes * (2.0 * np.pi) # rad / sec omegas = omegas / float(sample_rate) # rad / sample # Accumulate phase and synthesize. - phases = tf.cumsum(omegas, axis=1) + if chunk_size > 0: + # Avoids accumulation errors. + phases = angular_cumsum(omegas, chunk_size=chunk_size) + else: + phases = tf.cumsum(omegas, axis=1) + + # Convert to waveforms. wavs = tf.sin(phases) audio = amplitude_envelopes * wavs # [mb, n_samples, n_sinusoids] if sum_sinusoids: