Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fastpitch style pitch and energy variation #17

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions configs/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ model:
guided_attn_loss_lambda: 1.0

### FastSpeech
energy_embed_kernel_size: 1
energy_embed_dropout: 0.0
pitch_embed_kernel_size: 1
pitch_embed_dropout: 0.0
duration_predictor_layers : 2
duration_predictor_chans : 256
duration_predictor_kernel_size : 3
Expand All @@ -110,7 +114,7 @@ train:
# optimization related
eos: False #True
opt: 'noam'
accum_grad: 4
accum_grad: 1
grad_clip: 1.0
weight_decay: 0.001
patience: 0
Expand All @@ -126,13 +130,13 @@ train:
seed: 1 # random seed number
resume: "" # the snapshot path to resume (if set empty, no effect)
use_phonemes: True
batch_size : 16
batch_size : 48
# other
melgan_vocoder : True
save_interval : 1000
chkpt_dir : './checkpoints'
log_dir : './logs'
summary_interval : 200
validation_step : 500
validation_step : 1000
tts_max_mel_len : 870 # if you have a couple of extremely long spectrograms you might want to use this
tts_bin_lengths : True # bins the spectrogram lengths before sampling in data loader - speeds up training
tts_bin_lengths : True # bins the spectrogram lengths before sampling in data loader - speeds up training
4 changes: 2 additions & 2 deletions core/variance_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def inference(self, xs: torch.Tensor, alpha: float = 1.0):

"""
out = self.predictor.inference(xs, False, alpha=alpha)
return self.to_one_hot(out) # Need to do One hot code
return out # Need to do One hot code

def to_one_hot(self, x):
# e = de_norm_mean_std(e, hp.e_mean, hp.e_std)
Expand Down Expand Up @@ -222,7 +222,7 @@ def inference(self, xs: torch.Tensor, alpha: float = 1.0):

"""
out = self.predictor.inference(xs, False, alpha=alpha)
return self.to_one_hot(out)
return out

def to_one_hot(self, x: torch.Tensor):
# e = de_norm_mean_std(e, hp.e_mean, hp.e_std)
Expand Down
Empty file added dataset/audio/__init__.py
Empty file.
File renamed without changes.
90 changes: 90 additions & 0 deletions dataset/audio/energy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from typing import Any
from typing import Dict
from typing import Optional
from typing import Tuple
import torch
import torch.nn.functional as F
from typeguard import check_argument_types
from utils.stft import TacotronSTFT

class Energy():
    """Energy extractor for a single utterance.

    Energy is the L2 norm of a magnitude spectrogram taken along its first
    axis, optionally averaged over token durations.
    """

    def __init__(
        self,
        fs: int = 22050,
        n_fft: int = 1024,
        win_length: int = 1024,
        hop_length: int = 256,
        window: Optional[str] = "hann",
        num_mel: int = 80,
        fmin: int = 0,
        fmax: int = 8000,
        use_token_averaged_energy: bool = True,
    ):
        """Store the analysis settings and build the STFT front-end.

        Args:
            fs: Sampling rate, forwarded to ``TacotronSTFT`` as ``sampling_rate``.
            n_fft: FFT size, forwarded as ``filter_length``.
            win_length: Analysis window length.
            hop_length: Hop size between frames.
            window: Window name; stored only, not forwarded to the STFT.
            num_mel: Number of mel channels, forwarded as ``n_mel_channels``.
            fmin: Lowest mel frequency, forwarded as ``mel_fmin``.
            fmax: Highest mel frequency, forwarded as ``mel_fmax``.
            use_token_averaged_energy: When True, ``forward`` averages the
                frame-level energy over each token's duration.
        """
        assert check_argument_types()
        super().__init__()
        self.fs = fs
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.use_token_averaged_energy = use_token_averaged_energy

        # NOTE(review): ``forward`` takes a precomputed magnitude and does not
        # use this object; it is kept because external code may read it.
        self.stft = TacotronSTFT(
            filter_length=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=num_mel,
            sampling_rate=fs,
            mel_fmin=fmin,
            mel_fmax=fmax,
        )

    def output_size(self) -> int:
        """Return the feature dimension of the extracted energy (always 1)."""
        return 1

    def get_parameters(self) -> Dict[str, Any]:
        """Return the stored extraction settings as a plain dict."""
        return dict(
            fs=self.fs,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            window=self.window,
            win_length=self.win_length,
            use_token_averaged_energy=self.use_token_averaged_energy,
        )

    def forward(
        self,
        mag: torch.Tensor,
        durations: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute energy from a magnitude spectrogram.

        Args:
            mag: Magnitude spectrogram. The norm is taken over axis 0; the
                original comment gives the shape as ``(h, T)`` — TODO confirm
                against the caller.
            durations: Per-token frame counts. Required when
                ``use_token_averaged_energy`` is True.

        Returns:
            A single tensor: ``mag`` with axis 0 reduced (one value per frame
            for a 2-D input), or one value per token when token averaging is
            enabled.

        Raises:
            ValueError: If token averaging is enabled and ``durations`` is None.
        """
        energy = torch.norm(mag, dim=0)

        # (Optional): average by duration to obtain token-wise energy.
        if self.use_token_averaged_energy:
            if durations is None:
                raise ValueError(
                    "durations must be provided when "
                    "use_token_averaged_energy is True"
                )
            energy = self._average_by_duration(energy, durations)

        return energy

    @staticmethod
    def _average_by_duration(x: torch.Tensor, d: torch.Tensor) -> torch.Tensor:
        """Average ``x`` over consecutive segments whose lengths are given by ``d``.

        Segments of length zero yield ``0.0``. ``d`` must sum to ``len(x)``.
        """
        assert d.sum() == len(x), (
            f"durations sum to {int(d.sum())} but there are {len(x)} frames"
        )
        # An empty duration vector has no segments; torch.stack([]) would raise.
        if len(d) == 0:
            return x.new_zeros(0)
        # Segment boundaries as plain Python numbers: [0, d0, d0 + d1, ...].
        bounds = F.pad(d.cumsum(dim=0), (1, 0)).tolist()
        x_avg = [
            x[start:end].mean() if end > start else x.new_tensor(0.0)
            for start, end in zip(bounds[:-1], bounds[1:])
        ]
        return torch.stack(x_avg)

    @staticmethod
    def _adjust_num_frames(x: torch.Tensor, num_frames: torch.Tensor) -> torch.Tensor:
        """Zero-pad or truncate ``x`` at the end so it has ``num_frames`` entries."""
        # F.pad expects Python ints, so coerce a tensor-valued frame count.
        target = int(num_frames)
        current = len(x)
        if target > current:
            x = F.pad(x, (0, target - current))
        elif target < current:
            x = x[:target]
        return x
155 changes: 155 additions & 0 deletions dataset/audio/pitch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
"""F0 extractor using DIO + Stonemask algorithm."""

import logging

from typing import Any
from typing import Dict
from typing import Optional
from typing import Tuple
import numpy as np
import pyworld
import torch
import torch.nn.functional as F

from scipy.interpolate import interp1d
from typeguard import check_argument_types


class Dio():
    """F0 estimation with the DIO + StoneMask algorithm.

    This is an f0 extractor based on the DIO + StoneMask algorithm introduced in
    `WORLD: a vocoder-based high-quality speech synthesis system for real-time
    applications`_.

    .. _`WORLD: a vocoder-based high-quality speech synthesis system for real-time
        applications`: https://doi.org/10.1587/transinf.2015EDP7457

    Note:
        This module is based on a NumPy implementation. Therefore, the
        computational graph is not connected.

    Todo:
        Replace this module with a PyTorch-based implementation.
    """

    def __init__(
        self,
        fs: int = 22050,
        n_fft: int = 1024,
        hop_length: int = 256,
        f0min: Optional[int] = 71,
        f0max: Optional[int] = 400,
        use_token_averaged_f0: bool = True,
        use_continuous_f0: bool = True,
        use_log_f0: bool = True,
    ):
        """Store the extraction settings.

        Args:
            fs: Sampling rate passed to pyworld.
            n_fft: FFT size; stored and reported only.
            hop_length: Hop size in samples; determines ``frame_period``.
            f0min: Passed to ``pyworld.dio`` as ``f0_floor``.
            f0max: Passed to ``pyworld.dio`` as ``f0_ceil``.
            use_token_averaged_f0: Average f0 over token durations in ``forward``.
            use_continuous_f0: Interpolate f0 through zero-valued frames.
            use_log_f0: Take the natural log of the non-zero f0 values.
        """
        assert check_argument_types()
        super().__init__()
        self.fs = fs
        self.n_fft = n_fft
        self.hop_length = hop_length
        # Frame shift handed to pyworld.dio as ``frame_period``
        # (hop_length / fs seconds, scaled by 1000).
        self.frame_period = 1000 * hop_length / fs
        self.f0min = f0min
        self.f0max = f0max
        self.use_token_averaged_f0 = use_token_averaged_f0
        self.use_continuous_f0 = use_continuous_f0
        self.use_log_f0 = use_log_f0

    def output_size(self) -> int:
        """Return the feature dimension of the extracted pitch (always 1)."""
        return 1

    def get_parameters(self) -> Dict[str, Any]:
        """Return the stored extraction settings as a plain dict."""
        return dict(
            fs=self.fs,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            f0min=self.f0min,
            f0max=self.f0max,
            use_token_averaged_f0=self.use_token_averaged_f0,
            use_continuous_f0=self.use_continuous_f0,
            use_log_f0=self.use_log_f0,
        )

    def forward(
        self,
        input: torch.Tensor,
        feats_lengths: Optional[torch.Tensor] = None,
        durations: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Extract f0 for one utterance.

        Args:
            input: 1-D waveform tensor of shape ``[T,]``.
            feats_lengths: Optional target number of frames (int, 0-d tensor or
                one-element tensor). When given, the f0 track is zero-padded
                or truncated to that length.
            durations: Per-token frame counts. Required when
                ``use_token_averaged_f0`` is True.

        Returns:
            A 1-D tensor with one f0 value per frame, or one per token when
            token averaging is enabled.

        Raises:
            ValueError: If ``feats_lengths`` holds more than one value, or if
                token averaging is enabled and ``durations`` is None.
        """
        pitch = self._calculate_f0(input)

        # (Optional): adjust length to match the mel-spectrogram. The pitch
        # track is a single 1-D sequence, so it is adjusted as a whole rather
        # than iterated element by element.
        if feats_lengths is not None:
            num_frames = self._as_frame_count(feats_lengths)
            pitch = self._adjust_num_frames(pitch, num_frames).view(-1)

        # (Optional): average by duration to obtain token-wise f0.
        if self.use_token_averaged_f0:
            if durations is None:
                raise ValueError(
                    "durations must be provided when use_token_averaged_f0 is True"
                )
            pitch = self._average_by_duration(pitch, durations)

        return pitch

    def _calculate_f0(self, input: torch.Tensor) -> torch.Tensor:
        """Run DIO + StoneMask on a waveform and return f0 as a float tensor."""
        # detach() so a tensor that requires grad can still be converted.
        x = input.detach().cpu().numpy().astype(np.double)
        f0, timeaxis = pyworld.dio(
            x,
            self.fs,
            f0_floor=self.f0min,
            f0_ceil=self.f0max,
            frame_period=self.frame_period,
        )

        f0 = pyworld.stonemask(x, f0, timeaxis, self.fs)
        if self.use_continuous_f0:
            f0 = self._convert_to_continuous_f0(f0)
        if self.use_log_f0:
            # Zero-valued frames stay at 0 instead of becoming -inf.
            nonzero_idxs = np.where(f0 != 0)[0]
            f0[nonzero_idxs] = np.log(f0[nonzero_idxs])
        return input.new_tensor(f0.reshape(-1), dtype=torch.float)

    @staticmethod
    def _as_frame_count(feats_lengths) -> int:
        """Coerce an int / 0-d tensor / one-element tensor to a Python int."""
        count = torch.as_tensor(feats_lengths)
        if count.numel() != 1:
            raise ValueError(
                "feats_lengths must hold exactly one frame count, "
                f"got {count.numel()} values"
            )
        return int(count.item())

    @staticmethod
    def _adjust_num_frames(x: torch.Tensor, num_frames: torch.Tensor) -> torch.Tensor:
        """Zero-pad or truncate ``x`` at the end so it has ``num_frames`` entries."""
        # F.pad expects Python ints, so coerce a tensor-valued frame count.
        target = int(num_frames)
        current = len(x)
        if target > current:
            x = F.pad(x, (0, target - current))
        elif target < current:
            x = x[:target]
        return x

    @staticmethod
    def _convert_to_continuous_f0(f0: np.ndarray) -> np.ndarray:
        """Fill zero-valued frames by linear interpolation between non-zero ones.

        Frames before the first / after the last non-zero frame take that
        frame's value. The input array is not modified. If every frame is
        zero, a warning is logged and the input is returned unchanged.
        """
        voiced_idxs = np.flatnonzero(f0)
        if voiced_idxs.size == 0:
            logging.warning("All frames seems to be unvoiced.")
            return f0

        # np.interp clamps to the first/last sample outside the known range,
        # which provides the start/end padding, and it also works when only a
        # single frame is non-zero.
        return np.interp(np.arange(f0.shape[0]), voiced_idxs, f0[voiced_idxs])

    @staticmethod
    def _average_by_duration(x: torch.Tensor, d: torch.Tensor) -> torch.Tensor:
        """Average ``x`` over consecutive segments whose lengths are given by ``d``.

        Segments of length zero yield ``0.0``. ``d`` must sum to ``len(x)``.
        """
        assert d.sum() == len(x), (
            f"durations sum to {int(d.sum())} but there are {len(x)} frames"
        )
        # An empty duration vector has no segments; torch.stack([]) would raise.
        if len(d) == 0:
            return x.new_zeros(0)
        # Segment boundaries as plain Python numbers: [0, d0, d0 + d1, ...].
        bounds = F.pad(d.cumsum(dim=0), (1, 0)).tolist()
        x_avg = [
            x[start:end].mean() if end > start else x.new_tensor(0.0)
            for start, end in zip(bounds[:-1], bounds[1:])
        ]
        return torch.stack(x_avg)
2 changes: 1 addition & 1 deletion dataset/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def collate_tts(batch):
# scale spectrograms to -4 <--> 4
# mels = (mels * 8.) - 4

return inputs, ilens, mels, labels, olens, ids, durations, energys, pitches
return inputs, ilens, mels, labels, olens, ids, durations, energys.unsqueeze(-1), pitches.unsqueeze(-1)


class BinnedLengthSampler(Sampler):
Expand Down
4 changes: 3 additions & 1 deletion evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ def evaluate(hp, validloader, model):

with torch.no_grad():
ilens = torch.tensor([x_[-1].shape[0]], dtype=torch.long, device=x_.device)
_, after_outs, d_outs, e_outs, p_outs = model._forward(x_.cuda(), ilens.cuda(), out_length_.cuda(), dur_.cuda(), es=e_.cuda(), ps=p_.cuda(), is_inference=False) # [T, num_mel]
_, after_outs, d_outs, e_outs, p_outs = model._forward(x_.cuda(), ilens.cuda(), out_length_.cuda(),
dur_.cuda(), es=e_.cuda(), ps=p_.cuda(),
is_inference=False) # [T, num_mel]

# e_orig = model.energy_predictor.to_one_hot(e_).squeeze()
# p_orig = model.pitch_predictor.to_one_hot(p_).squeeze()
Expand Down
Loading