rishikksh20 · rishikksh20 · Sep 10, 2020 · Sep 11, 2020 · Sep 11, 2020 · Sep 11, 2020
diff --git a/configs/default.yaml b/configs/default.yaml
@@ -95,6 +95,10 @@ model:
   guided_attn_loss_lambda: 1.0
 
   ### FastSpeech
+  energy_embed_kernel_size: 1
+  energy_embed_dropout: 0.0
+  pitch_embed_kernel_size: 1
+  pitch_embed_dropout: 0.0
   duration_predictor_layers : 2
   duration_predictor_chans : 256
   duration_predictor_kernel_size : 3
@@ -110,7 +114,7 @@ train:
   # optimization related
   eos: False #True
   opt: 'noam'
-  accum_grad: 4
+  accum_grad: 1
   grad_clip: 1.0
   weight_decay: 0.001
   patience: 0
@@ -126,13 +130,13 @@ train:
   seed: 1       # random seed number
   resume: ""    # the snapshot path to resume (if set empty, no effect)
   use_phonemes: True
-  batch_size : 16
+  batch_size : 48
   # other
   melgan_vocoder : True
   save_interval : 1000
   chkpt_dir : './checkpoints'
   log_dir : './logs'
   summary_interval : 200
-  validation_step : 500
+  validation_step : 1000
   tts_max_mel_len : 870              # if you have a couple of extremely long spectrograms you might want to use this
-  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
+  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
diff --git a/core/variance_predictor.py b/core/variance_predictor.py
@@ -149,7 +149,7 @@ def inference(self, xs: torch.Tensor, alpha: float = 1.0):
 
         """
         out = self.predictor.inference(xs, False, alpha=alpha)
-        return self.to_one_hot(out)  # Need to do One hot code
+        return out  # Need to do One hot code
 
     def to_one_hot(self, x):
         # e = de_norm_mean_std(e, hp.e_mean, hp.e_std)
@@ -222,7 +222,7 @@ def inference(self, xs: torch.Tensor, alpha: float = 1.0):
 
         """
         out = self.predictor.inference(xs, False, alpha=alpha)
-        return self.to_one_hot(out)
+        return out
 
     def to_one_hot(self, x: torch.Tensor):
         # e = de_norm_mean_std(e, hp.e_mean, hp.e_std)

diff --git a/dataset/audio/__init__.py b/dataset/audio/__init__.py
diff --git a/dataset/audio_processing.py → dataset/audio/audio_processing.py b/dataset/audio_processing.py → dataset/audio/audio_processing.py
diff --git a/dataset/audio/energy.py b/dataset/audio/energy.py
@@ -0,0 +1,90 @@
+from typing import Any
+from typing import Dict
+from typing import Optional
+from typing import Tuple
+import torch
+import torch.nn.functional as F
+from typeguard import check_argument_types
+from utils.stft import TacotronSTFT
+
+class Energy:
+    """Energy extractor working on a precomputed magnitude spectrogram.
+
+    Energy is the L2 norm of the spectrogram taken over its first axis and,
+    when ``use_token_averaged_energy`` is set, averaged over the frames that
+    belong to each token.
+
+    Args:
+        fs: Sampling rate; passed to ``TacotronSTFT`` as ``sampling_rate``.
+        n_fft: FFT size; passed to ``TacotronSTFT`` as ``filter_length``.
+        win_length: Window length; passed to ``TacotronSTFT``.
+        hop_length: Hop length; passed to ``TacotronSTFT``.
+        window: Window name. It is only stored and reported by
+            ``get_parameters``; it is not passed to ``TacotronSTFT`` here.
+        num_mel: Passed to ``TacotronSTFT`` as ``n_mel_channels``.
+        fmin: Passed to ``TacotronSTFT`` as ``mel_fmin``.
+        fmax: Passed to ``TacotronSTFT`` as ``mel_fmax``.
+        use_token_averaged_energy: Average the frame-level energy over the
+            token durations given to ``forward``.
+    """
+
+    def __init__(
+        self,
+        fs: int = 22050,
+        n_fft: int = 1024,
+        win_length: int = 1024,
+        hop_length: int = 256,
+        window: Optional[str] = "hann",
+        num_mel: int = 80,
+        fmin: int = 0,
+        fmax: int = 8000,
+        use_token_averaged_energy: bool = True,
+    ):
+        assert check_argument_types()
+        super().__init__()
+        self.fs = fs
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = window
+        self.use_token_averaged_energy = use_token_averaged_energy
+
+        # Kept as an attribute for callers; forward() itself expects an
+        # already-computed magnitude and does not run this transform.
+        self.stft = TacotronSTFT(
+            filter_length=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            n_mel_channels=num_mel,
+            sampling_rate=fs,
+            mel_fmin=fmin,
+            mel_fmax=fmax,
+        )
+
+    def output_size(self) -> int:
+        """Return the feature dimension of the extracted energy (always 1)."""
+        return 1
+
+    def get_parameters(self) -> Dict[str, Any]:
+        """Return the stored extraction settings as a dictionary."""
+        return dict(
+            fs=self.fs,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            window=self.window,
+            win_length=self.win_length,
+            use_token_averaged_energy=self.use_token_averaged_energy,
+        )
+
+    def forward(
+        self,
+        mag: torch.Tensor,
+        durations: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Compute the energy of one utterance.
+
+        Args:
+            mag: Magnitude spectrogram; the norm is taken over dim 0, so a
+                ``(h, T)`` input yields ``T`` frame energies.
+            durations: Integer number of frames per token. Required when
+                ``use_token_averaged_energy`` is True; ignored otherwise.
+
+        Returns:
+            A 1-D tensor with one energy value per frame, or one per token
+            when token averaging is enabled.
+
+        Raises:
+            ValueError: If token averaging is enabled and ``durations`` is
+                missing or does not sum to the number of frames.
+        """
+        energy = torch.norm(mag, dim=0)
+
+        # (Optional): Average by duration to calculate token-wise energy
+        if self.use_token_averaged_energy:
+            if durations is None:
+                raise ValueError(
+                    "durations must be provided when "
+                    "use_token_averaged_energy is True"
+                )
+            energy = self._average_by_duration(energy, durations)
+
+        return energy
+
+    @staticmethod
+    def _average_by_duration(x: torch.Tensor, d: torch.Tensor) -> torch.Tensor:
+        """Average consecutive runs of ``x`` whose lengths are given by ``d``.
+
+        Tokens with a duration of zero get the value 0.0.
+
+        Raises:
+            ValueError: If the durations do not sum to ``len(x)``.
+        """
+        total = int(d.sum())
+        if total != len(x):
+            # An explicit raise (instead of assert) survives ``python -O``.
+            raise ValueError(
+                f"durations sum to {total} but there are {len(x)} frames"
+            )
+        if d.numel() == 0:
+            # torch.stack() rejects an empty list, so answer directly.
+            return x.new_zeros(0)
+        # Plain Python ints make cheaper slice bounds than 0-dim tensors.
+        bounds = F.pad(d.cumsum(dim=0), (1, 0)).tolist()
+        x_avg = [
+            x[start:end].mean() if end > start else x.new_tensor(0.0)
+            for start, end in zip(bounds[:-1], bounds[1:])
+        ]
+        return torch.stack(x_avg)
+
+    @staticmethod
+    def _adjust_num_frames(x: torch.Tensor, num_frames: torch.Tensor) -> torch.Tensor:
+        """Zero-pad or truncate ``x`` at the end to exactly ``num_frames`` items."""
+        # Accept either an int or a one-element tensor; F.pad wants ints.
+        num_frames = int(num_frames)
+        if num_frames > len(x):
+            x = F.pad(x, (0, num_frames - len(x)))
+        elif num_frames < len(x):
+            x = x[:num_frames]
+        return x
diff --git a/dataset/audio/pitch.py b/dataset/audio/pitch.py
@@ -0,0 +1,155 @@
+"""F0 extractor using DIO + Stonemask algorithm."""
+
+import logging
+
+from typing import Any
+from typing import Dict
+from typing import Optional
+from typing import Tuple
+import numpy as np
+import pyworld
+import torch
+import torch.nn.functional as F
+
+from scipy.interpolate import interp1d
+from typeguard import check_argument_types
+
+
+class Dio:
+    """F0 estimation with the DIO + Stonemask algorithm.
+
+    This is an F0 extractor based on the DIO + Stonemask algorithm introduced in
+    `WORLD: a vocoder-based high-quality speech synthesis system for real-time
+    applications`_.
+
+    .. _`WORLD: a vocoder-based high-quality speech synthesis system for real-time
+        applications`: https://doi.org/10.1587/transinf.2015EDP7457
+
+    Note:
+        This module is based on a NumPy implementation. Therefore, the
+        computational graph is not connected.
+
+    Todo:
+        Replace this module with a PyTorch-based implementation.
+
+    Args:
+        fs: Sampling rate handed to pyworld.
+        n_fft: FFT size; only stored and reported by ``get_parameters``.
+        hop_length: Hop length in samples; together with ``fs`` it sets the
+            analysis frame period in milliseconds.
+        f0min: Passed to ``pyworld.dio`` as ``f0_floor``.
+        f0max: Passed to ``pyworld.dio`` as ``f0_ceil``.
+        use_token_averaged_f0: Average F0 over the token durations given to
+            ``forward``.
+        use_continuous_f0: Fill unvoiced (zero) frames by linear interpolation.
+        use_log_f0: Replace non-zero F0 values by their natural logarithm.
+    """
+
+    def __init__(
+        self,
+        fs: int = 22050,
+        n_fft: int = 1024,
+        hop_length: int = 256,
+        f0min: Optional[int] = 71,
+        f0max: Optional[int] = 400,
+        use_token_averaged_f0: bool = True,
+        use_continuous_f0: bool = True,
+        use_log_f0: bool = True,
+    ):
+        assert check_argument_types()
+        super().__init__()
+        self.fs = fs
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        # Frame period in milliseconds, as expected by pyworld.dio.
+        self.frame_period = 1000 * hop_length / fs
+        self.f0min = f0min
+        self.f0max = f0max
+        self.use_token_averaged_f0 = use_token_averaged_f0
+        self.use_continuous_f0 = use_continuous_f0
+        self.use_log_f0 = use_log_f0
+
+    def output_size(self) -> int:
+        """Return the feature dimension of the extracted pitch (always 1)."""
+        return 1
+
+    def get_parameters(self) -> Dict[str, Any]:
+        """Return the stored extraction settings as a dictionary."""
+        return dict(
+            fs=self.fs,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            f0min=self.f0min,
+            f0max=self.f0max,
+            use_token_averaged_f0=self.use_token_averaged_f0,
+            use_continuous_f0=self.use_continuous_f0,
+            use_log_f0=self.use_log_f0,
+        )
+
+    def forward(
+        self,
+        input: torch.Tensor,
+        feats_lengths: Optional[torch.Tensor] = None,
+        durations: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Extract the pitch contour of one utterance.
+
+        Args:
+            input: Waveform of a single utterance, shape ``[T,]``.
+            feats_lengths: Optional target number of frames (an int or a
+                one-element tensor). When given, the contour is zero-padded
+                or truncated to that length.
+            durations: Integer number of frames per token. Required when
+                ``use_token_averaged_f0`` is True; ignored otherwise.
+
+        Returns:
+            A 1-D tensor with one F0 value per frame, or one per token when
+            token averaging is enabled.
+
+        Raises:
+            ValueError: If ``feats_lengths`` holds more than one value, or if
+                token averaging is enabled and ``durations`` is missing or
+                does not sum to the number of frames.
+        """
+        pitch = self._calculate_f0(input)
+
+        # (Optional): Adjust length to match with the mel-spectrogram.
+        # ``pitch`` is one 1-D contour, so it is adjusted as a whole rather
+        # than zipped element-wise against a batch of lengths.
+        if feats_lengths is not None:
+            lengths = torch.as_tensor(feats_lengths).reshape(-1)
+            if lengths.numel() != 1:
+                raise ValueError(
+                    "feats_lengths must hold exactly one length for a "
+                    "single utterance"
+                )
+            pitch = self._adjust_num_frames(pitch, int(lengths[0])).view(-1)
+
+        # (Optional): Average by duration to calculate token-wise f0
+        if self.use_token_averaged_f0:
+            if durations is None:
+                raise ValueError(
+                    "durations must be provided when "
+                    "use_token_averaged_f0 is True"
+                )
+            pitch = self._average_by_duration(pitch, durations)
+
+        return pitch
+
+    def _calculate_f0(self, input: torch.Tensor) -> torch.Tensor:
+        """Run DIO + Stonemask on ``input`` and return F0 as a float tensor."""
+        # detach() lets tensors that require grad be converted to NumPy.
+        x = input.detach().cpu().numpy().astype(np.double)
+        f0, timeaxis = pyworld.dio(
+            x,
+            self.fs,
+            f0_floor=self.f0min,
+            f0_ceil=self.f0max,
+            frame_period=self.frame_period,
+        )
+        f0 = pyworld.stonemask(x, f0, timeaxis, self.fs)
+
+        if self.use_continuous_f0:
+            f0 = self._convert_to_continuous_f0(f0)
+        if self.use_log_f0:
+            # Zeros mark unvoiced frames and are left untouched.
+            voiced = f0 != 0
+            f0[voiced] = np.log(f0[voiced])
+        return input.new_tensor(f0.reshape(-1), dtype=torch.float)
+
+    @staticmethod
+    def _adjust_num_frames(x: torch.Tensor, num_frames: torch.Tensor) -> torch.Tensor:
+        """Zero-pad or truncate ``x`` at the end to exactly ``num_frames`` items."""
+        # Accept either an int or a one-element tensor; F.pad wants ints.
+        num_frames = int(num_frames)
+        if num_frames > len(x):
+            x = F.pad(x, (0, num_frames - len(x)))
+        elif num_frames < len(x):
+            x = x[:num_frames]
+        return x
+
+    @staticmethod
+    def _convert_to_continuous_f0(f0: np.ndarray) -> np.ndarray:
+        """Fill zero (unvoiced) frames of a 1-D F0 contour by interpolation.
+
+        Leading and trailing zeros take the first and last non-zero value;
+        inner gaps are filled linearly. The input array is not modified. If
+        every frame is zero, the input is returned unchanged.
+        """
+        voiced_idxs = np.flatnonzero(f0 != 0)
+        if voiced_idxs.size == 0:
+            logging.warning("All frames seems to be unvoiced.")
+            return f0
+
+        # Work on a copy so the caller's array keeps its zeros.
+        f0 = f0.copy()
+
+        # padding start and end of f0 sequence
+        first_idx, last_idx = voiced_idxs[0], voiced_idxs[-1]
+        f0[:first_idx] = f0[first_idx]
+        f0[last_idx:] = f0[last_idx]
+
+        # get non-zero frame index (now including the padded edges)
+        nonzero_idxs = np.flatnonzero(f0 != 0)
+
+        # perform linear interpolation
+        interp_fn = interp1d(nonzero_idxs, f0[nonzero_idxs])
+        return interp_fn(np.arange(0, f0.shape[0]))
+
+    @staticmethod
+    def _average_by_duration(x: torch.Tensor, d: torch.Tensor) -> torch.Tensor:
+        """Average consecutive runs of ``x`` whose lengths are given by ``d``.
+
+        Tokens with a duration of zero get the value 0.0.
+
+        Raises:
+            ValueError: If the durations do not sum to ``len(x)``.
+        """
+        total = int(d.sum())
+        if total != len(x):
+            # An explicit raise (instead of assert) survives ``python -O``.
+            raise ValueError(
+                f"durations sum to {total} but there are {len(x)} frames"
+            )
+        if d.numel() == 0:
+            # torch.stack() rejects an empty list, so answer directly.
+            return x.new_zeros(0)
+        # Plain Python ints make cheaper slice bounds than 0-dim tensors.
+        bounds = F.pad(d.cumsum(dim=0), (1, 0)).tolist()
+        x_avg = [
+            x[start:end].mean() if end > start else x.new_tensor(0.0)
+            for start, end in zip(bounds[:-1], bounds[1:])
+        ]
+        return torch.stack(x_avg)
diff --git a/dataset/dataloader.py b/dataset/dataloader.py
@@ -115,7 +115,7 @@ def collate_tts(batch):
     # scale spectrograms to -4 <--> 4
     # mels = (mels * 8.) - 4
 
-    return inputs, ilens, mels, labels, olens, ids, durations, energys, pitches
+    return inputs, ilens, mels, labels, olens, ids, durations, energys.unsqueeze(-1), pitches.unsqueeze(-1)
 
 
 class BinnedLengthSampler(Sampler):

diff --git a/evaluation.py b/evaluation.py
@@ -21,7 +21,9 @@ def evaluate(hp, validloader, model):
 
         with torch.no_grad():
             ilens = torch.tensor([x_[-1].shape[0]], dtype=torch.long, device=x_.device)
-            _, after_outs, d_outs, e_outs, p_outs = model._forward(x_.cuda(), ilens.cuda(), out_length_.cuda(), dur_.cuda(), es=e_.cuda(), ps=p_.cuda(), is_inference=False)  # [T, num_mel]
+            _, after_outs, d_outs, e_outs, p_outs = model._forward(x_.cuda(), ilens.cuda(), out_length_.cuda(),
+                                                                   dur_.cuda(), es=e_.cuda(), ps=p_.cuda(),
+                                                                   is_inference=False)  # [T, num_mel]
 
             # e_orig = model.energy_predictor.to_one_hot(e_).squeeze()
             # p_orig = model.pitch_predictor.to_one_hot(p_).squeeze()