rishikksh20 · rishikksh20 · Nov 25, 2020 · Nov 25, 2020 · Nov 27, 2020 · Nov 27, 2020
diff --git a/.gitignore b/.gitignore
@@ -21,3 +21,27 @@ idea/*
 /trace_loss_nvidia.txt
 /conf
 /etc
+.ipynb_checkpoints/Untitled-checkpoint.ipynb
+dataset/audio/__pycache__/__init__.cpython-36.pyc
+*.pyc
+Untitled.ipynb
+mel.npy
+*.png
+*.npy
+Testing/2log_v2/no_exp_before_bins_fs2v2_2_31k_test_tts.wav
+Testing/exp_log/test_tts.wav
+Testing/exp_log_v2/exp_before_bins_fs2v2_2_31k_test_tts.wav
+mel.png
+mel.npy
+Testing/v2_2/test_tts.wav
+*.npy
+*.png
+mel.png
+*.wav
+*.npy
+.ipynb_checkpoints/pitch_cwt-checkpoint.ipynb
+pitch_cwt.ipynb
+*.wav
+Testing/test_tts.wav
+*.wav
+Testing/test_tts.wav
diff --git a/Testing/test_tts.wav b/Testing/test_tts.wav
diff --git a/configs/default.yaml b/configs/default.yaml
@@ -1,6 +1,6 @@
 data:
-  data_dir: 'H:\Deepsync\backup\fastspeech\data\'
-  wav_dir: 'H:\Deepsync\backup\deepsync\LJSpeech-1.1\wavs\'
+  data_dir: './data/LJSpeech/good_file/'
+  wav_dir: '/mnt/Karan/LJSpeech-1.1/wavs/'
   # Compute statistics
   e_mean: 21.578571319580078
   e_std: 18.916799545288086
@@ -10,7 +10,7 @@ data:
   f0_mean: 206.5135564772342
   f0_std:  53.633228905750336
   p_min: 71.0
-  p_max: 676.2260946528305 # 799.8901977539062
+  p_max: 500.0 # 799.8901977539062
   train_filelist: "./filelists/train_filelist.txt"
   valid_filelist: "./filelists/valid_filelist.txt"
   tts_cleaner_names: ['english_cleaners']
@@ -30,6 +30,7 @@ audio:
   bits : 9                            # bit depth of signal
   mu_law : True                       # Recommended to suppress noise if using raw bits in hp.voc_mode below
   peak_norm : False                   # Normalise to the peak of each wav file
+  cwt_bins : 10
 
 
 
@@ -46,7 +47,7 @@ model:
   aheads: 2
   elayers: 4
   eunits: 1024
-  ddim: 384
+  ddim: 256
   dlayers: 4
   dunits: 1024
   positionwise_layer_type : "conv1d" # linear
@@ -110,7 +111,7 @@ train:
   # optimization related
   eos: False #True
   opt: 'noam'
-  accum_grad: 4
+  accum_grad: 1
   grad_clip: 1.0
   weight_decay: 0.001
   patience: 0
@@ -126,7 +127,7 @@ train:
   seed: 1       # random seed number
   resume: ""    # the snapshot path to resume (if set empty, no effect)
   use_phonemes: True
-  batch_size : 16
+  batch_size : 48
   # other
   melgan_vocoder : True
   save_interval : 1000
@@ -135,4 +136,4 @@ train:
   summary_interval : 200
   validation_step : 500
   tts_max_mel_len : 870              # if you have a couple of extremely long spectrograms you might want to use this
-  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
+  tts_bin_lengths : True              # bins the spectrogram lengths before sampling in data loader - speeds up training
diff --git a/core/variance_predictor.py b/core/variance_predictor.py
@@ -2,7 +2,9 @@
 import torch.nn.functional as F
 from typing import Optional
 from core.modules import LayerNorm
-
+#import pycwt
+import numpy as np
+from sklearn import preprocessing
 
 class VariancePredictor(torch.nn.Module):
     def __init__(
@@ -149,7 +151,11 @@ def inference(self, xs: torch.Tensor, alpha: float = 1.0):
 
         """
         out = self.predictor.inference(xs, False, alpha=alpha)
-        return self.to_one_hot(out)  # Need to do One hot code
+        #print(out.shape, type(out))
+        #out = torch.from_numpy(np.load("/results/chkpts/LJ/Fastspeech2_V2/data/energy/LJ001-0001.npy")).cuda()
+        #print(out, "Energy Pricted")
+        out = torch.exp(out)
+        return self.to_one_hot(out), out  # Need to do One hot code
 
     def to_one_hot(self, x):
         # e = de_norm_mean_std(e, hp.e_mean, hp.e_std)
@@ -171,6 +177,7 @@ def __init__(
         min=0,
         max=0,
         n_bins=256,
+        out=5,
     ):
         """Initilize pitch predictor module.
 
@@ -195,9 +202,29 @@ def __init__(
                 )
             ),
         )
-        self.predictor = VariancePredictor(idim)
+        self.offset = offset
+        self.conv = torch.nn.ModuleList()
+        for idx in range(n_layers):
+            in_chans = idim if idx == 0 else n_chans
+            self.conv += [
+                torch.nn.Sequential(
+                    torch.nn.Conv1d(
+                        in_chans,
+                        n_chans,
+                        kernel_size,
+                        stride=1,
+                        padding=(kernel_size - 1) // 2,
+                    ),
+                    torch.nn.ReLU(),
+                    LayerNorm(n_chans),
+                    torch.nn.Dropout(dropout_rate),
+                )
+            ]
+        self.spectrogram_out = torch.nn.Linear(n_chans, out)
+        self.mean = torch.nn.Linear(n_chans, 1)
+        self.std = torch.nn.Linear(n_chans, 1)
 
-    def forward(self, xs: torch.Tensor, x_masks: torch.Tensor):
+    def forward(self, xs: torch.Tensor, olens: torch.Tensor, x_masks: torch.Tensor):
         """Calculate forward propagation.
 
         Args:
@@ -208,9 +235,42 @@ def forward(self, xs: torch.Tensor, x_masks: torch.Tensor):
             Tensor: Batch of predicted durations in log domain (B, Tmax).
 
         """
-        return self.predictor(xs, x_masks)
+        xs = xs.transpose(1, -1)  # (B, idim, Tmax)
+        for f in self.conv:
+            xs = f(xs)  # (B, C, Tmax)
 
-    def inference(self, xs: torch.Tensor, alpha: float = 1.0):
+        # NOTE: calculate in log domain
+        xs = xs.transpose(1, -1)
+        f0_spec = self.spectrogram_out(xs)  # (B, Tmax, out)
+
+        if x_masks is not None:
+            # print("olen:", olens)
+            #f0_spec = f0_spec.transpose(1, -1)
+            # print("F0 spec dimension:", f0_spec.shape)
+            # print("x_masks dimension:", x_masks.shape)
+            f0_spec = f0_spec.masked_fill(x_masks, 0.0)
+            #f0_spec = f0_spec.transpose(1, -1)
+            # print("F0 spec dimension:", f0_spec.shape)
+            #xs = xs.transpose(1, -1)
+            xs = xs.masked_fill(x_masks, 0.0)
+            #xs = xs.transpose(1, -1)
+            # print("xs dimension:", xs.shape)
+        x_avg = xs.sum(dim=1).squeeze(1)
+        # print(x_avg)
+        # print("xs dim :", x_avg.shape)
+        # print("olens ;", olens.shape)
+        if olens is not None:
+            x_avg = x_avg / olens.unsqueeze(1)
+        # print(x_avg)
+        f0_mean = self.mean(x_avg).squeeze(-1)
+        f0_std = self.std(x_avg).squeeze(-1)
+
+        # if x_masks is not None:
+        #     f0_spec = f0_spec.masked_fill(x_masks, 0.0)
+
+        return f0_spec, f0_mean, f0_std
+
+    def inference(self, xs: torch.Tensor, olens = None, alpha: float = 1.0):
         """Inference duration.
 
         Args:
@@ -221,8 +281,14 @@ def inference(self, xs: torch.Tensor, alpha: float = 1.0):
             LongTensor: Batch of predicted durations in linear domain (B, Tmax).
 
         """
-        out = self.predictor.inference(xs, False, alpha=alpha)
-        return self.to_one_hot(out)
+        f0_spec, f0_mean, f0_std = self.forward(xs, olens, x_masks=None)  # (B, Tmax, out)
+        #print(f0_spec)
+        f0_reconstructed = self.inverse(f0_spec, f0_mean, f0_std)
+        #print(f0_reconstructed)
+        #f0_reconstructed = torch.from_numpy(np.load("/results/chkpts/LJ/Fastspeech2_V2/data/pitch/LJ001-0001.npy").reshape(1,-1)).cuda()
+        #print(f0_reconstructed, "Pitch coef output")
+
+        return self.to_one_hot(f0_reconstructed), f0_reconstructed
 
     def to_one_hot(self, x: torch.Tensor):
         # e = de_norm_mean_std(e, hp.e_mean, hp.e_std)
@@ -231,6 +297,24 @@ def to_one_hot(self, x: torch.Tensor):
         quantize = torch.bucketize(x, self.pitch_bins).to(device=x.device)  # .cuda()
         return F.one_hot(quantize.long(), 256).float()
 
+    def inverse(self, Wavelet_lf0, f0_mean, f0_std):
+        """Rebuild one F0 contour, shaped (1, T), from per-scale wavelet coefficients, f0_mean and f0_std."""
+        scales = np.array([0.01, 0.02, 0.04, 0.08, 0.16])  # only the number of scales is used below
+        device = Wavelet_lf0.device  # follow the input's device instead of hard-coding .cuda()
+        # detach(): Tensor.numpy() raises on tensors that still require grad.
+        Wavelet_lf0 = Wavelet_lf0.detach().squeeze(0).cpu().numpy()
+        lf0_rec = np.zeros([Wavelet_lf0.shape[0], len(scales)])
+        for i in range(len(scales)):
+            lf0_rec[:, i] = Wavelet_lf0[:, i] * ((i + 200 + 2.5) ** (-2.5))
+
+        # Sum the weighted scales per frame, then standardise to zero mean / unit variance.
+        lf0_rec_sum_norm = preprocessing.scale(np.sum(lf0_rec, axis=1))
+
+        # De-normalise with the supplied statistics, then exponentiate the result.
+        normed = torch.Tensor(lf0_rec_sum_norm).to(device)
+        f0_reconstructed = torch.exp(normed * f0_std + f0_mean)
+        return f0_reconstructed.reshape(1, -1)
+
 
 class PitchPredictorLoss(torch.nn.Module):
     """Loss function module for duration predictor.

diff --git a/dataset/audio/__init__.py b/dataset/audio/__init__.py