Skip to content

Commit

Permalink
congratulations for our new ACL 2022 work. & some updates
Browse files Browse the repository at this point in the history
  • Loading branch information
MoonInTheRiver committed Mar 1, 2022
1 parent b7c14f1 commit fa17f25
Show file tree
Hide file tree
Showing 7 changed files with 200 additions and 10 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ This repository is the official PyTorch implementation of our AAAI-2022 [paper](
</table>

:tada: :tada: :tada: **Updates**:
- Feb.21, 2022: update [MIDI](usr/configs/midi/readme.md) version SVS.
- Mar.1, 2022: [NeuralSVB](https://github.com/MoonInTheRiver/NeuralSVB), for singing voice beautifying, has been released :sparkles: :sparkles: :sparkles:.
- Feb.13, 2022: [NATSpeech](https://github.com/NATSpeech/NATSpeech), the improved code framework, which contains the implementations of DiffSpeech and our NeurIPS-2021 work [PortaSpeech](https://openreview.net/forum?id=xmJsuh8xlq), has been released :sparkles: :sparkles: :sparkles:.
- Jan.29, 2022: support [MIDI](usr/configs/midi/readme.md) version SVS.
- Jan.29, 2022: support [MIDI](usr/configs/midi/readme.md) version SVS. **Keep Updating**. :construction: :pick: :hammer_and_wrench:
- Jan.13, 2022: support SVS, release PopCS dataset.
- Dec.19, 2021: support TTS. [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)

:rocket: **News**:
- Feb.24, 2022: Our new work, NeuralSVB was accepted by ACL-2022. [Demo Page](https://neuralsvb.github.io).
- Feb.24, 2022: Our new work, NeuralSVB, was accepted by ACL-2022 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2202.13277). [Demo Page](https://neuralsvb.github.io).
- Dec.01, 2021: DiffSinger was accepted by AAAI-2022.
- Sep.29, 2021: Our recent work `PortaSpeech: Portable and High-Quality Generative Text-to-Speech` was accepted by NeurIPS-2021 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2109.15166) .
- May.06, 2021: We submitted DiffSinger to Arxiv [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446).
Expand Down
113 changes: 113 additions & 0 deletions modules/commons/espnet_positional_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import math
import torch


class PositionalEncoding(torch.nn.Module):
    """Sinusoidal positional encoding added to a scaled input.

    The sinusoid table is kept in ``self.pe`` as a plain tensor attribute
    (it is assigned directly, not registered as a buffer or parameter), so
    ``extend_pe`` is responsible for keeping its length, dtype and device in
    step with the inputs it sees.

    Args:
        d_model (int): Embedding dimension. Both even and odd values are
            supported.
        dropout_rate (float): Dropout rate.
        max_len (int): Number of positions precomputed at construction time.
        reverse (bool): Whether to reverse the input position, i.e. store the
            table in descending position order.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
        """Construct a PositionalEncoding object and precompute the table."""
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.reverse = reverse
        # The input is multiplied by sqrt(d_model) before the table is added.
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        # A (1, max_len) view of a scalar is enough to convey the wanted
        # length, dtype and device without allocating max_len elements.
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x):
        """Make sure ``self.pe`` covers ``x.size(1)`` positions.

        If the cached table is already long enough it is only moved to the
        dtype/device of ``x`` when they differ; otherwise the table is rebuilt
        from scratch for exactly ``x.size(1)`` positions.

        Args:
            x (torch.Tensor): Tensor whose second dimension is the required
                number of positions and whose dtype/device the table follows.
        """
        if self.pe is not None:
            if self.pe.size(1) >= x.size(1):
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        pe = torch.zeros(x.size(1), self.d_model)
        if self.reverse:
            # Descending positions: row 0 holds the largest position index.
            position = torch.arange(
                x.size(1) - 1, -1, -1.0, dtype=torch.float32
            ).unsqueeze(1)
        else:
            position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angles; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : self.d_model // 2])
        pe = pe.unsqueeze(0)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor):
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`), computed as
            ``dropout(x * sqrt(d_model) + pe[:, :time])``.
        """
        self.extend_pe(x)
        x = x * self.xscale + self.pe[:, : x.size(1)]
        return self.dropout(x)


class ScaledPositionalEncoding(PositionalEncoding):
    """Positional encoding whose table is weighted by a learnable scalar.

    See Sec. 3.2 https://arxiv.org/abs/1809.08895

    Unlike the parent class, ``forward`` does not multiply the input by
    ``sqrt(d_model)``; it adds ``alpha * pe`` to the unscaled input.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Initialize class and create the learnable scale ``alpha`` (1.0)."""
        super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
        self.alpha = torch.nn.Parameter(torch.tensor(1.0))

    def reset_parameters(self):
        """Reset ``alpha`` to 1.0.

        The value is written in place so the parameter keeps its current
        device and dtype; rebinding ``alpha.data`` to a freshly created tensor
        would swap in a default-dtype CPU tensor instead.
        """
        with torch.no_grad():
            self.alpha.fill_(1.0)

    def forward(self, x):
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`), computed as
            ``dropout(x + alpha * pe[:, :time])``.
        """
        self.extend_pe(x)
        x = x + self.alpha * self.pe[:, : x.size(1)]
        return self.dropout(x)


class RelPositionalEncoding(PositionalEncoding):
    """Positional encoding backed by a table stored in descending position order.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Set up the parent encoding with ``reverse=True``."""
        super().__init__(d_model, dropout_rate, max_len=max_len, reverse=True)

    def forward(self, x):
        """Scale the input, then add the positional table to it.

        Dropout is applied to the scaled input and to the positional slice
        separately before they are summed, so a single tensor is returned.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`), equal to
            ``dropout(x * sqrt(d_model)) + dropout(pe[:, :time])``.
        """
        self.extend_pe(x)
        num_steps = x.size(1)
        # Dropout is drawn for the input first and for the table second.
        scaled_input = self.dropout(x * self.xscale)
        position_slice = self.dropout(self.pe[:, :num_steps])
        return scaled_input + position_slice
7 changes: 5 additions & 2 deletions modules/diffsinger_midi/fs2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ def forward_embedding(self, txt_tokens, midi_embedding, midi_dur_embedding, slur
x = self.embed_scale * self.embed_tokens(txt_tokens)
x = x + midi_embedding + midi_dur_embedding + slur_embedding
if hparams['use_pos_embed']:
positions = self.embed_positions(txt_tokens)
x = x + positions
if hparams.get('rel_pos') is not None and hparams['rel_pos']:
x = self.embed_positions(x)
else:
positions = self.embed_positions(txt_tokens)
x = x + positions
x = F.dropout(x, p=self.dropout, training=self.training)
return x

Expand Down
11 changes: 7 additions & 4 deletions modules/fastspeech/tts_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch.nn as nn
from torch.nn import functional as F

from modules.commons.common_layers import ConvNorm, Embedding
from modules.commons.espnet_positional_embedding import RelPositionalEncoding
from modules.commons.common_layers import SinusoidalPositionalEmbedding, Linear, EncSALayer, DecSALayer, BatchNorm1dTBC
from utils.hparams import hparams

Expand Down Expand Up @@ -317,9 +317,12 @@ def __init__(self, embed_tokens, hidden_size=None, num_layers=None, kernel_size=
self.embed_tokens = embed_tokens
self.embed_scale = math.sqrt(hidden_size)
self.padding_idx = 0
self.embed_positions = SinusoidalPositionalEmbedding(
hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS,
)
if hparams.get('rel_pos') is not None and hparams['rel_pos']:
self.embed_positions = RelPositionalEncoding(hidden_size, dropout_rate=0.0)
else:
self.embed_positions = SinusoidalPositionalEmbedding(
hidden_size, self.padding_idx, init_size=DEFAULT_MAX_TARGET_POSITIONS,
)

def forward(self, txt_tokens):
"""
Expand Down
2 changes: 1 addition & 1 deletion usr/configs/midi/e2e/opencpop/ds100_256.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ lambda_f0: 0.
lambda_uv: 0.
predictor_layers: 5
pe_enable: true
pe_ckpt: 'checkpoints/0102_xiaoma_pe'
pe_ckpt: 'checkpoints/0102_xiaoma_pe_256'
35 changes: 35 additions & 0 deletions usr/configs/midi/e2e/opencpop/ds100_adj.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# NOTE(review): `rel_pos` is not set in this file. Unless one of the base
# configs sets it, the encoder check `hparams.get('rel_pos')` in
# modules/fastspeech/tts_modules.py falls through to
# SinusoidalPositionalEmbedding — confirm against the base configs.
base_config:
- usr/configs/popcs_ds_beta6.yaml
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml

binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'

#switch_midi2f0_step: 174000
use_midi: true # for midi exp
use_gt_dur: false # for further midi exp
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
dur_predictor_layers: 5 # *


fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerMIDITask

K_step: 100
max_tokens: 40000
max_updates: 160000
gaussian_start: True

use_pitch_embed: false
use_gt_f0: false # for midi exp

lambda_f0: 0.
lambda_uv: 0.
dilation_cycle_length: 4 # *
predictor_layers: 5
pe_enable: true
pe_ckpt: 'checkpoints/0102_xiaoma_pe'
36 changes: 36 additions & 0 deletions usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
base_config:
- usr/configs/popcs_ds_beta6.yaml
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml

binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'

#switch_midi2f0_step: 174000
use_midi: true # for midi exp
use_gt_dur: false # for further midi exp
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
dur_predictor_layers: 5 # *


fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerMIDITask

K_step: 100
max_tokens: 40000
max_updates: 160000
gaussian_start: True

use_pitch_embed: false
use_gt_f0: false # for midi exp

lambda_f0: 0.
lambda_uv: 0.
dilation_cycle_length: 4 # *
# A truthy rel_pos makes the encoder in modules/fastspeech/tts_modules.py build
# RelPositionalEncoding instead of SinusoidalPositionalEmbedding, and makes
# forward_embedding in modules/diffsinger_midi/fs2.py apply it to the summed
# embeddings rather than to txt_tokens.
rel_pos: true
predictor_layers: 5
pe_enable: true
pe_ckpt: 'checkpoints/0102_xiaoma_pe'

0 comments on commit fa17f25

Please sign in to comment.