Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Adibian committed Feb 28, 2022
0 parents commit c70b6ed
Show file tree
Hide file tree
Showing 78 changed files with 7,087 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
*.pyc
*.aux
*.log
*.out
*.synctex.gz
*.suo
*__pycache__
*.idea
*.ipynb_checkpoints
*.pickle
*.npy
*.blg
*.bbl
*.bcf
*.toc
*.wav
*.sh
encoder/saved_models/*
synthesizer/saved_models/*
vocoder/saved_models/*
saved_models/*
dataset/*
208 changes: 208 additions & 0 deletions demo_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
import argparse
import os
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch

from encoder import inference as encoder
from encoder.params_model import model_embedding_size as speaker_embedding_size
from synthesizer.inference import Synthesizer
from utils.argutils import print_args
from utils.default_models import ensure_default_models
from vocoder import inference as vocoder


if __name__ == '__main__':
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("-e", "--enc_model_fpath", type=Path,
default="saved_models/default/encoder.pt",
help="Path to a saved encoder")
parser.add_argument("-s", "--syn_model_fpath", type=Path,
default="saved_models/default/synthesizer.pt",
help="Path to a saved synthesizer")
parser.add_argument("-v", "--voc_model_fpath", type=Path,
default="saved_models/default/vocoder.pt",
help="Path to a saved vocoder")
parser.add_argument("--cpu", action="store_true", help=\
"If True, processing is done on CPU, even when a GPU is available.")
parser.add_argument("--no_sound", action="store_true", help=\
"If True, audio won't be played.")
parser.add_argument("--seed", type=int, default=None, help=\
"Optional random number seed value to make toolbox deterministic.")
args = parser.parse_args()
arg_dict = vars(args)
print_args(args, parser)

# Hide GPUs from Pytorch to force CPU processing
if arg_dict.pop("cpu"):
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

print("Running a test of your configuration...\n")

if torch.cuda.is_available():
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
## Print some environment information (for debugging purposes)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
"%.1fGb total memory.\n" %
(torch.cuda.device_count(),
device_id,
gpu_properties.name,
gpu_properties.major,
gpu_properties.minor,
gpu_properties.total_memory / 1e9))
else:
print("Using CPU for inference.\n")

## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
ensure_default_models(Path("saved_models"))
encoder.load_model(args.enc_model_fpath)
synthesizer = Synthesizer(args.syn_model_fpath)
vocoder.load_model(args.voc_model_fpath)


## Run a test
print("Testing your configuration with small inputs.")
# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
# sampling rate, which may differ.
# If you're unfamiliar with digital audio, know that it is encoded as an array of floats
# (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
# The sampling rate is the number of values (samples) recorded per second, it is set to
# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
# to an audio of 1 second.
print("\tTesting the encoder...")
encoder.embed_utterance(np.zeros(encoder.sampling_rate))

# Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
# returns, but here we're going to make one ourselves just for the sake of showing that it's
# possible.
embed = np.random.rand(speaker_embedding_size)
# Embeddings are L2-normalized (this isn't important here, but if you want to make your own
# embeddings it will be).
embed /= np.linalg.norm(embed)
# The synthesizer can handle multiple inputs with batching. Let's create another embedding to
# illustrate that
embeds = [embed, np.zeros(speaker_embedding_size)]
texts = ["test 1", "test 2"]
print("\tTesting the synthesizer... (loading the model will output a lot of text)")
mels = synthesizer.synthesize_spectrograms(texts, embeds)

# The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
# can concatenate the mel spectrograms to a single one.
mel = np.concatenate(mels, axis=1)
# The vocoder can take a callback function to display the generation. More on that later. For
# now we'll simply hide it like this:
no_action = lambda *args: None
print("\tTesting the vocoder...")
# For the sake of making this test short, we'll pass a short target length. The target length
# is the length of the wav segments that are processed in parallel. E.g. for audio sampled
# at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
# 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
# that has a detrimental effect on the quality of the audio. The default parameters are
# recommended in general.
vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)

print("All test passed! You can now synthesize speech.\n\n")


## Interactive speech generation
print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
"show how you can interface this project easily with your own. See the source code for "
"an explanation of what is happening.\n")

print("Interactive generation loop")
num_generated = 0
while True:
try:
# Get the reference audio filepath
message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
"wav, m4a, flac, ...):\n"
in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))

## Computing the embedding
# First, we load the wav using the function that the speaker encoder provides. This is
# important: there is preprocessing that must be applied.

# The following two methods are equivalent:
# - Directly load from the filepath:
preprocessed_wav = encoder.preprocess_wav(in_fpath)
# - If the wav is already loaded:
original_wav, sampling_rate = librosa.load(str(in_fpath))
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
print("Loaded file succesfully")

# Then we derive the embedding. There are many functions and parameters that the
# speaker encoder interfaces. These are mostly for in-depth research. You will typically
# only use this function (with its default parameters):
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")


## Generating the spectrogram
text = input("Write a sentence (+-20 words) to be synthesized:\n")

# If seed is specified, reset torch seed and force synthesizer reload
if args.seed is not None:
torch.manual_seed(args.seed)
synthesizer = Synthesizer(args.syn_model_fpath)

# The synthesizer works in batch, so you need to put your data in a list or numpy array
texts = [text]
embeds = [embed]
# If you know what the attention layer alignments are, you can retrieve them here by
# passing return_alignments=True
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")


## Generating the waveform
print("Synthesizing the waveform:")

# If seed is specified, reset torch seed and reload vocoder
if args.seed is not None:
torch.manual_seed(args.seed)
vocoder.load_model(args.voc_model_fpath)

# Synthesizing the waveform is fairly straightforward. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
generated_wav = vocoder.infer_waveform(spec)


## Post-generation
# There's a bug with sounddevice that makes the audio cut one second earlier, so we
# pad it.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

# Trim excess silences to compensate for gaps in spectrograms (issue #53)
generated_wav = encoder.preprocess_wav(generated_wav)

# Play the audio (non-blocking)
if not args.no_sound:
import sounddevice as sd
try:
sd.stop()
sd.play(generated_wav, synthesizer.sample_rate)
except sd.PortAudioError as e:
print("\nCaught exception: %s" % repr(e))
print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
except:
raise

# Save it on the disk
filename = "demo_output_%02d.wav" % num_generated
print(generated_wav.dtype)
sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
num_generated += 1
print("\nSaved output as %s\n\n" % filename)


except Exception as e:
print("Caught exception: %s" % repr(e))
print("Restarting\n")
37 changes: 37 additions & 0 deletions demo_toolbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import argparse
import os
from pathlib import Path

from toolbox import Toolbox
from utils.argutils import print_args
from utils.default_models import ensure_default_models


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Runs the toolbox.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

parser.add_argument("-d", "--datasets_root", type=Path, help= \
"Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
"supported datasets.", default=None)
parser.add_argument("-m", "--models_dir", type=Path, default="saved_models",
help="Directory containing all saved models")
parser.add_argument("--cpu", action="store_true", help=\
"If True, all inference will be done on CPU")
parser.add_argument("--seed", type=int, default=None, help=\
"Optional random number seed value to make toolbox deterministic.")
args = parser.parse_args()
arg_dict = vars(args)
print_args(args, parser)

# Hide GPUs from Pytorch to force CPU processing
if arg_dict.pop("cpu"):
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Remind the user to download pretrained models if needed
ensure_default_models(args.models_dir)

# Launch the toolbox
Toolbox(**arg_dict)
Empty file added encoder/__init__.py
Empty file.
117 changes: 117 additions & 0 deletions encoder/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from scipy.ndimage.morphology import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
import numpy as np
import librosa
import struct

try:
import webrtcvad
except:
warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
webrtcvad=None

int16_max = (2 ** 15) - 1


def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
source_sr: Optional[int] = None,
normalize: Optional[bool] = True,
trim_silence: Optional[bool] = True):
"""
Applies the preprocessing operations used in training the Speaker Encoder to a waveform
either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
:param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
just .wav), either the waveform as a numpy array of floats.
:param source_sr: if passing an audio waveform, the sampling rate of the waveform before
preprocessing. After preprocessing, the waveform's sampling rate will match the data
hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
this argument will be ignored.
"""
# Load the wav from disk if needed
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
else:
wav = fpath_or_wav

# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, source_sr, sampling_rate)

# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
if webrtcvad and trim_silence:
wav = trim_long_silences(wav)

return wav


def wav_to_mel_spectrogram(wav):
"""
Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
Note: this not a log-mel spectrogram.
"""
frames = librosa.feature.melspectrogram(
wav,
sampling_rate,
n_fft=int(sampling_rate * mel_window_length / 1000),
hop_length=int(sampling_rate * mel_window_step / 1000),
n_mels=mel_n_channels
)
return frames.astype(np.float32).T


def trim_long_silences(wav):
"""
Ensures that segments without voice in the waveform remain no longer than a
threshold determined by the VAD parameters in params.py.
:param wav: the raw waveform as a numpy array of floats
:return: the same waveform with silences trimmed away (length <= original wav length)
"""
# Compute the voice detection window size
samples_per_window = (vad_window_length * sampling_rate) // 1000

# Trim the end of the audio to have a multiple of the window size
wav = wav[:len(wav) - (len(wav) % samples_per_window)]

# Convert the float waveform to 16-bit mono PCM
pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

# Perform voice activation detection
voice_flags = []
vad = webrtcvad.Vad(mode=3)
for window_start in range(0, len(wav), samples_per_window):
window_end = window_start + samples_per_window
voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
sample_rate=sampling_rate))
voice_flags = np.array(voice_flags)

# Smooth the voice detection with a moving average
def moving_average(array, width):
array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
ret = np.cumsum(array_padded, dtype=float)
ret[width:] = ret[width:] - ret[:-width]
return ret[width - 1:] / width

audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)

# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
audio_mask = np.repeat(audio_mask, samples_per_window)

return wav[audio_mask == True]


def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
if increase_only and decrease_only:
raise ValueError("Both increase only and decrease only are set")
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
return wav
return wav * (10 ** (dBFS_change / 20))
Loading

0 comments on commit c70b6ed

Please sign in to comment.