forked from Adibian/Persian-MultiSpeaker-Tacotron2
Commit c70b6ed (0 parents)
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 78 changed files with 7,087 additions and 0 deletions.
.gitignore
@@ -0,0 +1,22 @@
*.pyc
*.aux
*.log
*.out
*.synctex.gz
*.suo
*__pycache__
*.idea
*.ipynb_checkpoints
*.pickle
*.npy
*.blg
*.bbl
*.bcf
*.toc
*.wav
*.sh
encoder/saved_models/*
synthesizer/saved_models/*
vocoder/saved_models/*
saved_models/*
dataset/*
demo_cli.py
@@ -0,0 +1,208 @@
import argparse
import os
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch

from encoder import inference as encoder
from encoder.params_model import model_embedding_size as speaker_embedding_size
from synthesizer.inference import Synthesizer
from utils.argutils import print_args
from utils.default_models import ensure_default_models
from vocoder import inference as vocoder


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="saved_models/default/encoder.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_fpath", type=Path,
                        default="saved_models/default/synthesizer.pt",
                        help="Path to a saved synthesizer")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="saved_models/default/vocoder.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, processing is done on CPU, even when a GPU is available.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    args = parser.parse_args()
    arg_dict = vars(args)
    print_args(args, parser)

    # Hide GPUs from PyTorch to force CPU processing
    if arg_dict.pop("cpu"):
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    print("Running a test of your configuration...\n")

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    ensure_default_models(Path("saved_models"))
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_fpath)
    vocoder.load_model(args.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the
    # encoder's sampling rate, which may differ from that of your audio files.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second; it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always
    # correspond to 1 second of audio.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
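    # (Added illustration, not in the original script: verifying the comment above.
    # At the encoder's 16 kHz rate, an all-zero array of length encoder.sampling_rate
    # spans exactly one second.)
    silent_second = np.zeros(encoder.sampling_rate)
    assert len(silent_second) / encoder.sampling_rate == 1.0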

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)
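    # (Added check, illustrative only: after dividing by its L2 norm the embedding
    # has unit length, matching what encoder.embed_utterance returns.)
    assert np.isclose(np.linalg.norm(embed), 1.0)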
    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that.
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms into a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hz, a target length of 8000 means that the target audio will be cut into chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
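    # (Added arithmetic, for illustration: at 16000 Hz a target of 8000 samples covers
    # 8000 / 16000 = 0.5 s per parallel segment; the target=200 used in this quick test
    # covers only 12.5 ms, which is why its output sounds poor.)
    test_chunk_seconds = 200 / 16000  # 0.0125 s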

    print("All tests passed! You can now synthesize speech.\n\n")


    ## Interactive speech generation
    print("This is a GUI-less example of an interface to SV2TTS. The purpose of this script is to "
          "show how you can interface this project easily with your own. See the source code for "
          "an explanation of what is happening.\n")

    print("Interactive generation loop")
    num_generated = 0
    while True:
        try:
            # Get the reference audio filepath
            message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
                      "wav, m4a, flac, ...):\n"
            in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))

            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
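            # (Added note: librosa.load defaults to sr=22050, so this second path resamples
            # the file to 22.05 kHz and preprocess_wav then resamples it again to the
            # encoder's rate. Passing sr=None would keep the file's native rate instead:)
            # original_wav, sampling_rate = librosa.load(str(in_fpath), sr=None)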
            print("Loaded file successfully")

            # Then we derive the embedding. The speaker encoder exposes many functions and
            # parameters, but these are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")


            ## Generating the spectrogram
            text = input("Write a sentence (around 20 words) to be synthesized:\n")

            # If seed is specified, reset torch seed and force synthesizer reload
            if args.seed is not None:
                torch.manual_seed(args.seed)
                synthesizer = Synthesizer(args.syn_model_fpath)

            # The synthesizer works in batches, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")


            ## Generating the waveform
            print("Synthesizing the waveform:")

            # If seed is specified, reset torch seed and reload vocoder
            if args.seed is not None:
                torch.manual_seed(args.seed)
                vocoder.load_model(args.voc_model_fpath)

            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)


            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut out one second early, so we
            # pad it with one second of silence.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

            # Trim excess silences to compensate for gaps in spectrograms (issue #53)
            generated_wav = encoder.preprocess_wav(generated_wav)

            # Play the audio (non-blocking)
            if not args.no_sound:
                import sounddevice as sd
                try:
                    sd.stop()
                    sd.play(generated_wav, synthesizer.sample_rate)
                except sd.PortAudioError as e:
                    print("\nCaught exception: %s" % repr(e))
                    print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")

            # Save it on the disk
            filename = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % filename)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
demo_toolbox.py
@@ -0,0 +1,37 @@
import argparse
import os
from pathlib import Path

from toolbox import Toolbox
from utils.argutils import print_args
from utils.default_models import ensure_default_models


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Runs the toolbox.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("-d", "--datasets_root", type=Path, help=\
        "Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
        "supported datasets.", default=None)
    parser.add_argument("-m", "--models_dir", type=Path, default="saved_models",
                        help="Directory containing all saved models")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, all inference will be done on CPU")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    args = parser.parse_args()
    arg_dict = vars(args)
    print_args(args, parser)

    # Hide GPUs from PyTorch to force CPU processing
    if arg_dict.pop("cpu"):
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    # Remind the user to download pretrained models if needed
    ensure_default_models(args.models_dir)

    # Launch the toolbox
    Toolbox(**arg_dict)
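Since cpu has already been popped from arg_dict, the Toolbox(**arg_dict) call above expands to the three remaining keywords. A minimal sketch of the equivalent direct call (the keyword names come from the parser above; note that argparse leaves the un-overridden models_dir default as a plain string, since type= is only applied to command-line input):

    from toolbox import Toolbox

    # Equivalent to running the script with no command-line arguments
    Toolbox(datasets_root=None, models_dir="saved_models", seed=None)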
Empty file.
encoder/audio.py
@@ -0,0 +1,117 @@
from scipy.ndimage import binary_dilation  # scipy.ndimage.morphology is removed in recent SciPy
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
import numpy as np
import librosa
import struct

try:
    import webrtcvad
except ImportError:
    warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")
    webrtcvad = None

int16_max = (2 ** 15) - 1  # largest value of a signed 16-bit integer (32767)


def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None,
                   normalize: bool = True,
                   trim_silence: bool = True):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed (keyword arguments are required by librosa >= 0.10)
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    if normalize:
        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    if webrtcvad and trim_silence:
        wav = trim_long_silences(wav)

    return wav
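
# (Added usage sketch; "reference.mp3" is a hypothetical input file.)
# wav = preprocess_wav("reference.mp3")               # load + resample + normalize + trim
# wav = preprocess_wav(raw_audio, source_sr=44100)    # same, for an in-memory array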


def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.
    """
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T
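
# (Added arithmetic, assuming the upstream values of a 25 ms window and a 10 ms step
# in params_data: n_fft = 16000 * 25 / 1000 = 400 samples and
# hop_length = 16000 * 10 / 1000 = 160 samples, so each output frame advances 10 ms.
# After the transpose, the returned shape is (num_frames, mel_n_channels).)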


def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params_data.py.
    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width
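    # (Added worked example: moving_average([0, 1, 1, 1, 0], width=3) gives roughly
    # [0.33, 0.67, 1.0, 0.67, 0.33] -- each output averages a centred window of three
    # VAD flags, thanks to the asymmetric zero padding above.)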

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)  # np.bool was removed in NumPy 1.24

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]


def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
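
The gain arithmetic above treats np.mean(wav ** 2) as signal power. A short worked example (a sketch with illustrative values, not taken from the repository):

    quiet = np.full(16000, 0.01)   # constant amplitude; mean power = 1e-4, i.e. -40 dBFS
    louder = normalize_volume(quiet, target_dBFS=-30, increase_only=True)
    # dBFS_change = -30 - (-40) = +10, so the amplitude gain is 10 ** (10 / 20) ~= 3.16
    # and louder[0] ~= 0.0316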