diff --git a/CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py b/CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py
index ae51f1932..1455de285 100644
--- a/CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py
+++ b/CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py
@@ -107,7 +107,7 @@ def __getitem__(self, idx):
 
         # Audio processing
         wav, _ = librosa.effects.trim(wav, frame_length=self.win_len, hop_length=self.hop_len)
-        
+
         if self.mels_path:
             mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
         else:
diff --git a/CUDA-Optimized/FastSpeech/generate.py b/CUDA-Optimized/FastSpeech/generate.py
index 66114c6c1..c931b910d 100644
--- a/CUDA-Optimized/FastSpeech/generate.py
+++ b/CUDA-Optimized/FastSpeech/generate.py
@@ -28,7 +28,7 @@
 import time
 
 import fire
-import librosa
+import soundfile
 import torch
 
 from fastspeech.data_load import PadDataLoader
@@ -158,7 +158,7 @@ def generate(hparam='infer.yaml',
                 wav = wav[:wav_len]
 
                 path = os.path.join(results_path, text[:MAX_FILESIZE] + ".wav")
-                librosa.output.write_wav(path, wav, hp.sr)
+                soundfile.write(path, wav, hp.sr)
 
             except StopIteration:
                 tprint("Generation has been done.")
diff --git a/CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py b/CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py
index 9cc6f051b..5f8486afe 100644
--- a/CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py
+++ b/CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py
@@ -79,7 +79,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
     # Compute the squared window at the desired length
     win_sq = get_window(window, win_length, fftbins=True)
     win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-    win_sq = librosa_util.pad_center(win_sq, n_fft)
+    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
 
     # Fill the envelope
     for i in range(n_frames):
diff --git a/CUDA-Optimized/FastSpeech/tacotron2/layers.py b/CUDA-Optimized/FastSpeech/tacotron2/layers.py
index cc195919d..2253d4446 100644
--- a/CUDA-Optimized/FastSpeech/tacotron2/layers.py
+++ b/CUDA-Optimized/FastSpeech/tacotron2/layers.py
@@ -31,7 +31,6 @@
 """https://github.com/NVIDIA/tacotron2"""
 
 import torch
-from librosa.filters import mel as librosa_mel_fn
 
 
 class LinearNorm(torch.nn.Module):
diff --git a/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_offline_demo.ipynb b/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_offline_demo.ipynb
index 8d611f560..22fd8a1e4 100644
--- a/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_offline_demo.ipynb
+++ b/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_offline_demo.ipynb
@@ -511,10 +511,10 @@
     "        \"\"\"\n",
     "        samples = self._convert_samples_to_float32(samples)\n",
     "        if target_sr is not None and target_sr != sample_rate:\n",
-    "            samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
+    "            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
     "            sample_rate = target_sr\n",
     "        if trim:\n",
-    "            samples, _ = librosa.effects.trim(samples, trim_db)\n",
+    "            samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
     "        self._samples = samples\n",
     "        self._sample_rate = sample_rate\n",
     "        if self._samples.ndim >= 2:\n",
diff --git a/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_online_demo.ipynb b/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_online_demo.ipynb
index caa0a3206..a2ff8af47 100644
--- a/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_online_demo.ipynb
+++ b/Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_online_demo.ipynb
@@ -640,10 +640,10 @@
     "        \"\"\"\n",
     "        samples = self._convert_samples_to_float32(samples)\n",
     "        if target_sr is not None and target_sr != sample_rate:\n",
-    "            samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
+    "            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
     "            sample_rate = target_sr\n",
     "        if trim:\n",
-    "            samples, _ = librosa.effects.trim(samples, trim_db)\n",
+    "            samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
     "        self._samples = samples\n",
     "        self._sample_rate = sample_rate\n",
     "        if self._samples.ndim >= 2:\n",
diff --git a/PyTorch/SpeechRecognition/Jasper/requirements.txt b/PyTorch/SpeechRecognition/Jasper/requirements.txt
index cc1b1d5cb..3b58660b8 100755
--- a/PyTorch/SpeechRecognition/Jasper/requirements.txt
+++ b/PyTorch/SpeechRecognition/Jasper/requirements.txt
@@ -1,6 +1,6 @@
 inflect==5.3.0
 ipdb
-librosa==0.9.0
+librosa>=0.9.0
 pandas==1.5.2
 pyyaml>=5.4
 soundfile
diff --git a/PyTorch/SpeechRecognition/QuartzNet/requirements.txt b/PyTorch/SpeechRecognition/QuartzNet/requirements.txt
index cc1b1d5cb..3b58660b8 100644
--- a/PyTorch/SpeechRecognition/QuartzNet/requirements.txt
+++ b/PyTorch/SpeechRecognition/QuartzNet/requirements.txt
@@ -1,6 +1,6 @@
 inflect==5.3.0
 ipdb
-librosa==0.9.0
+librosa>=0.9.0
 pandas==1.5.2
 pyyaml>=5.4
 soundfile
diff --git a/PyTorch/SpeechRecognition/wav2vec2/requirements.txt b/PyTorch/SpeechRecognition/wav2vec2/requirements.txt
index d8091ebbe..4c793a688 100644
--- a/PyTorch/SpeechRecognition/wav2vec2/requirements.txt
+++ b/PyTorch/SpeechRecognition/wav2vec2/requirements.txt
@@ -1,5 +1,5 @@
 editdistance==0.6.0
-librosa==0.10.1
+librosa>=0.10.1
 omegaconf==2.0.6 # optional for handling certain Fairseq ckpts
 pyarrow==6.0.1
 soundfile==0.12.1
diff --git a/PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py b/PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py
index ac77c7bd6..d8f542e45 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py
+++ b/PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py
@@ -69,7 +69,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
     global mel_basis, hann_window
    fmax_key = f'{fmax}_{y.device}'
     if fmax_key not in mel_basis:
-        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
+        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
         mel_basis[fmax_key] = torch.from_numpy(mel).float().to(y.device)
         hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
 
diff --git a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
index 2066bde65..bfe3c964b 100644
--- a/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
+++ b/PyTorch/SpeechSynthesis/FastPitch/requirements.txt
@@ -1,5 +1,5 @@
 inflect
-librosa==0.9.0
+librosa>=0.9.0
 matplotlib
 numpy
 pynvml==11.0.0
diff --git a/PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt b/PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt
index 8b0115ec8..20dfc5c1f 100644
--- a/PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt
+++ b/PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt
@@ -1,5 +1,5 @@
 inflect
-librosa==0.9.0
+librosa>=0.9.0
 numpy
 pandas
 pynvml==11.0.0
diff --git a/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/client/speech_ai_demo/utils/jasper/speech_utils.py b/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/client/speech_ai_demo/utils/jasper/speech_utils.py
index c746187da..2821c84ba 100644
--- a/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/client/speech_ai_demo/utils/jasper/speech_utils.py
+++ b/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/client/speech_ai_demo/utils/jasper/speech_utils.py
@@ -383,10 +383,10 @@ def __init__(self, samples, sample_rate, target_sr=16000, trim=False,
         """
         samples = self._convert_samples_to_float32(samples)
         if target_sr is not None and target_sr != sample_rate:
-            samples = librosa.core.resample(samples, sample_rate, target_sr)
+            samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
             sample_rate = target_sr
         if trim:
-            samples, _ = librosa.effects.trim(samples, trim_db)
+            samples, _ = librosa.effects.trim(samples, top_db=trim_db)
         self._samples = samples
         self._sample_rate = sample_rate
         if self._samples.ndim >= 2:
diff --git a/PyTorch/SpeechSynthesis/Tacotron2/trtis_cpp/src/trt/requirements.txt b/PyTorch/SpeechSynthesis/Tacotron2/trtis_cpp/src/trt/requirements.txt
index 6859d0219..5b2eed65d 100644
--- a/PyTorch/SpeechSynthesis/Tacotron2/trtis_cpp/src/trt/requirements.txt
+++ b/PyTorch/SpeechSynthesis/Tacotron2/trtis_cpp/src/trt/requirements.txt
@@ -1,4 +1,4 @@
 torch==1.3.0
 onnx==1.5.0
 scipy==1.3.1
-librosa==0.7.0
+librosa
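
The hunks above all track the same upstream change: librosa 0.10 made most signal-processing arguments keyword-only and removed the librosa.output module (gone since 0.8), so positional calls such as resample(samples, sample_rate, target_sr) or trim(samples, trim_db) now raise TypeError. Below is a minimal sketch of the migrated call forms, assuming librosa >= 0.10 and soundfile are installed; the sample rates, FFT sizes, and output path are illustrative values, not taken from the patch.

import librosa
import numpy as np
import soundfile

orig_sr, target_sr = 22050, 16000
wav = librosa.tone(440, sr=orig_sr, duration=1.0)  # stand-in test signal

# orig_sr / target_sr are keyword-only in librosa >= 0.10
wav = librosa.resample(wav, orig_sr=orig_sr, target_sr=target_sr)

# top_db is keyword-only; positional trim(samples, trim_db) no longer works
wav, _ = librosa.effects.trim(wav, top_db=60)

# librosa.filters.mel takes keyword-only arguments as well
mel_basis = librosa.filters.mel(sr=target_sr, n_fft=1024, n_mels=80,
                                fmin=0.0, fmax=8000.0)

# pad_center requires size= as a keyword
win = librosa.util.pad_center(np.hanning(800), size=1024)

# librosa.output.write_wav was removed; soundfile.write replaces it
soundfile.write("out.wav", wav, target_sr)

Relaxing the == pins to >= in the requirements files keeps these projects installable as old librosa wheels disappear, and the keyword-argument call sites remain valid on 0.9 as well, since those parameters already existed as keywords there.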