Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

https://github.com/NVIDIA/DeepLearningExamples/issues/1369 Updated De… #1370

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def __getitem__(self, idx):

# Audio processing
wav, _ = librosa.effects.trim(wav, frame_length=self.win_len, hop_length=self.hop_len)

if self.mels_path:
mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
else:
Expand Down
4 changes: 2 additions & 2 deletions CUDA-Optimized/FastSpeech/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import time

import fire
-import librosa
+import soundfile
import torch

from fastspeech.data_load import PadDataLoader
Expand Down Expand Up @@ -158,7 +158,7 @@ def generate(hparam='infer.yaml',
wav = wav[:wav_len]

path = os.path.join(results_path, text[:MAX_FILESIZE] + ".wav")
-librosa.output.write_wav(path, wav, hp.sr)
+soundfile.write(path, wav, hp.sr)

except StopIteration:
tprint("Generation has been done.")
Expand Down
2 changes: 1 addition & 1 deletion CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
# Compute the squared window at the desired length
win_sq = get_window(window, win_length, fftbins=True)
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
-win_sq = librosa_util.pad_center(win_sq, n_fft)
+win_sq = librosa_util.pad_center(win_sq, size=n_fft)

# Fill the envelope
for i in range(n_frames):
Expand Down
1 change: 0 additions & 1 deletion CUDA-Optimized/FastSpeech/tacotron2/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
"""https://github.com/NVIDIA/tacotron2"""

import torch
-from librosa.filters import mel as librosa_mel_fn


class LinearNorm(torch.nn.Module):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,10 +511,10 @@
" \"\"\"\n",
" samples = self._convert_samples_to_float32(samples)\n",
" if target_sr is not None and target_sr != sample_rate:\n",
" samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
" samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
" sample_rate = target_sr\n",
" if trim:\n",
" samples, _ = librosa.effects.trim(samples, trim_db)\n",
" samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
" self._samples = samples\n",
" self._sample_rate = sample_rate\n",
" if self._samples.ndim >= 2:\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -640,10 +640,10 @@
" \"\"\"\n",
" samples = self._convert_samples_to_float32(samples)\n",
" if target_sr is not None and target_sr != sample_rate:\n",
" samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
" samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
" sample_rate = target_sr\n",
" if trim:\n",
" samples, _ = librosa.effects.trim(samples, trim_db)\n",
" samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
" self._samples = samples\n",
" self._sample_rate = sample_rate\n",
" if self._samples.ndim >= 2:\n",
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechRecognition/Jasper/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
inflect==5.3.0
ipdb
-librosa==0.9.0
+librosa>=0.9.0
pandas==1.5.2
pyyaml>=5.4
soundfile
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechRecognition/QuartzNet/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
inflect==5.3.0
ipdb
-librosa==0.9.0
+librosa>=0.9.0
pandas==1.5.2
pyyaml>=5.4
soundfile
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechRecognition/wav2vec2/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
editdistance==0.6.0
-librosa==0.10.1
+librosa>=0.10.1
omegaconf==2.0.6 # optional for handling certain Fairseq ckpts
pyarrow==6.0.1
soundfile==0.12.1
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
global mel_basis, hann_window
fmax_key = f'{fmax}_{y.device}'
if fmax_key not in mel_basis:
-mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
+mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
mel_basis[fmax_key] = torch.from_numpy(mel).float().to(y.device)
hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/FastPitch/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
inflect
-librosa==0.9.0
+librosa>=0.9.0
matplotlib
numpy
pynvml==11.0.0
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
inflect
-librosa==0.9.0
+librosa>=0.9.0
numpy
pandas
pynvml==11.0.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -383,10 +383,10 @@ def __init__(self, samples, sample_rate, target_sr=16000, trim=False,
"""
samples = self._convert_samples_to_float32(samples)
if target_sr is not None and target_sr != sample_rate:
-samples = librosa.core.resample(samples, sample_rate, target_sr)
+samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
sample_rate = target_sr
if trim:
-samples, _ = librosa.effects.trim(samples, trim_db)
+samples, _ = librosa.effects.trim(samples, top_db=trim_db)
self._samples = samples
self._sample_rate = sample_rate
if self._samples.ndim >= 2:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
torch==1.3.0
onnx==1.5.0
scipy==1.3.1
-librosa==0.7.0
+librosa