forked from hujinsen/StarGAN-Voice-Conversion
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
executable file
·207 lines (165 loc) · 6.81 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import librosa
import numpy as np
import os
import pyworld
import pyworld as pw
import glob
from utility import *
import argparse
FEATURE_DIM = 36
SAMPLE_RATE = 16000
FRAMES = 512
FFTSIZE = 1024
SPEAKERS_NUM = 4 # in our experiment, we use four speakers
EPSILON = 1e-10
MODEL_NAME = 'starganvc_model'
def load_wavs(dataset: str, sr):
'''
data dict contains all audios file path
resdict contains all wav files
'''
data = {}
with os.scandir(dataset) as it:
for entry in it:
if entry.is_dir():
data[entry.name] = []
# print(entry.name, entry.path)
with os.scandir(entry.path) as it_f:
for onefile in it_f:
if onefile.is_file():
# print(onefile.path)
data[entry.name].append(onefile.path)
print(f'loaded keys: {data.keys()}')
# data like {TM1:[xx,xx,xxx,xxx]}
resdict = {}
cnt = 0
for key, value in data.items():
resdict[key] = {}
for one_file in value:
filename = os.path.normpath(one_file).split(os.sep)[-1].split('.')[0] # like 100061
newkey = f'{filename}'
wav, _ = librosa.load(one_file, sr=sr, mono=True, dtype=np.float64)
resdict[key][newkey] = wav
# resdict[key].append(temp_dict) #like TM1:{100062:[xxxxx], .... }
print('.', end='')
cnt += 1
print(f'\nTotal {cnt} aduio files!')
return resdict
def wav_to_mcep_file(dataset: str, sr=16000, ispad: bool = False, processed_filepath: str = './data/processed'):
'''convert wavs to mcep feature using image repr'''
# if no processed_filepath, create it ,or delete all npz files
if not os.path.exists(processed_filepath):
os.makedirs(processed_filepath)
else:
filelist = glob.glob(os.path.join(processed_filepath, "*.npy"))
for f in filelist:
os.remove(f)
allwavs_cnt = len(glob.glob(f'{dataset}/*/*.wav'))
# allwavs_cnt = allwavs_cnt//4*3 * 12+200 #about this number not precise
print(f'Total {allwavs_cnt} audio files!')
d = load_wavs(dataset, sr)
cnt = 1 #
for one_speaker in d.keys():
for audio_name, audio_wav in d[one_speaker].items():
# cal source audio feature
audio_mcep_dict = cal_mcep(
audio_wav, fs=sr, ispad=ispad, frame_period=0.005, dim=FEATURE_DIM)
newname = f'{one_speaker}-{audio_name}'
# save the dict as npz
file_path_z = f'{processed_filepath}/{newname}'
print(f'save file: {file_path_z}')
np.savez(file_path_z, audio_mcep_dict)
# save every 36*FRAMES blocks
print(f'audio mcep shape {audio_mcep_dict["coded_sp"].shape}')
# TODO step may be FRAMES//2
for start_idx in range(0, audio_mcep_dict["coded_sp"].shape[1] - FRAMES + 1, FRAMES):
one_audio_seg = audio_mcep_dict["coded_sp"][:,
start_idx:start_idx + FRAMES]
if one_audio_seg.shape[1] == FRAMES:
temp_name = f'{newname}_{start_idx}'
filePath = f'{processed_filepath}/{temp_name}'
print(f'[{cnt}:{allwavs_cnt}]svaing file: {filePath}.npy')
np.save(filePath, one_audio_seg)
cnt += 1
def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005, dim=FEATURE_DIM, fft_size=FFTSIZE):
'''cal mcep given wav singnal
the frame_period used only for pad_wav_to_get_fixed_frames
'''
if ispad:
wav, pad_length = pad_wav_to_get_fixed_frames(
wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)
else:
wav = wav_ori
# Harvest F0 extraction algorithm.
f0, timeaxis = pyworld.harvest(wav, fs)
# CheapTrick harmonic spectral envelope estimation algorithm.
sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=fft_size)
# D4C aperiodicity estimation algorithm.
ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=fft_size)
# feature reduction nxdim
coded_sp = pyworld.code_spectral_envelope(sp, fs, dim)
# log
coded_sp = coded_sp.T # dim x n
res = {
'f0': f0, # n
'ap': ap, # n*fftsize//2+1
'sp': sp, # n*fftsize//2+1
'coded_sp': coded_sp, # dim * n
}
return res
def pad_wav_to_get_fixed_frames(x: np.ndarray, frames: int = 128, frame_period: float = 0.005, sr: int = 16000):
# one frame's points
frame_length = frame_period * sr
# frames points
frames_points = frames * frame_length
wav_len = len(x)
# pad amount
pieces = wav_len // frames_points
need_pad = 0
if wav_len % frames_points != 0:
# can't devide need pad
need_pad = int((pieces + 1) * frames_points - wav_len)
afterpad_len = wav_len + need_pad
# print(f'need pad: {need_pad}, after pad: {afterpad_len}')
# padding process
tempx = x.tolist()
if need_pad <= len(x):
tempx.extend(x[:need_pad])
else:
temp1, temp2 = need_pad // len(x), need_pad / len(x)
tempx = tempx * (temp1 + 1)
samll_pad_len = int(np.ceil((temp2 - temp1) * len(x)))
tempx.extend(x[:samll_pad_len])
diff = 0
if afterpad_len != len(tempx):
diff = afterpad_len - len(tempx)
if diff > 0:
tempx.extend(tempx[:diff])
elif diff < 0:
tempx = tempx[:diff]
# print(f'padding length: {len(x)}-->length: {len(tempx)}')
# remove last point for calculate convience:the frame length are 128*(some integer).
tempx = tempx[:-1]
return np.asarray(tempx, dtype=np.float), need_pad
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert the wav waveform to mel-cepstral coefficients(MCCs)\
and calculate the speech statistical characteristics')
input_dir = './data/fourspeakers'
output_dir = './data/processed'
ispad = True
parser.add_argument('--input_dir', type=str,
help='the direcotry contains data need to be processed', default=input_dir)
parser.add_argument('--output_dir', type=str,
help='the directory stores the processed data', default=output_dir)
parser.add_argument(
'--ispad', type=bool, help='whether to pad the wavs to get fixed length MCEP', default=ispad)
argv = parser.parse_args()
input_dir = argv.input_dir
output_dir = argv.output_dir
ispad = argv.ispad
wav_to_mcep_file(input_dir, SAMPLE_RATE, ispad=ispad,
processed_filepath=output_dir)
# input_dir is train dataset. we need to calculate and save the speech\
# statistical characteristics for each speaker.
generator = GenerateStatics(output_dir)
generator.generate_stats()