-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_collection.py
163 lines (141 loc) · 6.98 KB
/
data_collection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import tarfile
import requests
import librosa
import soundfile as sf
import logging
# Setup logging
# basicConfig opens the log file immediately, so its directory has to exist
# first; otherwise importing this module on a fresh checkout raises
# FileNotFoundError before create_directories() ever gets a chance to run.
os.makedirs('RealTime_Denoise_App/logs', exist_ok=True)
logging.basicConfig(
    filename='RealTime_Denoise_App/logs/data_collection.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
)

# Directory setup
BASE_DIR = 'RealTime_Denoise_App/datasets'
LIBRISPEECH_DIR = os.path.join(BASE_DIR, 'LibriSpeech')
URBANSOUND_DIR = os.path.join(BASE_DIR, 'UrbanSound8K')

# URLs for datasets
LIBRISPEECH_URLS = [
    "http://www.openslr.org/resources/12/train-clean-100.tar.gz",
    "http://www.openslr.org/resources/12/dev-clean.tar.gz",
]
URBANSOUND8K_URL = "https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz"
# Create directories if they don't exist
def create_directories():
    """Make sure the two dataset folders and the log folder exist.

    Directories that are already present are left untouched.
    """
    required_dirs = (
        LIBRISPEECH_DIR,
        URBANSOUND_DIR,
        'RealTime_Denoise_App/logs',
    )
    for path in required_dirs:
        os.makedirs(path, exist_ok=True)
# Function to check if the dataset has been extracted
def is_extracted(directory, filename):
    """Tell whether the extraction marker for ``filename`` exists in ``directory``.

    The marker is a hidden file named ``.<filename>.extracted``.

    Returns:
        True if the marker path exists, False otherwise.
    """
    marker_name = f".{filename}.extracted"
    return os.path.exists(os.path.join(directory, marker_name))
# Function to mark a dataset as extracted
def mark_as_extracted(directory, filename):
    """Leave an empty marker file ``.<filename>.extracted`` in ``directory``.

    An existing marker is truncated to zero length.
    """
    marker_path = os.path.join(directory, f".{filename}.extracted")
    # Opening in write mode alone creates (or empties) the file.
    with open(marker_path, 'w'):
        pass
# Function to check if all .flac files have corresponding .wav files
def check_flac_wav_conversion(directory):
    """Check that every ``.flac`` file under ``directory`` has a sibling ``.wav``.

    The tree is walked recursively. Each FLAC file without a WAV of the same
    base name in the same folder is printed and logged as a warning.

    Args:
        directory: Root folder to scan.

    Returns:
        True when no FLAC file is missing its WAV (also when there are no
        FLAC files at all), False otherwise.
    """
    all_converted = True
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('.flac'):
                continue
            flac_path = os.path.join(root, file)
            # Swap only the extension. str.replace would also rewrite any
            # '.flac' that appears in a directory name or earlier in the
            # file name, pointing the check at the wrong WAV path.
            wav_path = os.path.splitext(flac_path)[0] + '.wav'
            if not os.path.exists(wav_path):
                all_converted = False
                print(f"Missing WAV file for {flac_path}")
                logging.warning(f"Missing WAV file for {flac_path}")
    return all_converted
# Function to delete all .flac files if corresponding .wav files exist
def delete_flac_files(directory):
    """Delete every ``.flac`` file under ``directory`` that already has a ``.wav`` twin.

    The tree is walked recursively. A FLAC file is removed only when a WAV
    with the same base name exists in the same folder; FLAC files without a
    WAV are left in place. Deletion failures are printed and logged, and the
    walk continues with the next file.

    Args:
        directory: Root folder to scan.
    """
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.endswith('.flac'):
                continue
            flac_path = os.path.join(root, file)
            # Swap only the extension. str.replace would also rewrite any
            # '.flac' that appears in a directory name or earlier in the
            # file name, so the WAV twin would never be found.
            wav_path = os.path.splitext(flac_path)[0] + '.wav'
            if not os.path.exists(wav_path):
                continue
            try:
                os.remove(flac_path)
            except OSError as e:
                # Best effort: report the failure and keep going.
                print(f"Error deleting {flac_path}: {e}")
                logging.error(f"Error deleting {flac_path}: {e}")
            else:
                print(f"Deleted {flac_path}")
                logging.info(f"Deleted {flac_path}")
# Download and extract LibriSpeech dataset, convert .flac to .wav
def download_librispeech():
    """Download each LibriSpeech archive, unpack its FLAC files and convert them to WAV.

    For every URL in ``LIBRISPEECH_URLS``:

    1. If the expected extract directory and the extraction marker both
       exist, report any FLAC file still missing a WAV, delete the FLAC
       files when all of them are converted, and move on to the next URL.
    2. Download the tarball into ``LIBRISPEECH_DIR`` unless a file of that
       name is already there.
    3. Extract only the ``.flac`` members, load each one at 16 kHz, write a
       WAV next to it and remove the FLAC.
    4. Write the extraction marker.

    A failed conversion is printed and logged; processing continues with the
    next member.

    Raises:
        requests.RequestException: the download failed or the server
            answered with an HTTP error status.
    """
    for url in LIBRISPEECH_URLS:
        filename = url.split('/')[-1]
        filepath = os.path.join(LIBRISPEECH_DIR, filename)
        # NOTE(review): this assumes the archive unpacks directly into
        # LIBRISPEECH_DIR/<archive name>. If the tarball nests its members
        # under an extra top-level folder, this path never exists and the
        # skip branch below is never taken -- verify against the archive.
        extract_dir = os.path.join(LIBRISPEECH_DIR, filename.replace('.tar.gz', ''))

        # Check if the dataset is already extracted
        if os.path.exists(extract_dir) and is_extracted(LIBRISPEECH_DIR, filename):
            print(f"{filename} already extracted. Checking for .flac to .wav conversions.")
            logging.info(f"{filename} already extracted. Checking for .flac to .wav conversions.")
            # Only drop the FLAC files once every one of them has a WAV twin.
            if check_flac_wav_conversion(extract_dir):
                delete_flac_files(extract_dir)
            # NOTE(review): the original indentation was lost; this assumes an
            # already-extracted archive is always skipped, even when some
            # conversions are missing -- confirm.
            continue

        # Download the dataset
        if not os.path.exists(filepath):
            print(f"Downloading {filename}...")
            logging.info(f"Downloading {filename}...")
            # Write to a temporary name and rename on success, so an
            # interrupted download is not mistaken for a complete archive
            # on the next run.
            partial_path = filepath + '.part'
            with requests.get(url, stream=True, timeout=60) as response:
                # Fail loudly instead of saving an HTTP error page as the tarball.
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    # Stream in chunks; response.content would hold the whole
                    # archive in memory.
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial_path, filepath)
            print(f"{filename} downloaded successfully.")
            logging.info(f"{filename} downloaded successfully.")

        # Extract and convert .flac to .wav
        print(f"Extracting {filename} and converting .flac files to .wav...")
        logging.info(f"Extracting {filename} and converting .flac files to .wav...")
        with tarfile.open(filepath, 'r:gz') as tar:
            for member in tar.getmembers():
                if not member.name.endswith('.flac'):
                    continue
                # NOTE(review): member paths come from the downloaded archive;
                # consider an extraction filter where the Python version
                # supports one.
                tar.extract(member, LIBRISPEECH_DIR)
                flac_path = os.path.join(LIBRISPEECH_DIR, member.name)
                # Swap only the extension, not every '.flac' in the path.
                wav_path = os.path.splitext(flac_path)[0] + '.wav'
                try:
                    # Load the FLAC file and save as WAV
                    audio, sr = librosa.load(flac_path, sr=16000)
                    sf.write(wav_path, audio, 16000)
                    print(f"Converted {flac_path} to {wav_path}")
                    logging.info(f"Converted {flac_path} to {wav_path}")
                    # Remove the original FLAC file
                    os.remove(flac_path)
                    print(f"Removed {flac_path}")
                    logging.info(f"Removed {flac_path}")
                except Exception as e:
                    # Deliberate best effort: one bad file must not abort the batch.
                    print(f"Error converting {flac_path} to WAV: {e}")
                    logging.error(f"Error converting {flac_path} to WAV: {e}")

        mark_as_extracted(LIBRISPEECH_DIR, filename)
        print(f"{filename} extracted and processed successfully.")
        logging.info(f"{filename} extracted and processed successfully.")
# Download and extract UrbanSound8K dataset
def download_urbansound8k():
    """Download the UrbanSound8K archive and unpack it into ``URBANSOUND_DIR``.

    The tarball is downloaded unless a file of that name is already in
    ``URBANSOUND_DIR``. It is then extracted in full unless the extraction
    marker exists, after which the marker is written.

    Raises:
        requests.RequestException: the download failed or the server
            answered with an HTTP error status.
    """
    filename = URBANSOUND8K_URL.split('/')[-1]
    filepath = os.path.join(URBANSOUND_DIR, filename)

    if not os.path.exists(filepath):
        print("Downloading UrbanSound8K dataset...")
        logging.info("Downloading UrbanSound8K dataset...")
        # Write to a temporary name and rename on success, so an interrupted
        # download is not mistaken for a complete archive on the next run.
        partial_path = filepath + '.part'
        with requests.get(URBANSOUND8K_URL, stream=True, timeout=60) as response:
            # Fail loudly instead of saving an HTTP error page as the tarball.
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                # Stream in chunks; response.content would hold the whole
                # archive in memory.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, filepath)
        print("UrbanSound8K dataset downloaded successfully.")
        logging.info("UrbanSound8K dataset downloaded successfully.")

    if not is_extracted(URBANSOUND_DIR, filename):
        print("Extracting UrbanSound8K dataset...")
        logging.info("Extracting UrbanSound8K dataset...")
        with tarfile.open(filepath, 'r:gz') as tar:
            # NOTE(review): member paths come from the downloaded archive;
            # consider an extraction filter where the Python version
            # supports one.
            tar.extractall(URBANSOUND_DIR)
        mark_as_extracted(URBANSOUND_DIR, filename)
        print("UrbanSound8K dataset extracted successfully.")
        logging.info("UrbanSound8K dataset extracted successfully.")
    else:
        print("UrbanSound8K dataset already extracted. Skipping extraction.")
        logging.info("UrbanSound8K dataset already extracted. Skipping extraction.")
# Main function to coordinate the process
def main():
    """Run the whole pipeline: prepare folders, then fetch both datasets.

    LibriSpeech is processed first, UrbanSound8K second; a completion
    message is printed and logged at the end.
    """
    create_directories()
    for fetch_dataset in (download_librispeech, download_urbansound8k):
        fetch_dataset()
    done_message = "Data collection and processing completed successfully."
    print(done_message)
    logging.info(done_message)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()