forked from LAION-AI/audio-dataset
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_MACS_marianna.py
96 lines (83 loc) · 2.99 KB
/
preprocess_MACS_marianna.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
'''
Code for preprocessing the MACS dataset.
Prerequisite: download MACS.yaml from https://zenodo.org/record/5114771#.yq4kbnbmlb1
author:
Marianna 90%
Yuchen Hui 10%
'''
from p_tqdm import p_map
import os
import pandas as pd
import multiprocessing as mp
import time
import sys
import yaml
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
from utils.file_utils import json_load, json_dump
from utils.audio_utils import audio_to_flac
from utils.dataset_parameters import AUDIO_SAVE_SAMPLE_RATE
def process_data(meta, output_dir, audio_dir):
    """Convert one slice of the MACS metadata into numbered JSON/FLAC pairs.

    For every row of ``meta`` this writes ``{output_dir}/{n}.json`` (captions,
    tags and the original annotation record) and ``{output_dir}/{n}.flac``
    (the source audio passed through ``audio_to_flac`` at
    ``AUDIO_SAVE_SAMPLE_RATE``), where ``n`` counts up from 0 in row order.

    Args:
        meta: DataFrame whose rows unpack to ``(filename, annotations)``.
            Each annotation is a mapping whose values unpack, in order, to
            ``(annotator_id, sentence, tags)``.
        output_dir: Directory that receives the output files; it must
            already exist.
        audio_dir: Directory containing the audio files named by ``filename``.
    """
    # The original called ``tqdm`` without importing it (only ``p_map`` is
    # imported from p_tqdm), so the function died with NameError. tqdm is a
    # dependency of p_tqdm; fall back to plain iteration if it is unavailable.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    rows = tqdm(meta.iterrows(), total=len(meta))
    for file_id, (_, row) in enumerate(rows):
        filename, annotations = row.values
        audio_path = f'{audio_dir}/{filename}'

        texts, tags = [], []
        for annotation in annotations:
            # Use a distinct name for the per-annotation tags: reusing ``tags``
            # here used to overwrite the accumulator and double the
            # annotation's own list in place.
            _annotator_id, sentence, annotation_tags = annotation.values()
            tags.extend(annotation_tags or ())
            texts.append(sentence)

        # De-duplicate while keeping first-seen order so the JSON output is
        # reproducible across runs (set ordering of strings is not).
        tags = list(dict.fromkeys(tags))
        texts = list(dict.fromkeys(texts))

        original_data = {
            'title': 'MACS ',
            'license': 'Other (Non-Commercial)',
            'filename': filename,
            'annotations': annotations,
        }
        audio_json = {
            'text': texts,
            'tag': tags,
            'original_data': original_data,
        }

        audio_json_save_path = f'{output_dir}/{file_id}.json'
        audio_save_path = f'{output_dir}/{file_id}.flac'
        json_dump(audio_json, audio_json_save_path)
        audio_to_flac(audio_path, audio_save_path, AUDIO_SAVE_SAMPLE_RATE)
def process(dataset_name, root_dir='/home/ubuntu/marianna/clap', num_process=5):
    """Preprocess the MACS dataset in parallel worker processes.

    Loads the ``files`` list from ``MACS.yaml``, splits it into
    ``num_process`` contiguous slices and runs ``process_data`` on each slice
    in its own process. Slice ``i`` writes into ``{output_dir}/{i}``, and each
    worker numbers its files from 0 inside its own sub-directory.

    Args:
        dataset_name: Name of the sub-directory created under
            ``{root_dir}/processed_datasets``.
        root_dir: Directory holding ``raw_datasets/MACS`` and receiving
            ``processed_datasets``. Defaults to the original hard-coded path.
        num_process: Number of worker processes / output shards.

    Returns:
        The output directory path, ``{root_dir}/processed_datasets/{dataset_name}``.

    Raises:
        ValueError: If ``num_process`` is smaller than 1.
    """
    if num_process < 1:
        raise ValueError('num_process must be at least 1')

    output_dir = f'{root_dir}/processed_datasets/{dataset_name}'
    audio_dir = f'{root_dir}/raw_datasets/MACS/AUDIO'
    meta_path = f'{root_dir}/raw_datasets/MACS/meta/MACS.yaml'

    with open(meta_path, encoding='utf-8') as f:
        meta = pd.DataFrame(yaml.safe_load(f)['files'])
    N = len(meta)

    # One output shard per worker. The original names ended in a stray space
    # (f'{output_dir}/{i} '), which produced directories like "0 ".
    out_dirs = [f'{output_dir}/{i}' for i in range(num_process)]
    for out_dir in out_dirs:
        os.makedirs(out_dir, exist_ok=True)

    # Contiguous slices of equal size; the last one also takes the remainder.
    # The original int(N / num_process) bounds dropped the final
    # N % num_process rows.
    chunk = N // num_process
    rngs = [
        (i * chunk, N if i == num_process - 1 else (i + 1) * chunk)
        for i in range(num_process)
    ]
    print(rngs)

    s = time.time()
    processes = []
    for (start, end), out_dir in zip(rngs, out_dirs):
        p = mp.Process(target=process_data, args=[
            meta.iloc[start:end], out_dir, audio_dir])
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    e = time.time()

    # A worker that raised exits non-zero; report it instead of staying silent.
    failed = [i for i, p in enumerate(processes) if p.exitcode != 0]
    if failed:
        print(f'Warning: worker(s) {failed} exited with a non-zero exit code')

    print(f'Processed in {round(e-s, 2)} seconds')
    return output_dir
if __name__ == "__main__":
    # Script entry point: preprocess the MACS dataset with default settings.
    # merge_dirs and unzip_file are imported here but not called in this block.
    from utils.merge_dirs import merge_dirs
    from utils.unzip import unzip_file

    dataset_name = 'MACS'
    process(dataset_name=dataset_name)