diff --git a/.gitignore b/.gitignore index 026b7af..1d36432 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ # data files *.csv -*.json *.xls *.xlsx *.pkl diff --git a/amt/audio.py b/amt/audio.py index 6c37f3f..447822e 100644 --- a/amt/audio.py +++ b/amt/audio.py @@ -191,15 +191,15 @@ def __init__( max_snr: int = 50, max_dist_gain: int = 25, min_dist_gain: int = 0, - noise_ratio: float = 0.95, - reverb_ratio: float = 0.95, + noise_ratio: float = 0.75, + reverb_ratio: float = 0.75, applause_ratio: float = 0.01, bandpass_ratio: float = 0.15, distort_ratio: float = 0.15, reduce_ratio: float = 0.01, - detune_ratio: float = 0.1, - detune_max_shift: float = 0.15, - spec_aug_ratio: float = 0.5, + detune_ratio: float = 0.0, + detune_max_shift: float = 0.0, + spec_aug_ratio: float = 0.9, ): super().__init__() self.tokenizer = AmtTokenizer() @@ -223,7 +223,10 @@ def __init__( self.detune_ratio = detune_ratio self.detune_max_shift = detune_max_shift self.spec_aug_ratio = spec_aug_ratio - self.reduction_resample_rate = 6000 # Hardcoded? + + self.time_mask_param = 2500 + self.freq_mask_param = 15 + self.reduction_resample_rate = 6000 # Audio aug impulse_paths = self._get_paths( @@ -263,10 +266,10 @@ def __init__( ) self.spec_aug = torch.nn.Sequential( torchaudio.transforms.FrequencyMasking( - freq_mask_param=15, iid_masks=True + freq_mask_param=self.freq_mask_param, iid_masks=True ), torchaudio.transforms.TimeMasking( - time_mask_param=1000, iid_masks=True + time_mask_param=self.time_mask_param, iid_masks=True ), ) @@ -281,6 +284,8 @@ def get_params(self): "detune_ratio": self.detune_ratio, "detune_max_shift": self.detune_max_shift, "spec_aug_ratio": self.spec_aug_ratio, + "time_mask_param": self.time_mask_param, + "freq_mask_param": self.freq_mask_param, } def _get_paths(self, dir_path): diff --git a/amt/data.py b/amt/data.py index 377b760..ad8660f 100644 --- a/amt/data.py +++ b/amt/data.py @@ -1,6 +1,8 @@ import mmap import os import io +import random +import shlex import base64 import shutil import orjson @@ -16,7 +18,14 @@ from amt.audio import pad_or_trim -# Occasionally the worker util goes to 0 for some reason, debug this +def _check_onset_threshold(seq: list, onset: int): + for tok_1, tok_2 in zip(seq, seq[1:]): + if isinstance(tok_1, tuple) and tok_1[0] in ("on", "off"): + _onset = tok_2[1] + if _onset > onset: + return True + + return False def get_wav_mid_segments( @@ -24,6 +33,7 @@ def get_wav_mid_segments( mid_path: str = "", return_json: bool = False, stride_factor: int | None = None, + pad_last=False, ): """This function yields tuples of matched log mel spectrograms and tokenized sequences (np.array, list). If it is given only an audio path @@ -61,10 +71,12 @@ def get_wav_mid_segments( # Create features total_samples = wav.shape[-1] + pad_factor = 2 if pad_last is True else 1 res = [] for idx in range( 0, - total_samples - (num_samples - num_samples // stride_factor), + total_samples + - (num_samples - pad_factor * (num_samples // stride_factor)), num_samples // stride_factor, ): audio_feature = pad_or_trim(wav[idx:], length=num_samples) @@ -75,6 +87,12 @@ def get_wav_mid_segments( end_ms=(idx + num_samples) / samples_per_ms, max_pedal_len_ms=10000, ) + + # Hardcoded to 2.5s + if _check_onset_threshold(mid_feature, 2500) is False: + print("No note messages after 2.5s - skipping") + continue + else: mid_feature = [] @@ -86,6 +104,56 @@ def get_wav_mid_segments( return res +def pianoteq_cmd_fn(mid_path: str, wav_path: str): + presets = [ + "C. Bechstein", + "C. 
Bechstein Close Mic", + "C. Bechstein Under Lid", + "C. Bechstein 440", + "C. Bechstein Recording", + "C. Bechstein Werckmeister III", + "C. Bechstein Neidhardt III", + "C. Bechstein mesotonic", + "C. Bechstein well tempered", + "HB Steinway D Blues", + "HB Steinway D Pop", + "HB Steinway D New Age", + "HB Steinway D Prelude", + "HB Steinway D Felt I", + "HB Steinway D Felt II", + "HB Steinway Model D", + "HB Steinway D Classical Recording", + "HB Steinway D Jazz Recording", + "HB Steinway D Chamber Recording", + "HB Steinway D Studio Recording", + "HB Steinway D Intimate", + "HB Steinway D Cinematic", + "HB Steinway D Close Mic Classical", + "HB Steinway D Close Mic Jazz", + "HB Steinway D Player Wide", + "HB Steinway D Player Clean", + "HB Steinway D Trio", + "HB Steinway D Duo", + "HB Steinway D Cabaret", + "HB Steinway D Bright", + "HB Steinway D Hyper Bright", + "HB Steinway D Prepared", + "HB Steinway D Honky Tonk", + ] + + preset = random.choice(presets) + + # Safely quote the preset name, MIDI path, and WAV path + safe_preset = shlex.quote(preset) + safe_mid_path = shlex.quote(mid_path) + safe_wav_path = shlex.quote(wav_path) + + # Construct the command + command = f"/home/mchorse/pianoteq/x86-64bit/Pianoteq\\ 8\\ STAGE --preset {safe_preset} --midi {safe_mid_path} --wav {safe_wav_path}" + + return command + + def write_features(audio_path: str, mid_path: str, save_path: str): features = get_wav_mid_segments( audio_path=audio_path, @@ -121,7 +189,7 @@ def write_synth_features(cli_cmd_fn: Callable, mid_path: str, save_path: str): try: get_synth_audio( - cli_cmd=cli_cmd_fn, mid_path=mid_path, wav_path=audio_path_temp + cli_cmd_fn=cli_cmd_fn, mid_path=mid_path, wav_path=audio_path_temp ) except: if os.path.isfile(audio_path_temp): @@ -133,7 +201,11 @@ def write_synth_features(cli_cmd_fn: Callable, mid_path: str, save_path: str): mid_path=mid_path, return_json=False, ) - os.remove(audio_path_temp) + + if os.path.isfile(audio_path_temp): + os.remove(audio_path_temp) + + print(f"Found {len(features)}") with open(save_path, mode="a") as file: for wav, seq in features: @@ -174,7 +246,11 @@ def build_synth_worker_fn( while not load_path_queue.empty(): mid_path = load_path_queue.get() - write_synth_features(cli_cmd, mid_path, worker_save_path) + try: + write_synth_features(cli_cmd, mid_path, worker_save_path) + except Exception as e: + print("Failed") + print(e) save_path_queue.put(worker_save_path) @@ -239,7 +315,7 @@ def _format(tok): seq_len=self.config["max_seq_len"], ) - return wav, self.tokenizer.encode(src), self.tokenizer.encode(tgt) + return wav, self.tokenizer.encode(src), self.tokenizer.encode(tgt), idx def _build_index(self): self.file_mmap.seek(0) @@ -254,7 +330,7 @@ def _build_index(self): return index - def _save_index(self, index: list[int], save_path: str): + def _save_index(self, index: list, save_path: str): with open(save_path, "w") as file: for idx in index: file.write(f"{idx}\n") @@ -325,7 +401,7 @@ def build( ] else: # Build synthetic dataset - assert len(load_paths[0]) == 1, "Invalid load paths" + assert isinstance(load_paths[0], str), "Invalid load paths" print("Building synthetic dataset") worker_processes = [ Process( diff --git a/amt/evaluate.py b/amt/evaluate.py index d79e885..282fc64 100644 --- a/amt/evaluate.py +++ b/amt/evaluate.py @@ -42,27 +42,32 @@ def midi_to_hz(note, shift=0): # return (a / 32) * (2 ** ((note - 9) / 12)) +def get_matched_files(est_dir: str, ref_dir: str): + # We assume that the files have the same path relative to their directory + + res 
= [] + est_paths = glob.glob(os.path.join(est_dir, "**/*.mid"), recursive=True) + print(f"found {len(est_paths)} est files") + + for est_path in est_paths: + est_rel_path = os.path.relpath(est_path, est_dir) + ref_path = os.path.join( + ref_dir, os.path.splitext(est_rel_path)[0] + ".midi" + ) + if os.path.isfile(ref_path): + res.append((est_path, ref_path)) + + print(f"found {len(res)} matched est-ref pairs") + + return res + + def evaluate_mir_eval(est_dir, ref_dir, output_stats_file=None, est_shift=0): """ Evaluate the estimated pitches against the reference pitches using mir_eval. """ - # Evaluate the estimated pitches against the reference pitches - ref_midi_files = glob.glob(f"{ref_dir}/*.mid*") - est_midi_files = glob.glob(f"{est_dir}/*.mid*") - - est_ref_pairs = [] - for est_fpath in est_midi_files: - ref_fpath = os.path.join(ref_dir, os.path.basename(est_fpath)) - if ref_fpath in ref_midi_files: - est_ref_pairs.append((est_fpath, ref_fpath)) - if ref_fpath.replace(".mid", ".midi") in ref_midi_files: - est_ref_pairs.append( - (est_fpath, ref_fpath.replace(".mid", ".midi")) - ) - else: - print( - f"Reference file not found for {est_fpath} (ref file: {ref_fpath})" - ) + + est_ref_pairs = get_matched_files(est_dir, ref_dir) output_fhandle = ( open(output_stats_file, "w") if output_stats_file is not None else None @@ -104,38 +109,9 @@ def evaluate_mir_eval(est_dir, ref_dir, output_stats_file=None, est_shift=0): help="Path to the file to save the evaluation stats", ) - # add mir_eval and dtw subparsers - subparsers = parser.add_subparsers(help="sub-command help") - mir_eval_parse = subparsers.add_parser( - "run_mir_eval", - help="Run standard mir_eval evaluation on MAESTRO test set.", - ) - mir_eval_parse.add_argument( - "--shift", - type=int, - default=0, - help="Shift to apply to the estimated pitches.", - ) - - # to come - dtw_eval_parse = subparsers.add_parser( - "run_dtw", - help="Run dynamic time warping evaluation on a specified dataset.", - ) - args = parser.parse_args() - if not hasattr(args, "command"): - parser.print_help() - print("Unrecognized command") - exit(1) - - # todo: should we add an option to run transcription again every time we wish to evaluate? - # that way, we can run both tests with a range of different audio augmentations right here. - # -> We expect that baseline methods will fall flat on these, while aria-amt will be OK. 
- - if args.command == "run_mir_eval": - evaluate_mir_eval( - args.est_dir, args.ref_dir, args.output_stats_file, args.shift - ) - elif args.command == "run_dtw": - pass + evaluate_mir_eval( + args.est_dir, + args.ref_dir, + args.output_stats_file, + ) diff --git a/amt/inference/transcribe.py b/amt/inference/transcribe.py index 9109e6a..4984dad 100644 --- a/amt/inference/transcribe.py +++ b/amt/inference/transcribe.py @@ -1,4 +1,6 @@ import os +import sys +import signal import time import random import logging @@ -10,7 +12,6 @@ import torch._inductor.config from torch.multiprocessing import Queue -from concurrent.futures import ThreadPoolExecutor from tqdm import tqdm from functools import wraps from torch.cuda import is_bf16_supported @@ -31,15 +32,17 @@ CHUNK_LEN_MS = LEN_MS // STRIDE_FACTOR -def _setup_logger(): +def _setup_logger(name: str | None = None): + logger_name = f"[{name}] " if name else "" logger = logging.getLogger(__name__) for h in logger.handlers[:]: logger.removeHandler(h) logger.propagate = False logger.setLevel(logging.DEBUG) + # Adjust the formatter to include the name before the PID if provided formatter = logging.Formatter( - "[%(asctime)s] %(process)d: [%(levelname)s] %(message)s", + f"[%(asctime)s] {logger_name}%(process)d: [%(levelname)s] %(message)s", ) ch = logging.StreamHandler() @@ -110,12 +113,12 @@ def update_seq_end_idxs_( prefix_lens: torch.Tensor, idx: int, ): - # Update eos_idxs if next tok is eos_tok - eos_mask = next_tok_ids == 1 + # Update eos_idxs if next tok is eos_tok and not eos_id < idx + eos_mask = (next_tok_ids == 1) & (eos_idxs > idx) eos_idxs[eos_mask] = idx # Update eos_idxs if next tok in onset > 20000 - offset_mask = next_tok_ids >= 2418 + offset_mask = (next_tok_ids >= 2418) & (eos_idxs > idx) eos_idxs[offset_mask] = idx - 2 # Don't update toks in prefix or after eos_idx @@ -136,6 +139,7 @@ def wrapper(*args, **kwargs): return wrapper +@torch.no_grad() def decode_token( model: AmtEncoderDecoder, x: torch.Tensor, @@ -163,22 +167,23 @@ def process_segments( tokenizer: AmtTokenizer, logger: logging.Logger, ): - audio_segs = torch.stack( - [audio_seg for (audio_seg, prefix), _ in tasks] - ).cuda() - log_mels = audio_transform.log_mel(audio_segs) + log_mels = audio_transform.log_mel( + torch.stack([audio_seg.cuda() for (audio_seg, prefix), _ in tasks]) + ) audio_features = model.encoder(xa=log_mels) raw_prefixes = [prefix for (audio_seg, prefix), _ in tasks] prefix_lens = torch.tensor( [len(prefix) for prefix in raw_prefixes], dtype=torch.int - ) + ).cuda() min_prefix_len = min(prefix_lens).item() prefixes = [ tokenizer.trunc_seq(prefix, MAX_BLOCK_LEN) for prefix in raw_prefixes ] seq = torch.stack([tokenizer.encode(prefix) for prefix in prefixes]).cuda() - eos_idxs = torch.tensor([MAX_BLOCK_LEN for _ in prefixes], dtype=torch.int) + eos_idxs = torch.tensor( + [MAX_BLOCK_LEN for _ in prefixes], dtype=torch.int + ).cuda() # for idx in ( # pbar := tqdm( @@ -243,34 +248,39 @@ def process_segments( return results -# There is a memory leak in here somewhere def gpu_manager( gpu_batch_queue: Queue, result_queue: Queue, model: AmtEncoderDecoder, batch_size: int, + compile: bool = False, gpu_id: int | None = None, ): - logger = _setup_logger() + if gpu_id: + logger = _setup_logger(name=f"GPU-{gpu_id}") + else: + logger = _setup_logger(name=f"GPU") + logger.info("Started GPU manager") if gpu_id is not None: os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - global decode_token, recalculate_tok_ids model.decoder.setup_cache(batch_size=batch_size, 
max_seq_len=MAX_BLOCK_LEN) model.cuda() model.eval() - if batch_size == 1: - recalculate_tok_ids = torch.compile( - recalculate_tok_ids, mode="max-autotune-no-cudagraphs" + if compile is True: + global decode_token, recalculate_tok_ids + if batch_size == 1: + recalculate_tok_ids = torch.compile( + recalculate_tok_ids, mode="max-autotune-no-cudagraphs" + ) + decode_token = torch.compile( + decode_token, + # mode="reduce-overhead", + # mode="max-autotune", + fullgraph=True, ) - decode_token = torch.compile( - decode_token, - # mode="reduce-overhead", - mode="max-autotune", - fullgraph=True, - ) audio_transform = AudioTransform().cuda() tokenizer = AmtTokenizer(return_tensors=True) @@ -278,9 +288,9 @@ def gpu_manager( try: while True: try: - batch = gpu_batch_queue.get(timeout=10) + batch = gpu_batch_queue.get(timeout=30) except Exception as e: - logger.info(f"GPU timedout waiting for batch") + logger.info(f"GPU timed out waiting for batch") break else: try: @@ -339,13 +349,13 @@ def gpu_batch_manager( gpu_batch_queue: Queue, batch_size: int, ): - logger = _setup_logger() + logger = _setup_logger(name="B") logger.info("Started batch manager") try: tasks = [] while True: try: - task, pid = gpu_task_queue.get(timeout=0.2) + task, pid = gpu_task_queue.get(timeout=0.05) except Exception as e: pass else: @@ -360,7 +370,9 @@ def gpu_batch_manager( # Get new batch and add to batch queue if len(tasks) < batch_size: - logger.info("Not enough tasks - padding batch") + logger.warning( + f"Not enough tasks ({len(tasks)}) - padding batch" + ) while len(tasks) < batch_size: _pad_task, _pid = tasks[0] tasks.append((_pad_task, -1)) @@ -397,7 +409,6 @@ def _truncate_seq( seq: list, start_ms: int, end_ms: int, - logger: logging.Logger, tokenizer: AmtTokenizer = AmtTokenizer(), ): # Truncates and shifts a sequence by retokenizing the underlying midi_dict @@ -408,16 +419,15 @@ def _truncate_seq( random.shuffle(unclosed_notes) return [("prev", p) for p in unclosed_notes] + [tokenizer.bos_tok] else: - try: - _mid_dict = tokenizer._detokenize_midi_dict(seq, LEN_MS) - res = tokenizer._tokenize_midi_dict(_mid_dict, start_ms, end_ms - 1) - except Exception as e: - logger.error(f"Truncate segment failed: {e}") + _mid_dict = tokenizer._detokenize_midi_dict(seq, LEN_MS) + if len(_mid_dict.note_msgs) == 0: return [tokenizer.bos_tok] else: - if res[-1] == tokenizer.eos_tok: - res.pop() - return res + res = tokenizer._tokenize_midi_dict(_mid_dict, start_ms, end_ms - 1) + + if res[-1] == tokenizer.eos_tok: + res.pop() + return res def transcribe_file( @@ -433,63 +443,89 @@ def transcribe_file( audio_segments = [ f for f, _ in get_wav_mid_segments( - audio_path=file_path, stride_factor=STRIDE_FACTOR + audio_path=file_path, + stride_factor=STRIDE_FACTOR, + pad_last=True, ) ] res = [] seq = [tokenizer.bos_tok] concat_seq = [tokenizer.bos_tok] - for idx, audio_seg in enumerate(audio_segments): + idx = 0 + while audio_segments: init_idx = len(seq) # Add to gpu queue and wait for results - gpu_task_queue.put(((audio_seg, seq), pid)) + gpu_task_queue.put(((audio_segments.pop(0), seq), pid)) while True: - # Issue with this logic perhaps - gpu_result = result_queue.get(timeout=300) - if gpu_result["pid"] == pid: - seq = gpu_result["result"] - break + try: + gpu_result = result_queue.get(timeout=0.1) + except Exception as e: + pass else: - result_queue.put(gpu_result) - - concat_seq += _shift_onset( - seq[init_idx:], - idx * CHUNK_LEN_MS, - ) - - if idx == len(audio_segments) - 1: - res.append(concat_seq) - elif concat_seq[-1] == 
tokenizer.eos_tok: - res.append(concat_seq) - seq = [tokenizer.bos_tok] - concat_seq = [tokenizer.bos_tok] - logger.info(f"Finished segment (eos_tok): {file_path}") - else: - # This might need it's logic adjusted + if gpu_result["pid"] == pid: + seq = gpu_result["result"] + break + else: + result_queue.put(gpu_result) - seq = _truncate_seq( + try: + next_seq = _truncate_seq( seq, CHUNK_LEN_MS, LEN_MS - CHUNK_LEN_MS, - logger=logger, ) + except Exception as e: + logger.info( + f"Skipping segment {idx} (failed to transcribe): {file_path}" + ) + logger.debug(traceback.format_exc()) + seq = [tokenizer.bos_tok] + else: + if seq[-1] == tokenizer.eos_tok: + logger.info(f"Seen eos_tok at segment {idx}: {file_path}") + seq = seq[:-1] - if len(seq) == 1: - logger.error(f"Failed to transcribe segment: {file_path}") - if len(concat_seq) > 500: - res.append(concat_seq) - else: - pass - # logger.info(f"Sequence too short ({len(concat_seq)})") - + if len(next_seq) == 1: + logger.info(f"Skipping segment {idx} (silence): {file_path}") seq = [tokenizer.bos_tok] - concat_seq = [tokenizer.bos_tok] + else: + concat_seq += _shift_onset( + seq[init_idx:], + idx * CHUNK_LEN_MS, + ) + seq = next_seq + + idx += 1 + + res.append(concat_seq) return res +def get_save_path( + file_path: str, + input_dir: str, + save_dir: str, + idx: int | str = "", +): + if input_dir is None: + save_path = os.path.join( + save_dir, + os.path.splitext(os.path.basename(file_path))[0] + f"{idx}.mid", + ) + else: + input_rel_path = os.path.relpath(file_path, input_dir) + save_path = os.path.join( + save_dir, os.path.splitext(input_rel_path)[0] + f"{idx}.mid" + ) + if not os.path.isdir(os.path.dirname(save_path)): + os.makedirs(os.path.dirname(save_path), exist_ok=True) + + return save_path + + def process_file( file_path: str, file_queue: Queue, @@ -517,29 +553,14 @@ def _save_seq(_seq: list, _save_path: str): mid.save(_save_path) except Exception as e: logger.error(f"Failed to save {_save_path}") - - def _get_save_path(_file_path: str, _idx: int | str = ""): - if input_dir is None: - save_path = os.path.join( - save_dir, - os.path.splitext(os.path.basename(file_path))[0] - + f"{_idx}.mid", - ) - else: - input_rel_path = os.path.relpath(_file_path, input_dir) - save_path = os.path.join( - save_dir, os.path.splitext(input_rel_path)[0] + f"{_idx}.mid" - ) - if not os.path.isdir(os.path.dirname(save_path)): - os.makedirs(os.path.dirname(save_path), exist_ok=True) - - return save_path + logger.debug(traceback.format_exc()) + logger.debug(_seq) def remove_failures_from_queue_(_queue: Queue, _pid: int): _buff = [] while True: try: - _buff.append(_queue(timout=5)) + _buff.append(_queue.get(timeout=5)) except Exception: break @@ -564,18 +585,32 @@ def remove_failures_from_queue_(_queue: Queue, _pid: int): return logger.info(f"Finished file: {file_path}") - _idx = 0 for seq in seqs: - if len(seq) < 1000: + if len(seq) < 500: logger.info("Skipping seq - too short") - continue - _save_seq(seq, _get_save_path(file_path, _idx)) - _idx += 1 + else: + logger.debug( + f"Saving seq of length {len(seq)} from file: {file_path}" + ) + + _save_seq(seq, get_save_path(file_path, input_dir, save_dir)) - logger.info(f"Transcribed into {_idx} segment(s)") logger.info(f"{file_queue.qsize()} file(s) remaining in queue") +def watchdog(main_gpu_pid: int, child_pids: list): + while True: + if not os.path.exists(f"/proc/{main_gpu_pid}"): + print("Cleaning up children...") + for pid in child_pids: + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pass + 
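+                # NOTE: polling /proc/<pid> is Linux-only; psutil.pid_exists(main_gpu_pid) would be a portable alternative (psutil is already in requirements.txt)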
+ time.sleep(1) + + def worker( file_queue: Queue, gpu_task_queue: Queue, @@ -584,7 +619,7 @@ def worker( input_dir: str | None = None, tasks_per_worker: int = 1, ): - logger = _setup_logger() + logger = _setup_logger(name="F") tokenizer = AmtTokenizer() threads = [] try: @@ -629,8 +664,10 @@ def batch_transcribe( batch_size: int = 16, input_dir: str | None = None, gpu_ids: int | None = None, - quantize: bool = True, + quantize: bool = False, + compile: bool = False, ): + assert os.name == "posix", "UNIX/LINUX is the only supported OS" torch.multiprocessing.set_start_method("spawn") num_gpus = len(gpu_ids) if gpu_ids is not None else 1 logger = _setup_logger() @@ -639,17 +676,31 @@ def batch_transcribe( os.remove("transcribe.log") if quantize is True: - logger.info("Quantising weights to int8") - model = quantize_int8(model) + logger.info("Quantising decoder weights to int8") + model.decoder = quantize_int8(model.decoder) + + file_queue = Queue() + for file_path in file_paths: + if ( + os.path.isfile(get_save_path(file_path, input_dir, save_dir)) + is False + ): + file_queue.put(file_path) + elif len(file_paths) == 1: + file_queue.put(file_path) + + logger.info(f"Files to process: {file_queue.qsize()}/{len(file_paths)}") + + num_workers = min( + min(batch_size * num_gpus, multiprocessing.cpu_count() - num_gpus), + file_queue.qsize(), + ) gpu_task_queue = Queue() gpu_batch_queue = Queue() result_queue = Queue() - file_queue = Queue() - for file_path in file_paths: - file_queue.put(file_path) - num_workers = min(batch_size * num_gpus, len(file_paths)) + child_pids = [] logger.info(f"Creating {num_workers} file worker(s)") worker_processes = [ multiprocessing.Process( @@ -660,47 +711,73 @@ def batch_transcribe( result_queue, save_dir, input_dir, - # Wait for all threads to finish - 4, + 5, ), ) for _ in range(num_workers) ] + + for p in worker_processes: + p.start() + child_pids.append(p.pid) + gpu_batch_manager_process = multiprocessing.Process( target=gpu_batch_manager, args=(gpu_task_queue, gpu_batch_queue, batch_size), ) + gpu_batch_manager_process.start() + child_pids.append(gpu_batch_manager_process.pid) + time.sleep(5) start_time = time.time() - if num_gpus == 1: - gpu_manager_processes = [ - multiprocessing.Process( - target=gpu_manager, - args=(gpu_batch_queue, result_queue, model, batch_size), - ) - ] - else: + + if num_gpus > 1: gpu_manager_processes = [ multiprocessing.Process( target=gpu_manager, - args=(gpu_batch_queue, result_queue, model, batch_size, gpu_id), + args=( + gpu_batch_queue, + result_queue, + model, + batch_size, + compile, + gpu_id, + ), ) for gpu_id in gpu_ids ] + for p in gpu_manager_processes: + p.start() + watchdog_process = multiprocessing.Process( + target=watchdog, args=(gpu_batch_manager_process.pid, child_pids) + ) + watchdog_process.start() + else: + gpu_manager_processes = None + watchdog_process = multiprocessing.Process( + target=watchdog, args=(os.getpid(), child_pids) + ) + watchdog_process.start() + gpu_manager( + gpu_batch_queue, + result_queue, + model, + batch_size, + compile, + ) - for p in worker_processes: - p.start() - time.sleep(5) - gpu_batch_manager_process.start() - for p in gpu_manager_processes: - p.start() + if gpu_manager_processes is not None: + for p in gpu_manager_processes: + p.join() - # Watch for file workers to finish for p in worker_processes: + p.terminate() p.join() - for p in gpu_manager_processes: - p.join() + gpu_batch_manager_process.terminate() + gpu_batch_manager_process.join() + watchdog_process.terminate() + 
watchdog_process.join() print("Took", (time.time() - start_time) / 60, "mins to transcribe files") diff --git a/amt/run.py b/amt/run.py index 1b2bcc5..1af6ced 100644 --- a/amt/run.py +++ b/amt/run.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse -import sys import os import glob @@ -11,7 +10,6 @@ # TODO: Implement a way of inferring the tokenizer name automatically def _add_maestro_args(subparser): subparser.add_argument("dir", help="MAESTRO directory path") - subparser.add_argument("csv", help="MAESTRO csv path") subparser.add_argument("-train", help="train save path", required=True) subparser.add_argument("-val", help="val save path", required=True) subparser.add_argument("-test", help="test save path", required=True) @@ -19,7 +17,20 @@ def _add_maestro_args(subparser): "-mp", help="number of processes to use", type=int, - required=False, + default=1, + ) + + +def _add_synth_args(subparser): + subparser.add_argument("dir", help="Directory containing MIDIs") + subparser.add_argument("csv", help="Split csv") + subparser.add_argument("-train", help="train save path", required=True) + subparser.add_argument("-test", help="test save path", required=True) + subparser.add_argument( + "-mp", + help="number of processes to use", + type=int, + default=1, ) @@ -32,6 +43,12 @@ def _add_transcribe_args(subparser): subparser.add_argument( "-load_dir", help="dir containing mp3/wav files", required=False ) + subparser.add_argument( + "-maestro", + help="get file paths from maestro val/test sets", + action="store_true", + default=False, + ) subparser.add_argument( "-save_dir", help="dir to save midi files", required=True ) @@ -44,30 +61,86 @@ def _add_transcribe_args(subparser): action="store_true", default=False, ) + subparser.add_argument( + "-compile", + help="use the pytorch compiler to generate a cuda graph", + action="store_true", + default=False, + ) subparser.add_argument("-bs", help="batch size", type=int, default=16) -def build_maestro( - maestro_dir, maestro_csv_file, train_file, val_file, test_file, num_procs +def get_synth_mid_paths(mid_dir: str, csv_path: str): + assert os.path.isdir(mid_dir), "directory doesn't exist" + assert os.path.isfile(csv_path), "csv not found" + + train_paths = [] + test_paths = [] + with open(csv_path, "r") as f: + dict_reader = DictReader(f) + for entry in dict_reader: + mid_path = os.path.normpath( + os.path.join(mid_dir, entry["mid_path"]) + ) + + assert os.path.isfile(mid_path), "file missing" + if entry["split"] == "train": + train_paths.append(mid_path) + elif entry["split"] == "test": + test_paths.append(mid_path) + else: + raise ValueError("Invalid split") + + return train_paths, test_paths + + +def build_synth( + mid_dir: str, + csv_path: str, + train_file: str, + test_file: str, + num_procs: int, ): - from amt.data import AmtDataset + from amt.data import AmtDataset, pianoteq_cmd_fn - assert os.path.isdir(maestro_dir), "MAESTRO directory not found" - assert os.path.isfile(maestro_csv_file), "MAESTRO csv not found" if os.path.isfile(train_file): print(f"Dataset file already exists at {train_file} - removing") os.remove(train_file) - if os.path.isfile(val_file): - print(f"Dataset file already exists at {val_file} - removing") - os.remove(val_file) if os.path.isfile(test_file): print(f"Dataset file already exists at {test_file} - removing") os.remove(test_file) + ( + train_paths, + test_paths, + ) = get_synth_mid_paths(mid_dir, csv_path) + + print(f"Building {train_file}") + AmtDataset.build( + load_paths=train_paths, + save_path=train_file, + 
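+        # cli_cmd_fn below renders each MIDI through Pianoteq with a randomly chosen preset (see pianoteq_cmd_fn in amt/data.py)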
num_processes=num_procs, + cli_cmd_fn=pianoteq_cmd_fn, + ) + print(f"Building {test_file}") + AmtDataset.build( + load_paths=test_paths, + save_path=test_file, + num_processes=num_procs, + cli_cmd_fn=pianoteq_cmd_fn, + ) + + +def get_matched_maestro_paths(maestro_dir): + assert os.path.isdir(maestro_dir), "MAESTRO directory not found" + + maestro_csv_path = os.path.join(maestro_dir, "maestro-v3.0.0.csv") + assert os.path.isfile(maestro_csv_path), "MAESTRO csv not found" + matched_paths_train = [] matched_paths_val = [] matched_paths_test = [] - with open(maestro_csv_file, "r") as f: + with open(maestro_csv_path, "r") as f: dict_reader = DictReader(f) for entry in dict_reader: audio_path = os.path.normpath( @@ -92,6 +165,28 @@ def build_maestro( else: print("Invalid split") + return matched_paths_train, matched_paths_val, matched_paths_test + + +def build_maestro(maestro_dir, train_file, val_file, test_file, num_procs): + from amt.data import AmtDataset + + if os.path.isfile(train_file): + print(f"Dataset file already exists at {train_file} - removing") + os.remove(train_file) + if os.path.isfile(val_file): + print(f"Dataset file already exists at {val_file} - removing") + os.remove(val_file) + if os.path.isfile(test_file): + print(f"Dataset file already exists at {test_file} - removing") + os.remove(test_file) + + ( + matched_paths_train, + matched_paths_val, + matched_paths_test, + ) = get_matched_maestro_paths(maestro_dir) + print(f"Building {train_file}") AmtDataset.build( load_paths=matched_paths_train, @@ -118,8 +213,11 @@ def transcribe( save_dir, load_path=None, load_dir=None, + maestro=False, batch_size=16, multi_gpu=False, + quantize=False, + compile=False, ): """ Transcribe audio files to midi using the given model and checkpoint. @@ -158,7 +256,10 @@ def transcribe( trans_mode = "single" if load_dir: assert os.path.isdir(load_dir), "load directory doesn't exist" - trans_mode = "batch" + if maestro is True: + trans_mode = "maestro" + else: + trans_mode = "batch" if not os.path.exists(save_dir): os.makedirs(save_dir) assert os.path.isdir(save_dir), "save dir doesn't exist" @@ -189,6 +290,14 @@ def transcribe( ) print(f"Found {len(found_mp3)} mp3 and {len(found_wav)} wav files") file_paths = found_mp3 + found_wav + elif trans_mode == "maestro": + matched_train_paths, matched_val_paths, matched_test_paths = ( + get_matched_maestro_paths(load_dir) + ) + val_mp3_paths = [ap for ap, mp in matched_val_paths] + test_mp3_paths = [ap for ap, mp in matched_test_paths] + file_paths = test_mp3_paths # val_mp3_paths + test_mp3_paths + assert len(file_paths) == 177, "Invalid maestro files" else: file_paths = [load_path] batch_size = 1 @@ -205,6 +314,8 @@ def transcribe( batch_size=batch_size, input_dir=load_dir, gpu_ids=gpu_ids, + quantize=quantize, + compile=compile, ) else: @@ -214,6 +325,8 @@ def transcribe( save_dir=save_dir, batch_size=batch_size, input_dir=load_dir, + quantize=quantize, + compile=compile, ) @@ -225,10 +338,14 @@ def main(): subparser_maestro = subparsers.add_parser( "maestro", help="Commands to build the maestro dataset." ) + subparser_synth = subparsers.add_parser( + "synth", help="Commands to build the synthetic dataset." + ) subparser_transcribe = subparsers.add_parser( "transcribe", help="Commands to run transcription."
) _add_maestro_args(subparser_maestro) + _add_synth_args(subparser_synth) _add_transcribe_args(subparser_transcribe) args = parser.parse_args() @@ -240,21 +357,31 @@ def main(): elif args.command == "maestro": build_maestro( maestro_dir=args.dir, - maestro_csv_file=args.csv, train_file=args.train, val_file=args.val, test_file=args.test, num_procs=args.mp, ) + elif args.command == "synth": + build_synth( + mid_dir=args.dir, + csv_path=args.csv, + train_file=args.train, + test_file=args.test, + num_procs=args.mp, + ) elif args.command == "transcribe": transcribe( model_name=args.model_name, checkpoint_path=args.checkpoint_path, load_path=args.load_path, load_dir=args.load_dir, + maestro=args.maestro, save_dir=args.save_dir, batch_size=args.bs, multi_gpu=args.multi_gpu, + quantize=args.q8, + compile=args.compile, ) else: print("Unrecognized command") diff --git a/amt/tokenizer.py b/amt/tokenizer.py index c368673..8b38cae 100644 --- a/amt/tokenizer.py +++ b/amt/tokenizer.py @@ -329,15 +329,16 @@ def _detokenize_midi_dict( ) elif tok_1_type == "on": if (tok_2_type, tok_3_type) != ("onset", "vel"): - print("Unexpected token order:", tok_1, tok_2, tok_3) - raise ValueError + raise ValueError( + "Unexpected token order:", tok_1, tok_2, tok_3 + ) else: notes_to_close[tok_1_data] = (tok_2_data, tok_3_data) elif tok_1_type == "off": if tok_2_type != "onset": - print("Unexpected token order:", tok_1, tok_2, tok_3) - if DEBUG: - raise Exception + raise ValueError( + "Unexpected token order:", tok_1, tok_2, tok_3 + ) else: # Process note and add to note msgs note_to_close = notes_to_close.pop(tok_1_data, None) diff --git a/amt/train.py b/amt/train.py index eee1b8e..864ed93 100644 --- a/amt/train.py +++ b/amt/train.py @@ -1,11 +1,13 @@ import os import sys import csv +import math import random import functools import argparse import logging import torch +import torchaudio import accelerate from torch import nn as nn @@ -148,7 +150,7 @@ def _get_optim( warmup_lrs = torch.optim.lr_scheduler.LinearLR( optimizer, - start_factor=0.000001, + start_factor=1e-8, end_factor=1, total_iters=warmup, ) @@ -194,7 +196,7 @@ def get_finetune_optim( ): LR = 1e-4 END_RATIO = 0.1 - WARMUP_STEPS = 500 + WARMUP_STEPS = 1000 return _get_optim( lr=LR, @@ -229,7 +231,7 @@ def get_dataloaders( audio_transform = AudioTransform() def _collate_fn(seqs, max_pitch_shift: int): - wav, src, tgt = torch.utils.data.default_collate(seqs) + wav, src, tgt, idxs = torch.utils.data.default_collate(seqs) # Pitch aug pitch_shift = random.randint(-max_pitch_shift, max_pitch_shift) @@ -239,13 +241,13 @@ def _collate_fn(seqs, max_pitch_shift: int): # Distortion wav = audio_transform.distortion_aug_cpu(wav) - return wav, src, tgt, pitch_shift + return wav, src, tgt, pitch_shift, idxs train_dataloader = DataLoader( train_dataset, batch_size=batch_size, num_workers=num_workers, - collate_fn=functools.partial(_collate_fn, max_pitch_shift=5), + collate_fn=functools.partial(_collate_fn, max_pitch_shift=4), shuffle=True, ) val_dataloader = DataLoader( @@ -258,14 +260,6 @@ def _collate_fn(seqs, max_pitch_shift: int): return train_dataloader, val_dataloader -def rolling_average(prev_avg: float, x_n: float, n: int): - # Returns rolling average without needing to recalculate - if n == 0: - return x_n - else: - return ((prev_avg * (n - 1)) / n) + (x_n / n) - - def _train( epochs: int, accelerator: accelerate.Accelerator, @@ -292,6 +286,18 @@ def make_checkpoint(_accelerator, _epoch: int, _step: int): ) _accelerator.save_state(checkpoint_dir) + def 
get_max_norm(named_parameters): + max_grad_norm = {"val": 0.0} + for name, parameter in named_parameters: + if parameter.grad is not None and parameter.requires_grad is True: + grad_norm = parameter.grad.data.norm(2).item() + # logger.debug(f"{name}: {grad_norm}") + if grad_norm >= max_grad_norm["val"]: + max_grad_norm["name"] = name + max_grad_norm["val"] = grad_norm + + return max_grad_norm + # This is all slightly messy as train_loop and val_loop make use of the # variables in the wider scope. Perhaps refactor this at some point. def train_loop( @@ -319,58 +325,73 @@ def train_loop( leave=False, ) ): - step = __step + _resume_step + 1 - - wav, src, tgt, pitch_shift = batch - with torch.no_grad(): - mel = audio_transform.forward(wav, shift=pitch_shift) - logits = model(mel, src) # (b_sz, s_len, v_sz) - logits = logits.transpose(1, 2) # Transpose for CrossEntropyLoss - loss = loss_fn(logits, tgt) + with accelerator.accumulate(model): + step = __step + _resume_step + 1 - # Calculate statistics - loss_buffer.append(loss.item()) - if len(loss_buffer) > TRAILING_LOSS_STEPS: - loss_buffer.pop(0) - trailing_loss = sum(loss_buffer) / len(loss_buffer) - avg_train_loss = rolling_average( - avg_train_loss, loss.item(), __step - ) + wav, src, tgt, pitch_shift, idxs = batch - # Logging - logger.debug( - f"EPOCH {_epoch} STEP {step}: " - f"lr={lr_for_print}, " - f"loss={round(loss.item(), 4)}, " - f"trailing_loss={round(trailing_loss, 4)}, " - f"average_loss={round(avg_train_loss, 4)}" - ) - if accelerator.is_main_process: - loss_writer.writerow([_epoch, step, loss.item()]) - pbar.set_postfix_str( - f"lr={lr_for_print}, " - f"loss={round(loss.item(), 4)}, " - f"trailing={round(trailing_loss, 4)}" - ) + mel = audio_transform.forward(wav, shift=pitch_shift) + logits = model(mel, src) # (b_sz, s_len, v_sz) + logits = logits.transpose( + 1, 2 + ) # Transpose for CrossEntropyLoss + loss = loss_fn(logits, tgt) - # Backwards step - accelerator.backward(loss) - if accelerator.sync_gradients: + # Calculate statistics + loss_buffer.append(accelerator.gather(loss).mean(dim=0).item()) + trailing_loss = sum(loss_buffer[-TRAILING_LOSS_STEPS:]) / len( + loss_buffer[-TRAILING_LOSS_STEPS:] + ) + avg_train_loss = sum(loss_buffer) / len(loss_buffer) + + # Logging + logger.debug( + f"EPOCH {_epoch} STEP {step}: " + f"lr={lr_for_print}, " + f"loss={round(loss_buffer[-1], 4)}, " + f"trailing_loss={round(trailing_loss, 4)}, " + f"average_loss={round(avg_train_loss, 4)}" + ) + if accelerator.is_main_process: + loss_writer.writerow([_epoch, step, loss_buffer[-1]]) + + pbar.set_postfix_str( + f"lr={lr_for_print}, " + f"loss={round(loss_buffer[-1], 4)}, " + f"trailing={round(trailing_loss, 4)}" + ) + + # Backwards step + accelerator.backward(loss) + + max_grad_norm_bn = get_max_norm(model.named_parameters()) accelerator.clip_grad_norm_(model.parameters(), 1.0) + max_grad_norm_an = get_max_norm(model.named_parameters()) - optimizer.step() - optimizer.zero_grad() - if scheduler: - scheduler.step() - lr_for_print = "{:.2e}".format(scheduler.get_last_lr()[0]) - - if steps_per_checkpoint: - if step % steps_per_checkpoint == 0: - make_checkpoint( - _accelerator=accelerator, - _epoch=_epoch, - _step=step, + if max_grad_norm_bn["val"] > 1.5: + logger.warning( + f"Seen large grad_norm {max_grad_norm_bn['name']}: {max_grad_norm_bn['val']} -> {max_grad_norm_an['val']}" ) + logger.debug(accelerator.gather(loss)) + logger.debug(accelerator.gather(idxs)) + elif math.isnan(trailing_loss): + logger.error(accelerator.gather(loss)) + 
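+                        # Also log the recent loss history and the dataset indices so the batch that produced the NaN can be traced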
logger.error(loss_buffer) + logger.error(accelerator.gather(idxs)) + + optimizer.step() + optimizer.zero_grad() + if scheduler: + scheduler.step() + lr_for_print = "{:.2e}".format(scheduler.get_last_lr()[0]) + + if steps_per_checkpoint: + if step % steps_per_checkpoint == 0: + make_checkpoint( + _accelerator=accelerator, + _epoch=_epoch, + _step=step, + ) logger.info( f"EPOCH {_epoch}/{epochs + start_epoch}: Finished training - " @@ -379,8 +400,9 @@ def train_loop( return avg_train_loss + @torch.no_grad() def val_loop(dataloader, _epoch: int, aug: bool): - avg_val_loss = 0 + loss_buffer = [] model.eval() for step, batch in ( pbar := tqdm( @@ -389,26 +411,25 @@ def val_loop(dataloader, _epoch: int, aug: bool): leave=False, ) ): - wav, src, tgt = batch - with torch.no_grad(): - if aug == False: - mel = audio_transform.log_mel(wav) - elif aug == True: - # Apply aug without distortion or spec-augment - mel = audio_transform.log_mel( - audio_transform.aug_wav(wav), detune=True - ) - else: - raise TypeError + wav, src, tgt, idxs = batch + + if aug == False: + mel = audio_transform.log_mel(wav) + elif aug == True: + # Apply aug without distortion or spec-augment + mel = audio_transform.log_mel( + audio_transform.aug_wav(wav), detune=True + ) + else: + raise TypeError - logits = model(mel, src) - logits = logits.transpose( - 1, 2 - ) # Transpose for CrossEntropyLoss - loss = loss_fn(logits, tgt) + logits = model(mel, src) + logits = logits.transpose(1, 2) # Transpose for CrossEntropyLoss + loss = loss_fn(logits, tgt) # Logging - avg_val_loss = rolling_average(avg_val_loss, loss.item(), step) + loss_buffer.append(accelerator.gather(loss).mean(dim=0).item()) + avg_val_loss = sum(loss_buffer) / len(loss_buffer) pbar.set_postfix_str(f"average_loss={round(avg_val_loss, 4)}") # EPOCH @@ -425,7 +446,7 @@ def val_loop(dataloader, _epoch: int, aug: bool): steps_per_checkpoint > 1 ), "Invalid checkpoint mode value (too small)" - TRAILING_LOSS_STEPS = 200 + TRAILING_LOSS_STEPS = 100 PAD_ID = train_dataloader.dataset.tokenizer.pad_id logger = get_logger(__name__) # Accelerate logger loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID) @@ -531,7 +552,9 @@ def resume_train( assert os.path.isfile(val_data_path), f"No file found at {val_data_path}" tokenizer = AmtTokenizer() - accelerator = accelerate.Accelerator(project_dir=project_dir) + accelerator = accelerate.Accelerator( + project_dir=project_dir, gradient_accumulation_steps=4 + ) if accelerator.is_main_process: project_dir = setup_project_dir(project_dir) logger = setup_logger(project_dir) @@ -662,7 +685,9 @@ def train( assert os.path.isfile(finetune_cp_path), "Invalid checkpoint path" tokenizer = AmtTokenizer() - accelerator = accelerate.Accelerator(project_dir=project_dir) + accelerator = accelerate.Accelerator( + project_dir=project_dir, gradient_accumulation_steps=4 + ) if accelerator.is_main_process: project_dir = setup_project_dir(project_dir) logger = setup_logger(project_dir) @@ -801,6 +826,9 @@ def parse_train_args(): argp.add_argument("model", help="name of model config file") argp.add_argument("train_data", help="path to train dir") argp.add_argument("val_data", help="path to val dir") + argp.add_argument( + "-cpath", help="resuming checkpoint", type=str, required=False + ) argp.add_argument("-epochs", help="train epochs", type=int, required=True) argp.add_argument("-bs", help="batch size", type=int, default=32) argp.add_argument("-workers", help="number workers", type=int, default=1) @@ -849,6 +877,7 @@ def parse_train_args(): 
num_workers=train_args.workers, batch_size=train_args.bs, epochs=train_args.epochs, + finetune_cp_path=train_args.cpath, steps_per_checkpoint=train_args.spc, project_dir=train_args.pdir, ) diff --git a/config/models/medium-final.json b/config/models/medium-final.json new file mode 100644 index 0000000..69b79c7 --- /dev/null +++ b/config/models/medium-final.json @@ -0,0 +1,11 @@ +{ + "n_mels": 256, + "n_audio_ctx": 1500, + "n_audio_state": 768, + "n_audio_head": 12, + "n_audio_layer": 12, + "n_text_ctx": 4096, + "n_text_state": 768, + "n_text_head": 12, + "n_text_layer": 12 +} \ No newline at end of file diff --git a/config/models/medium.json b/config/models/small-final.json similarity index 77% rename from config/models/medium.json rename to config/models/small-final.json index 45c0de6..1d40dd9 100644 --- a/config/models/medium.json +++ b/config/models/small-final.json @@ -3,9 +3,9 @@ "n_audio_ctx": 1500, "n_audio_state": 512, "n_audio_head": 8, - "n_audio_layer": 12, + "n_audio_layer": 6, "n_text_ctx": 4096, "n_text_state": 512, "n_text_head": 8, - "n_text_layer": 12 + "n_text_layer": 6 } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a74c9cf..696fb40 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ aria @ git+https://github.com/EleutherAI/aria.git -torch >= 2.1 +torch >= 2.2 torchaudio accelerate +psutil mido tqdm orjson -mir_eval \ No newline at end of file +mir_eval diff --git a/scripts/eval/dedupe.py b/scripts/eval/dedupe.py new file mode 100644 index 0000000..8d67d81 --- /dev/null +++ b/scripts/eval/dedupe.py @@ -0,0 +1,72 @@ +import os +import hashlib +import argparse +import multiprocessing + +from pydub import AudioSegment + + +def hash_audio_file(file_path): + """Hash the audio content of an MP3 file.""" + try: + audio = AudioSegment.from_mp3(file_path) + raw_data = audio.raw_data + except Exception as e: + print(e) + return file_path, -1 + else: + return file_path, hashlib.sha256(raw_data).hexdigest() + + +def find_duplicates(root_dir): + """Find duplicate MP3 files in the directory and its subdirectories.""" + duplicates = [] + mp3_paths = [] + for root, _, files in os.walk(root_dir): + for file in files: + if file.endswith(".mp3"): + mp3_paths.append(os.path.join(root, file)) + + with multiprocessing.Pool() as pool: + hashes = pool.map(hash_audio_file, mp3_paths) + + seen_hash = {} + for p, h in hashes: + if h == -1: + # Hashing failed; skip so the -1 sentinel isn't treated as a duplicate + continue + if seen_hash.get(h, False) is True: + print("Seen dupe") + duplicates.append(p) + else: + print("Seen orig") + seen_hash[h] = True + + return duplicates + + +def remove_duplicates(duplicate_files): + """Remove the duplicate files.""" + for file in duplicate_files: + os.remove(file) + print(f"Removed duplicate file: {file}") + + +def main(): + parser = argparse.ArgumentParser( + description="Remove duplicate MP3 files based on audio content." + ) + parser.add_argument( + "dir", type=str, help="Directory to scan for duplicate MP3 files." + ) + args = parser.parse_args() + + root_directory = args.dir + duplicates = find_duplicates(root_directory) + + if duplicates: + print(f"Found {len(duplicates)} duplicates. 
Removing...") + remove_duplicates(duplicates) + else: + print("No duplicates found.") + + +if __name__ == "__main__": + main() diff --git a/scripts/eval/dtw.py b/scripts/eval/dtw.py new file mode 100644 index 0000000..41e5754 --- /dev/null +++ b/scripts/eval/dtw.py @@ -0,0 +1,211 @@ +# pip install git+https://github.com/alex2awesome/djitw.git + +import argparse +import csv +import librosa +import djitw +import pretty_midi +import scipy +import random +import multiprocessing +import os +import warnings +import functools +import glob +import numpy as np + +from multiprocessing.dummy import Pool as ThreadPool + +# Audio/CQT parameters +FS = 22050.0 +NOTE_START = 36 +N_NOTES = 48 +HOP_LENGTH = 1024 + +# DTW parameters +GULLY = 0.96 + + +def compute_cqt(audio_data): + """Compute the CQT and frame times for some audio data""" + # Compute CQT + cqt = librosa.cqt( + audio_data, + sr=FS, + fmin=librosa.midi_to_hz(NOTE_START), + n_bins=N_NOTES, + hop_length=HOP_LENGTH, + tuning=0.0, + ) + # Compute the time of each frame + times = librosa.frames_to_time( + np.arange(cqt.shape[1]), sr=FS, hop_length=HOP_LENGTH + ) + # Compute log-amplitude + cqt = librosa.amplitude_to_db(cqt, ref=cqt.max()) + # Normalize and return + return librosa.util.normalize(cqt, norm=2).T, times + + +# Had to change this to average chunks for large audio files for cpu reasons +def load_and_run_dtw(args): + def calc_score(_midi_cqt, _audio_cqt): + # Nearly all high-performing systems used cosine distance + distance_matrix = scipy.spatial.distance.cdist( + _midi_cqt, _audio_cqt, "cosine" + ) + + # Get lowest cost path + p, q, score = djitw.dtw( + distance_matrix, + GULLY, # The gully for all high-performing systems was near 1 + np.median( + distance_matrix + ), # The penalty was also near 1.0*median(distance_matrix) + inplace=False, + ) + # Normalize by path length, normalize by distance matrix submatrix within path + score = score / len(p) + score = ( + score / distance_matrix[p.min() : p.max(), q.min() : q.max()].mean() + ) + + return score + + audio_file, midi_file = args + # Load in the audio data + audio_data, _ = librosa.load(audio_file, sr=FS) + audio_cqt, audio_times = compute_cqt(audio_data) + + midi_object = pretty_midi.PrettyMIDI(midi_file) + midi_audio = midi_object.fluidsynth(fs=FS) + midi_cqt, midi_times = compute_cqt(midi_audio) + + # Truncate to save on compute time for long tracks + MAX_LEN = 10000 + total_len = midi_cqt.shape[0] + if total_len > MAX_LEN: + idx = 0 + scores = [] + while idx < total_len: + scores.append( + calc_score( + _midi_cqt=midi_cqt[idx : idx + MAX_LEN, :], + _audio_cqt=audio_cqt[idx : idx + MAX_LEN, :], + ) + ) + idx += MAX_LEN + + max_score = max(scores) + avg_score = sum(scores) / len(scores) if scores else 1.0 + + else: + avg_score = calc_score(_midi_cqt=midi_cqt, _audio_cqt=audio_cqt) + max_score = avg_score + + return midi_file, avg_score, max_score + + +# I changed wav with mp3 in here :/ +def get_matched_files(audio_dir: str, mid_dir: str): + # We assume that the files have the same path relative to their directory + res = [] + wav_paths = glob.glob(os.path.join(audio_dir, "**/*.mp3"), recursive=True) + print(f"found {len(wav_paths)} mp3 files") + + for wav_path in wav_paths: + input_rel_path = os.path.relpath(wav_path, audio_dir) + mid_path = os.path.join( + mid_dir, os.path.splitext(input_rel_path)[0] + ".mid" + ) + if os.path.isfile(mid_path): + res.append((wav_path, mid_path)) + + print(f"found {len(res)} matched mp3-midi pairs") + + return res + + +def abortable_worker(func, 
*args, **kwargs): + timeout = kwargs.get("timeout", None) + p = ThreadPool(1) + res = p.apply_async(func, args=args) + try: + out = res.get(timeout) + return out + except multiprocessing.TimeoutError: + return None, None, None + except Exception as e: + print(e) + return None, None, None + finally: + p.close() + p.join() + + +if __name__ == "__main__": + multiprocessing.set_start_method("fork") + warnings.filterwarnings( + "ignore", + category=UserWarning, + message="amplitude_to_db was called on complex input", + ) + parser = argparse.ArgumentParser() + parser.add_argument("-audio_dir", help="dir containing .mp3 files") + parser.add_argument( + "-mid_dir", help="dir containing .mid files", default=None + ) + parser.add_argument( + "-output_file", help="path to output file", default=None + ) + args = parser.parse_args() + + matched_files = get_matched_files( + audio_dir=args.audio_dir, mid_dir=args.mid_dir + ) + + results = {} + if os.path.exists(args.output_file): + with open(args.output_file, "r") as f: + reader = csv.DictReader(f) + for row in reader: + results[row["mid_path"]] = { + "avg_score": row["avg_score"], + "max_score": row["max_score"], + } + + matched_files = [ + (audio_path, mid_path) + for audio_path, mid_path in matched_files + if mid_path not in results.keys() + ] + random.shuffle(matched_files) + print(f"loaded {len(results)} results") + print(f"calculating scores for {len(matched_files)}") + + score_csv = open(args.output_file, "a") + csv_writer = csv.writer(score_csv) + if not results: + # Only write the header when starting a fresh results file + csv_writer.writerow(["mid_path", "avg_score", "max_score"]) + + with multiprocessing.Pool() as pool: + abortable_func = functools.partial( + abortable_worker, load_and_run_dtw, timeout=15000 + ) + scores = pool.imap_unordered(abortable_func, matched_files) + + skipped = 0 + processed = 0 + for mid_path, avg_score, max_score in scores: + if avg_score is not None and max_score is not None: + csv_writer.writerow([mid_path, avg_score, max_score]) + score_csv.flush() + else: + print("timeout") + skipped += 1 + + processed += 1 + if processed % 10 == 0: + print(f"PROCESSED: {processed}/{len(matched_files)}") + print("***") + + print(f"skipped: {skipped}") diff --git a/scripts/eval/dtw.sh b/scripts/eval/dtw.sh new file mode 100644 index 0000000..1d25b71 --- /dev/null +++ b/scripts/eval/dtw.sh @@ -0,0 +1,5 @@ +python /home/loubb/work/aria-amt/scripts/eval/dtw.py \ + -audio_dir /mnt/ssd1/data/mp3/raw/aria-mp3 \ + -mid_dir /mnt/ssd1/amt/transcribed_data/0/aria-mid \ + -output_file /mnt/ssd1/amt/transcribed_data/0/aria-mid.csv + diff --git a/scripts/eval/mir.sh b/scripts/eval/mir.sh new file mode 100644 index 0000000..0364d4c --- /dev/null +++ b/scripts/eval/mir.sh @@ -0,0 +1,5 @@ +python /home/loubb/work/aria-amt/amt/evaluate.py \ + --est-dir /home/loubb/work/aria-amt/maestro-ft \ + --ref-dir /mnt/ssd1/data/mp3/raw/maestro-mp3 \ + --output-stats-file out.json + \ No newline at end of file diff --git a/scripts/eval/prune.py b/scripts/eval/prune.py new file mode 100644 index 0000000..1b8f919 --- /dev/null +++ b/scripts/eval/prune.py @@ -0,0 +1,91 @@ +import argparse +import csv +import os +import shutil + + +# Calculate percentiles without using numpy +def calculate_percentiles(data, percentiles): + data_sorted = sorted(data) + n = len(data_sorted) + results = [] + for percentile in percentiles: + k = (n - 1) * percentile / 100 + f = int(k) + c = k - f + if f + 1 < n: + result = data_sorted[f] + c * (data_sorted[f + 1] - data_sorted[f]) + else: + result = data_sorted[f] + results.append(result) + return results + 
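+# Worked example of the interpolation above: for data = [1, 2, 3, 4] and the 50th percentile, k = (4 - 1) * 0.5 = 1.5, so f = 1 and c = 0.5, giving 2 + 0.5 * (3 - 2) = 2.5 (matching numpy.percentile's default linear interpolation).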
+ +def main(mid_dir, output_dir, score_file, max_score, dry): + if os.path.isdir(output_dir) is False: + os.makedirs(output_dir) + + scores = {} + with open(score_file, "r") as f: + reader = csv.DictReader(f) + failures = 0 + for row in reader: + try: + if 0.0 < float(row["avg_score"]) < 1.0: + scores[row["mid_path"]] = float(row["avg_score"]) + except Exception as e: + failures += 1 + + print(f"{failures} failures") + print(f"found {len(scores.items())} mid-score pairs") + + print("top 50 by score:") + for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)[ + :50 + ]: + print(f"{v}: {k}") + print("bottom 50 by score:") + for k, v in sorted(scores.items(), key=lambda item: item[1])[:50]: + print(f"{v}: {k}") + + # Define the percentiles to calculate + percentiles = [10, 20, 30, 40, 50, 60, 70, 80, 90] + floats = [v for k, v in scores.items()] + + # Calculate the percentiles + print(f"percentiles: {calculate_percentiles(floats, percentiles)}") + + cnt = 0 + for mid_path, score in scores.items(): + mid_rel_path = os.path.relpath(mid_path, mid_dir) + output_path = os.path.join(output_dir, mid_rel_path) + if not os.path.exists(os.path.dirname(output_path)): + os.makedirs(os.path.dirname(output_path)) + + if score < max_score: + if dry is not True: + shutil.copyfile(mid_path, output_path) + else: + cnt += 1 + + print(f"excluded {cnt}/{len(scores.items())} files") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-mid_dir", help="dir containing .mid files", default=None + ) + parser.add_argument( + "-output_dir", help="dir to copy retained .mid files into", default=None + ) + parser.add_argument("-score_file", help="path to the csv of dtw scores", default=None) + parser.add_argument( + "-max_score", type=float, help="copy only files with avg_score below this value", default=None + ) + parser.add_argument("-dry", action="store_true", help="dry run (don't copy files)") + args = parser.parse_args() + + main( + args.mid_dir, args.output_dir, args.score_file, args.max_score, args.dry + ) diff --git a/scripts/eval/prune.sh b/scripts/eval/prune.sh new file mode 100644 index 0000000..9b3f89a --- /dev/null +++ b/scripts/eval/prune.sh @@ -0,0 +1,6 @@ +python /home/loubb/work/aria-amt/scripts/eval/prune.py \ + -mid_dir /mnt/ssd1/amt/transcribed_data/0/pijama-mid \ + -output_dir /mnt/ssd1/amt/transcribed_data/0/pijama-mid-pruned \ + -score_file /mnt/ssd1/amt/transcribed_data/0/pijama-mid.csv \ + -max_score 0.42 \ + # -dry \ No newline at end of file diff --git a/scripts/eval/req-eval.txt b/scripts/eval/req-eval.txt new file mode 100644 index 0000000..d3ed177 --- /dev/null +++ b/scripts/eval/req-eval.txt @@ -0,0 +1,4 @@ +djitw @ git+https://github.com/alex2awesome/djitw.git +librosa +pretty_midi +pyfluidsynth \ No newline at end of file diff --git a/scripts/eval/split.py b/scripts/eval/split.py new file mode 100644 index 0000000..c912cbc --- /dev/null +++ b/scripts/eval/split.py @@ -0,0 +1,57 @@ +import csv +import random +import glob +import argparse +import os + + +def get_matched_paths(audio_dir: str, mid_dir: str): + # Assume that the files have the same path relative to their directory + res = [] + mid_paths = glob.glob(os.path.join(mid_dir, "**/*.mid"), recursive=True) + print(f"found {len(mid_paths)} mid files") + + audio_dir_last = os.path.basename(audio_dir) + mid_dir_last = os.path.basename(mid_dir) + + for mid_path in mid_paths: + input_rel_path = os.path.relpath(mid_path, mid_dir) + + mp3_rel_path = os.path.splitext(input_rel_path)[0] + ".mp3" + mp3_path = os.path.join(audio_dir, 
mp3_rel_path) + + # Check if the corresponding .mp3 file exists + if os.path.isfile(mp3_path): + matched_mid_path = os.path.join(mid_dir_last, input_rel_path) + matched_mp3_path = os.path.join(audio_dir_last, mp3_rel_path) + + res.append((matched_mp3_path, matched_mid_path)) + + print(f"found {len(res)} matched mp3-midi pairs") + assert len(mid_paths) == len(res), "audio files missing" + + return res + + +def create_csv(matched_paths, csv_path): + # Use a context manager so the csv is flushed and closed on exit + with open(csv_path, "w") as split_csv: + csv_writer = csv.writer(split_csv) + csv_writer.writerow(["mid_path", "audio_path", "split"]) + + for audio_path, mid_path in matched_paths: + if random.random() < 0.1: + csv_writer.writerow([mid_path, audio_path, "test"]) + else: + csv_writer.writerow([mid_path, audio_path, "train"]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-mid_dir", type=str) + parser.add_argument("-audio_dir", type=str) + parser.add_argument("-csv_path", type=str) + args = parser.parse_args() + + matched_paths = get_matched_paths(args.audio_dir, args.mid_dir) + + create_csv(matched_paths, args.csv_path) diff --git a/scripts/eval/split.sh b/scripts/eval/split.sh new file mode 100644 index 0000000..05faece --- /dev/null +++ b/scripts/eval/split.sh @@ -0,0 +1,4 @@ +python /home/loubb/work/aria-amt/scripts/eval/split.py \ + -mid_dir /mnt/ssd1/amt/transcribed_data/0/aria-mid-pruned \ + -audio_dir /mnt/ssd1/data/mp3/raw/aria-mp3 \ + -csv_path aria-pruned-split.csv diff --git a/tests/test_data.py b/tests/test_data.py index f69117e..8e1010c 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -18,7 +18,7 @@ if os.path.isdir("tests/test_results") is False: os.mkdir("tests/test_results") -MAESTRO_PATH = "/weka/proj-aria/aria-amt/data/train.jsonl" +MAESTRO_PATH = "/mnt/ssd1/amt/training_data/train.txt" def plot_spec(mel: torch.Tensor, name: str | int): @@ -57,7 +57,7 @@ def test_build(self): dataset = AmtDataset("tests/test_results/dataset.jsonl") tokenizer = AmtTokenizer() - for idx, (wav, src, tgt) in enumerate(dataset): + for idx, (wav, src, tgt, _idx) in enumerate(dataset): print(wav.shape, src.shape, tgt.shape) src_decoded = tokenizer.decode(src) tgt_decoded = tokenizer.decode(tgt) @@ -76,11 +76,11 @@ def test_maestro(self): audio_transform = AudioTransform() dataset = AmtDataset(load_path=MAESTRO_PATH) print(f"Dataset length: {len(dataset)}") - for idx, (wav, src, tgt) in enumerate(dataset): + for idx, (wav, src, tgt, __idx) in enumerate(dataset): src_dec, tgt_dec = tokenizer.decode(src), tokenizer.decode(tgt) - if (idx + 1) % 100 == 0: - break - if idx % 7 == 0: + + if idx % 7 == 0 and idx < 100: + print(idx) src_mid_dict = tokenizer._detokenize_midi_dict( src_dec, len_ms=30000, ) @@ -91,6 +91,11 @@ def test_maestro(self): torchaudio.save( f"tests/test_results/wav_{idx}.wav", wav.unsqueeze(0), 16000 ) + torchaudio.save( + f"tests/test_results/wav_aug_{idx}.wav", + audio_transform.aug_wav(wav.unsqueeze(0)), + 16000, + ) plot_spec( audio_transform(wav.unsqueeze(0)).squeeze(0), f"mel_{idx}" )
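Example invocation of the new synth subcommand (hypothetical paths; flags as defined in _add_synth_args): python amt/run.py synth /path/to/mid_dir /path/to/split.csv -train train.txt -test test.txt -mp 4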