From 50f0b60092cd1bbc77cf317420bec0e44fb22376 Mon Sep 17 00:00:00 2001 From: Louis Date: Fri, 22 Mar 2024 16:48:46 +0000 Subject: [PATCH] Update inference (#22) * synth data - untested * adjust audio striding * add detune aug * add aug val to train * add b64 encoding to dataset * update dataset * fix * training changes * inference changes * format * format --- amt/audio.py | 66 ++-- amt/data.py | 292 ++++++++------- amt/infer.py | 475 ------------------------ amt/inference/__init__.py | 0 amt/inference/model.py | 435 ++++++++++++++++++++++ amt/inference/quantize.py | 153 ++++++++ amt/inference/transcribe.py | 716 ++++++++++++++++++++++++++++++++++++ amt/model.py | 84 +---- amt/run.py | 53 +-- amt/tokenizer.py | 24 +- amt/train.py | 86 ++--- config/config.json | 2 +- config/models/small.json | 11 - config/models/test.json | 11 - tests/test_data.py | 97 ++++- 15 files changed, 1690 insertions(+), 815 deletions(-) delete mode 100644 amt/infer.py create mode 100644 amt/inference/__init__.py create mode 100644 amt/inference/model.py create mode 100644 amt/inference/quantize.py create mode 100644 amt/inference/transcribe.py delete mode 100644 config/models/small.json delete mode 100644 config/models/test.json diff --git a/amt/audio.py b/amt/audio.py index a0d5d4c..6c37f3f 100644 --- a/amt/audio.py +++ b/amt/audio.py @@ -194,10 +194,11 @@ def __init__( noise_ratio: float = 0.95, reverb_ratio: float = 0.95, applause_ratio: float = 0.01, - bandpass_ratio: float = 0.1, + bandpass_ratio: float = 0.15, distort_ratio: float = 0.15, reduce_ratio: float = 0.01, - codecs_ratio: float = 0.01, + detune_ratio: float = 0.1, + detune_max_shift: float = 0.15, spec_aug_ratio: float = 0.5, ): super().__init__() @@ -219,8 +220,9 @@ def __init__( self.bandpass_ratio = bandpass_ratio self.distort_ratio = distort_ratio self.reduce_ratio = reduce_ratio + self.detune_ratio = detune_ratio + self.detune_max_shift = detune_max_shift self.spec_aug_ratio = spec_aug_ratio - self.codecs_ratio = codecs_ratio self.reduction_resample_rate = 6000 # Hardcoded? # Audio aug @@ -268,6 +270,19 @@ def __init__( ), ) + def get_params(self): + return { + "noise_ratio": self.noise_ratio, + "reverb_ratio": self.reverb_ratio, + "applause_ratio": self.applause_ratio, + "bandpass_ratio": self.bandpass_ratio, + "distort_ratio": self.distort_ratio, + "reduce_ratio": self.reduce_ratio, + "detune_ratio": self.detune_ratio, + "detune_max_shift": self.detune_max_shift, + "spec_aug_ratio": self.spec_aug_ratio, + } + def _get_paths(self, dir_path): os.makedirs(dir_path, exist_ok=True) @@ -399,21 +414,7 @@ def distortion_aug_cpu(self, wav: torch.Tensor): return wav - def apply_codec(self, wav: torch.tensor): - """ - Apply different audio codecs to the audio. 
- """ - format_encoder_pairs = [ - ("wav", "pcm_mulaw"), - ("g722", None), - ("ogg", "vorbis") - ] - for format, encoder in format_encoder_pairs: - encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder) - if random.random() < self.codecs_ratio: - wav = encoder.apply(wav, self.sample_rate) - - def shift_spec(self, specs: torch.Tensor, shift: int): + def shift_spec(self, specs: torch.Tensor, shift: int | float): if shift == 0: return specs @@ -438,9 +439,21 @@ def shift_spec(self, specs: torch.Tensor, shift: int): return shifted_specs + def detune_spec(self, specs: torch.Tensor): + if random.random() < self.detune_ratio: + detune_shift = random.uniform( + -self.detune_max_shift, self.detune_max_shift + ) + detuned_specs = self.shift_spec(specs, shift=detune_shift) + + return (specs + detuned_specs) / 2 + else: + return specs + def aug_wav(self, wav: torch.Tensor): # This function doesn't apply distortion. If distortion is desired it - # should be run before hand on the cpu with distortion_aug_cpu. + # should be run beforehand on the cpu with distortion_aug_cpu. Note + # also that detuning is done to the spectrogram in log_mel, not the wav. # Noise if random.random() < self.noise_ratio: @@ -468,10 +481,17 @@ def norm_mel(self, mel_spec: torch.Tensor): return log_spec - def log_mel(self, wav: torch.Tensor, shift: int | None = None): + def log_mel( + self, wav: torch.Tensor, shift: int | None = None, detune: bool = False + ): spec = self.spec_transform(wav)[..., :-1] - if shift and shift != 0: + + if shift is not None and shift != 0: spec = self.shift_spec(spec, shift) + elif detune is True: + # Don't detune and spec shift at the same time + spec = self.detune_spec(spec) + mel_spec = self.mel_transform(spec) # Norm @@ -483,8 +503,8 @@ def forward(self, wav: torch.Tensor, shift: int = 0): # Noise, and reverb wav = self.aug_wav(wav) - # Spec & pitch shift - log_mel = self.log_mel(wav, shift) + # Spec, detuning & pitch shift + log_mel = self.log_mel(wav, shift, detune=True) # Spec aug if random.random() < self.spec_aug_ratio: diff --git a/amt/data.py b/amt/data.py index 6a4810e..377b760 100644 --- a/amt/data.py +++ b/amt/data.py @@ -1,91 +1,22 @@ import mmap import os +import io +import base64 import shutil import orjson import torch import torchaudio -from multiprocessing import Pool +from multiprocessing import Pool, Queue, Process +from typing import Callable from aria.data.midi import MidiDict from amt.tokenizer import AmtTokenizer from amt.config import load_config from amt.audio import pad_or_trim -from midi2audio import FluidSynth -import random - - -class SyntheticMidiHandler: - def __init__(self, soundfont_path: str, soundfont_prob_dict: dict = None, num_wavs_per_midi: int = 1): - """ - File to load MIDI files and convert them to audio. - - Parameters - ---------- - soundfont_path : str - Path to the directory containing soundfont files. - soundfont_prob_dict : dict, optional - Dictionary containing the probability of using a soundfont file. - The keys are the soundfont file names and the values are the - probability of using the soundfont file. If none is given, then - a uniform distribution is used. - num_wavs_per_midi : int, optional - Number of audio files to generate per MIDI file. 
- """ - - self.soundfont_path = soundfont_path - self.soundfont_prob_dict = soundfont_prob_dict - self.num_wavs_per_midi = num_wavs_per_midi - - self.fs_objs = self._load_soundfonts() - self.soundfont_cumul_prob_dict = self._get_cumulative_prob_dict() - - def _load_soundfonts(self): - """Loads the soundfonts into fluidsynth objects.""" - fs_files = os.listdir(self.soundfont_path) - fs_objs = {} - for fs_file in fs_files: - fs_objs[fs_file] = FluidSynth(fs_file) - return fs_objs - - def _get_cumulative_prob_dict(self): - """Returns a dictionary with the cumulative probabilities of the soundfonts. - Used for sampling the soundfonts. - """ - if self.soundfont_prob_dict is None: - self.soundfont_prob_dict = {k: 1 / len(self.fs_objs) for k in self.fs_objs.keys()} - self.soundfont_prob_dict = {k: v / sum(self.soundfont_prob_dict.values()) - for k, v in self.soundfont_prob_dict.items()} - cumul_prob_dict = {} - cumul_prob = 0 - for k, v in self.soundfont_prob_dict.items(): - cumul_prob_dict[k] = (cumul_prob, cumul_prob + v) - cumul_prob += v - return cumul_prob_dict - - def _sample_soundfont(self): - """Samples a soundfont file.""" - rand_num = random.random() - for k, (v_s, v_e) in self.soundfont_cumul_prob_dict.items(): - if (rand_num >= v_s) and (rand_num < v_e): - return self.fs_objs[k] - - def get_wav(self, midi_path: str, save_path: str): - """ - Converts a MIDI file to audio. - - Parameters - ---------- - midi_path : str - Path to the MIDI file. - save_path : str - Path to save the audio file. - """ - for i in range(self.num_wavs_per_midi): - soundfont = self._sample_soundfont() - if self.num_wavs_per_midi > 1: - save_path = save_path[:-4] + f"_{i}.wav" - soundfont.midi_to_audio(midi_path, save_path) + + +# Occasionally the worker util goes to 0 for some reason, debug this def get_wav_mid_segments( @@ -133,7 +64,7 @@ def get_wav_mid_segments( res = [] for idx in range( 0, - total_samples - (num_samples - (num_samples // stride_factor)), + total_samples - (num_samples - num_samples // stride_factor), num_samples // stride_factor, ): audio_feature = pad_or_trim(wav[idx:], length=num_samples) @@ -142,6 +73,7 @@ def get_wav_mid_segments( midi_dict=midi_dict, start_ms=idx // samples_per_ms, end_ms=(idx + num_samples) / samples_per_ms, + max_pedal_len_ms=10000, ) else: mid_feature = [] @@ -154,29 +86,97 @@ def get_wav_mid_segments( return res -def write_features(args): - audio_path, mid_path, save_path = args +def write_features(audio_path: str, mid_path: str, save_path: str): features = get_wav_mid_segments( audio_path=audio_path, mid_path=mid_path, return_json=False, ) - dirname, basename = os.path.split(save_path) - proc_save_path = os.path.join(dirname, str(os.getpid()) + basename) - with open(proc_save_path, mode="ab") as file: + # Father forgive me for I have sinned + with open(save_path, mode="a") as file: for wav, seq in features: - file.write( - orjson.dumps( - wav.numpy(), - option=orjson.OPT_SERIALIZE_NUMPY, - ) - ) - file.write(b"\n") - file.write(orjson.dumps(seq)) - file.write(b"\n") + # Encode wav using b64 to avoid newlines + wav_buffer = io.BytesIO() + torch.save(wav, wav_buffer) + wav_buffer.seek(0) + wav_bytes = wav_buffer.read() + wav_str = base64.b64encode(wav_bytes).decode("utf-8") + file.write(wav_str) + file.write("\n") + + seq_bytes = orjson.dumps(seq) + seq_str = base64.b64encode(seq_bytes).decode("utf-8") + file.write(seq_str) + file.write("\n") + + +def get_synth_audio(cli_cmd_fn: str, mid_path: str, wav_path: str): + _cmd = cli_cmd_fn(mid_path, wav_path) + 
os.system(_cmd) + + +def write_synth_features(cli_cmd_fn: Callable, mid_path: str, save_path: str): + audio_path_temp = f"{os.getpid()}_temp.wav" + + try: + get_synth_audio( + cli_cmd=cli_cmd_fn, mid_path=mid_path, wav_path=audio_path_temp + ) + except: + if os.path.isfile(audio_path_temp): + os.remove(audio_path_temp) + return + else: + features = get_wav_mid_segments( + audio_path=audio_path_temp, + mid_path=mid_path, + return_json=False, + ) + os.remove(audio_path_temp) + + with open(save_path, mode="a") as file: + for wav, seq in features: + wav_buffer = io.BytesIO() + torch.save(wav, wav_buffer) + wav_buffer.seek(0) + wav_bytes = wav_buffer.read() + wav_str = base64.b64encode(wav_bytes).decode("utf-8") + file.write(wav_str) + file.write("\n") + + seq_bytes = orjson.dumps(seq) + seq_str = base64.b64encode(seq_bytes).decode("utf-8") + file.write(seq_str) + file.write("\n") + - return proc_save_path +def build_worker_fn(load_path_queue, save_path_queue, _save_path: str): + dirname, basename = os.path.split(_save_path) + worker_save_path = os.path.join(dirname, str(os.getpid()) + basename) + + while not load_path_queue.empty(): + audio_path, mid_path = load_path_queue.get() + write_features(audio_path, mid_path, worker_save_path) + + print("Worker", os.getpid(), "finished") + save_path_queue.put(worker_save_path) + + +def build_synth_worker_fn( + cli_cmd: Callable, + load_path_queue, + save_path_queue, + _save_path: str, +): + dirname, basename = os.path.split(_save_path) + worker_save_path = os.path.join(dirname, str(os.getpid()) + basename) + + while not load_path_queue.empty(): + mid_path = load_path_queue.get() + write_synth_features(cli_cmd, mid_path, worker_save_path) + + save_path_queue.put(worker_save_path) class AmtDataset(torch.utils.data.Dataset): @@ -222,8 +222,10 @@ def _format(tok): self.file_mmap.seek(self.index[idx]) # Load data from line - wav = torch.tensor(orjson.loads(self.file_mmap.readline())) - _seq = orjson.loads(self.file_mmap.readline()) + wav = torch.load( + io.BytesIO(base64.b64decode(self.file_mmap.readline())) + ) + _seq = orjson.loads(base64.b64decode(self.file_mmap.readline())) _seq = [_format(tok) for tok in _seq] # Format seq _seq = self.mixup_fn(_seq) # Data augmentation @@ -267,11 +269,31 @@ def _get_index_path(load_path: str): f"{load_path.rsplit('.', 1)[0]}_index.{load_path.rsplit('.', 1)[1]}" ) + def _build_index(self): + self.file_mmap.seek(0) + index = [] + pos = 0 + while True: + pos_buff = pos + + pos = self.file_mmap.find(b"\n", pos) + if pos == -1: + break + pos = self.file_mmap.find(b"\n", pos + 1) + if pos == -1: + break + + index.append(pos_buff) + pos += 1 + + return index + @classmethod def build( cls, - matched_load_paths: list[tuple[str, str]], + load_paths: list, save_path: str, + cli_cmd_fn: Callable | None = None, num_processes: int = 1, ): assert os.path.isfile(save_path) is False, f"{save_path} already exists" @@ -281,18 +303,55 @@ def build( print(f"Removing existing index file at {index_path}") os.remove(AmtDataset._get_index_path(load_path=save_path)) - num_paths = len(matched_load_paths) - with Pool(processes=num_processes) as pool: - sharded_save_paths = [] - res = pool.imap_unordered( - write_features, - ((ap, mp, save_path) for ap, mp in matched_load_paths), - ) - for idx, proc_save_path in enumerate(res): - if idx % 10 == 0 and idx != 0: - print(f"Finished {idx}/{num_paths}") - if proc_save_path not in sharded_save_paths: - sharded_save_paths.append(proc_save_path) + save_path_queue = Queue() + load_path_queue = Queue() + 
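+        # Each worker process drains load_path_queue, writes a per-PID shard
+        # file next to save_path, and reports its shard path on
+        # save_path_queue; the shards are concatenated into save_path below.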
for entry in load_paths: + load_path_queue.put(entry) + + if cli_cmd_fn is None: + # Build matched audio-midi dataset + assert len(load_paths[0]) == 2, "Invalid load paths" + print("Building matched audio-midi dataset") + worker_processes = [ + Process( + target=build_worker_fn, + args=( + load_path_queue, + save_path_queue, + save_path, + ), + ) + for _ in range(num_processes) + ] + else: + # Build synthetic dataset + assert len(load_paths[0]) == 1, "Invalid load paths" + print("Building synthetic dataset") + worker_processes = [ + Process( + target=build_synth_worker_fn, + args=( + cli_cmd_fn, + load_path_queue, + save_path_queue, + save_path, + ), + ) + for _ in range(num_processes) + ] + + for p in worker_processes: + p.start() + for p in worker_processes: + p.join() + + sharded_save_paths = [] + while not save_path_queue.empty(): + try: + _path = save_path_queue.get_nowait() + sharded_save_paths.append(_path) + except Queue.Empty: + break # This is bad, however cat is fast if shutil.which("cat") is None: @@ -311,22 +370,3 @@ def build( # Create index by loading object AmtDataset(load_path=save_path) - - def _build_index(self): - self.file_mmap.seek(0) - index = [] - pos = 0 - while True: - pos_buff = pos - - pos = self.file_mmap.find(b"\n", pos) - if pos == -1: - break - pos = self.file_mmap.find(b"\n", pos + 1) - if pos == -1: - break - - index.append(pos_buff) - pos += 1 - - return index diff --git a/amt/infer.py b/amt/infer.py deleted file mode 100644 index 289d499..0000000 --- a/amt/infer.py +++ /dev/null @@ -1,475 +0,0 @@ -import os -import time -import random -import logging -import torch -import torch.multiprocessing as multiprocessing - -from torch.multiprocessing import Queue -from tqdm import tqdm -from functools import wraps -from torch.cuda import is_bf16_supported - -from amt.model import AmtEncoderDecoder -from amt.tokenizer import AmtTokenizer -from amt.audio import AudioTransform, pad_or_trim -from amt.data import get_wav_mid_segments - - -MAX_SEQ_LEN = 4096 -LEN_MS = 30000 -STRIDE_FACTOR = 3 -CHUNK_LEN_MS = LEN_MS // STRIDE_FACTOR -BEAM = 5 -ONSET_TOLERANCE = 61 -VEL_TOLERANCE = 100 - - -def _setup_logger(): - logger = logging.getLogger(__name__) - for h in logger.handlers[:]: - logger.removeHandler(h) - - logger.propagate = False - logger.setLevel(logging.DEBUG) - formatter = logging.Formatter( - "[%(asctime)s] %(process)d: [%(levelname)s] %(message)s", - ) - - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - ch.setFormatter(formatter) - logger.addHandler(ch) - - return logging.getLogger(__name__) - - -def calculate_vel( - logits: torch.Tensor, - init_vel: int, - tokenizer: AmtTokenizer = AmtTokenizer(), -): - probs, idxs = torch.topk(torch.softmax(logits, dim=-1), BEAM) - vels = [tokenizer.id_to_tok[idx.item()] for idx in idxs] - - # Get rid of outliers - for idx in range(BEAM): - vel = vels[idx] - if type(vel) is not tuple: - vels[idx] = 0 - probs[idx] = 0.0 - elif vel[0] != "vel": - vels[idx] = 0 - probs[idx] = 0.0 - elif (vel[1] < init_vel - VEL_TOLERANCE / 2) or ( - vel[1] > init_vel + VEL_TOLERANCE / 2 - ): - vels[idx] = vels[idx][1] - probs[idx] = 0.0 - else: - vels[idx] = vels[idx][1] - - vels = torch.tensor(vels).to(probs.device) - new_vel = torch.sum(vels * probs) / torch.sum(probs) - new_vel = round(new_vel.item() / 5) * 5 - - return tokenizer.tok_to_id[("vel", new_vel)] - - -def calculate_onset( - logits: torch.Tensor, - init_onset: int, - tokenizer: AmtTokenizer = AmtTokenizer(), -): - probs, idxs = torch.topk(torch.softmax(logits, dim=-1), 
BEAM) - onsets = [tokenizer.id_to_tok[idx.item()] for idx in idxs] - - # Get rid of outliers - for idx in range(BEAM): - onset = onsets[idx] - if type(onset) is not tuple: - onsets[idx] = 0 - probs[idx] = 0.0 - elif onset[0] != "onset": - onsets[idx] = 0 - probs[idx] = 0.0 - elif (onset[1] < init_onset - ONSET_TOLERANCE / 2) or ( - onset[1] > init_onset + ONSET_TOLERANCE / 2 - ): - onsets[idx] = onsets[idx][1] - probs[idx] = 0.0 - else: - onsets[idx] = onsets[idx][1] - - onsets = torch.tensor(onsets).to(probs.device) - new_onset = torch.sum(onsets * probs) / torch.sum(probs) - new_onset = round(new_onset.item() / 10) * 10 - - return tokenizer.tok_to_id[("onset", new_onset)] - - -def optional_bf16_autocast(func): - @wraps(func) - def wrapper(*args, **kwargs): - # Assuming 'check_bfloat16_support()' returns True if bfloat16 is supported - if is_bf16_supported(): - with torch.autocast("cuda", dtype=torch.bfloat16): - return func(*args, **kwargs) - else: - # Call the function with float16 if bfloat16 is not supported - with torch.autocast("cuda", dtype=torch.float32): - return func(*args, **kwargs) - - return wrapper - - -@optional_bf16_autocast -def process_segments( - tasks: list, - model: AmtEncoderDecoder, - audio_transform: AudioTransform, - tokenizer: AmtTokenizer, -): - logger = logging.getLogger(__name__) - audio_segs = torch.stack( - [audio_seg for (audio_seg, prefix), _ in tasks] - ).cuda() - log_mels = audio_transform.log_mel(audio_segs) - audio_features = model.embed_audio(mel=log_mels) - - raw_prefixes = [prefix for (audio_seg, prefix), _ in tasks] - prefix_lens = [len(prefix) for prefix in raw_prefixes] - min_prefix_len = min(prefix_lens) - prefixes = [ - tokenizer.trunc_seq(prefix, MAX_SEQ_LEN) for prefix in raw_prefixes - ] - seq = torch.stack([tokenizer.encode(prefix) for prefix in prefixes]).cuda() - end_idxs = [MAX_SEQ_LEN for _ in prefixes] - - kv_cache = model.get_empty_cache() - - # for idx in ( - # pbar := tqdm( - # range(min_prefix_len, MAX_SEQ_LEN - 1), - # total=MAX_SEQ_LEN - (min_prefix_len + 1), - # leave=False, - # ) - # ): - for idx in range(min_prefix_len, MAX_SEQ_LEN - 1): - if idx == min_prefix_len: - logits = model.decoder( - xa=audio_features, - x=seq[:, :idx], - kv_cache=kv_cache, - ) - else: - logits = model.decoder( - xa=audio_features, - x=seq[:, idx - 1 : idx], - kv_cache=kv_cache, - ) - - next_tok_ids = torch.argmax(logits[:, -1], dim=-1) - - for batch_idx in range(logits.shape[0]): - if idx > end_idxs[batch_idx]: - # End already seen, add pad token - tok_id = tokenizer.pad_id - elif idx >= prefix_lens[batch_idx]: - # New token required, recalculated if needed - tok_id = next_tok_ids[batch_idx].item() - tok = tokenizer.id_to_tok[tok_id] - if type(tok) is tuple and tok[0] == "onset": - # If onset token, recalculate - tok_id = calculate_onset(logits[batch_idx, -1], tok[1]) - elif type(tok) is tuple and tok[0] == "vel": - # If velocity token, recalculate - tok_id = calculate_vel(logits[batch_idx, -1], tok[1]) - - else: - # Still in prefix tokens, do nothing - tok_id = tokenizer.tok_to_id[prefixes[batch_idx][idx]] - - seq[batch_idx, idx] = tok_id - tok = tokenizer.id_to_tok[tok_id] - if tok == tokenizer.eos_tok: - end_idxs[batch_idx] = idx - elif ( - type(tok) is tuple - and tok[0] == "onset" - and tok[1] >= LEN_MS - CHUNK_LEN_MS - ): - end_idxs[batch_idx] = idx - 2 - - if all(_idx <= idx for _idx in end_idxs): - break - - if not all(_idx <= idx for _idx in end_idxs): - logger.warning("Context length overflow when transcribing segment") - - results = [ - 
tokenizer.decode(seq[_idx, : end_idxs[_idx] + 1]) - for _idx in range(seq.shape[0]) - ] - - return results - - -def gpu_manager( - gpu_task_queue: Queue, - result_queue: Queue, - model: AmtEncoderDecoder, - batch_size: int, -): - model.compile() - logger = _setup_logger() - audio_transform = AudioTransform().cuda() - tokenizer = AmtTokenizer(return_tensors=True) - - wait_for_batch = True - batch = [] - while True: - try: - task, pid = gpu_task_queue.get(timeout=5) - except: - logger.info(f"GPU task timeout") - if len(batch) == 0: - logger.info(f"Finished GPU tasks") - return - else: - wait_for_batch = False - else: - batch.append((task, pid)) - - if len(batch) == batch_size or ( - len(batch) > 0 and wait_for_batch is False - ): - # Process batch on GPU - results = process_segments( - tasks=[task for task in batch], - model=model, - audio_transform=audio_transform, - tokenizer=tokenizer, - ) - for result, (_, pid) in zip(results, batch): - result_queue.put({"result": result, "pid": pid}) - batch.clear() - - -def _shift_onset(seq: list, shift_ms: int): - res = [] - for tok in seq: - if type(tok) is tuple and tok[0] == "onset": - res.append(("onset", tok[1] + shift_ms)) - else: - res.append(tok) - - return res - - -def _truncate_seq( - seq: list, - start_ms: int, - end_ms: int, - tokenizer: AmtTokenizer = AmtTokenizer(), -): - if start_ms == end_ms: - _mid_dict, unclosed_notes = tokenizer._detokenize_midi_dict( - seq, start_ms, return_unclosed_notes=True - ) - random.shuffle(unclosed_notes) - return [("prev", p) for p in unclosed_notes] + [tokenizer.bos_tok] - else: - _mid_dict = tokenizer._detokenize_midi_dict(seq, LEN_MS) - try: - res = tokenizer._tokenize_midi_dict(_mid_dict, start_ms, end_ms - 1) - except Exception: - print("Truncate failed") - return [""] - else: - if res[-1] == tokenizer.eos_tok: - res.pop() - return res - - -def process_file( - file_path, - gpu_task_queue: Queue, - result_queue: Queue, - tokenizer: AmtTokenizer = AmtTokenizer(), -): - logger = logging.getLogger(__name__) - pid = multiprocessing.current_process().pid - - logger.info(f"Getting wav segments") - audio_segments = [ - f - for f, _ in get_wav_mid_segments( - audio_path=file_path, stride_factor=STRIDE_FACTOR - ) - ] - - res = [] - seq = [tokenizer.bos_tok] - concat_seq = [tokenizer.bos_tok] - for idx, audio_seg in enumerate(audio_segments): - init_idx = len(seq) - - # Add to gpu queue and wait for results - gpu_task_queue.put(((audio_seg, seq), pid)) - while True: - gpu_result = result_queue.get() - if gpu_result["pid"] == pid: - seq = gpu_result["result"] - break - else: - result_queue.put(gpu_result) - - concat_seq += _shift_onset( - seq[init_idx:], - idx * CHUNK_LEN_MS, - ) - - if idx == len(audio_segments) - 1: - res.append(concat_seq) - elif concat_seq[-1] == tokenizer.eos_tok: - res.append(concat_seq) - seq = [tokenizer.bos_tok] - concat_seq = [tokenizer.bos_tok] - logger.info(f"Finished segment - eos_tok seen") - else: - seq = _truncate_seq(seq, CHUNK_LEN_MS, LEN_MS - CHUNK_LEN_MS) - if len(seq) == 1: - res.append(concat_seq) - seq = [tokenizer.bos_tok] - concat_seq = [tokenizer.bos_tok] - logger.info(f"Exiting early - silence") - - return res - - -def worker( - file_queue: Queue, - gpu_task_queue: Queue, - result_queue: Queue, - save_dir: str, - input_dir: str | None = None, -): - def _save_seq(_seq: list, _save_path: str): - if os.path.exists(_save_path): - logger.info(f"Already exists {_save_path} - overwriting") - - for tok in _seq[::-1]: - if type(tok) is tuple and tok[0] == "onset": - last_onset = 
tok[1] - break - - try: - mid_dict = tokenizer._detokenize_midi_dict( - tokenized_seq=_seq, len_ms=last_onset - ) - mid = mid_dict.to_midi() - mid.save(_save_path) - except Exception as e: - logger.error(f"Failed to save {_save_path}") - - def _get_save_path(_file_path: str, _idx: int | str = ""): - if input_dir is None: - save_path = os.path.join( - save_dir, - os.path.splitext(os.path.basename(file_path))[0] - + f"{_idx}.mid", - ) - else: - input_rel_path = os.path.relpath(_file_path, input_dir) - save_path = os.path.join( - save_dir, os.path.splitext(input_rel_path)[0] + f"{_idx}.mid" - ) - if not os.path.isdir(os.path.dirname(save_path)): - os.makedirs(os.path.dirname(save_path), exist_ok=True) - - return save_path - - logger = _setup_logger() - tokenizer = AmtTokenizer() - files_processed = 0 - while not file_queue.empty(): - file_path = file_queue.get() - - try: - seqs = process_file(file_path, gpu_task_queue, result_queue) - except Exception as e: - logger.error(f"Failed to process {file_path}") - continue - - logger.info(f"Transcribed into {len(seqs)} segment(s)") - for _idx, seq in enumerate(seqs): - _save_seq(seq, _get_save_path(file_path, _idx)) - - files_processed += 1 - logger.info(f"Finished file {files_processed} - {file_path}") - logger.info(f"{file_queue.qsize()} file(s) remaining in queue") - - -def batch_transcribe( - file_paths, # Queue | list, - model: AmtEncoderDecoder, - save_dir: str, - batch_size: int = 16, - gpu_id: int | None = None, - input_dir: str | None = None, -): - if gpu_id is not None: - os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - - model.cuda() - model.eval() - if isinstance(file_paths, list): - file_queue = Queue() - for file_path in file_paths: - file_queue.put(file_path) - else: - file_queue = file_paths - - gpu_task_queue = Queue() - result_queue = Queue() - - worker_processes = [ - multiprocessing.Process( - target=worker, - args=( - file_queue, - gpu_task_queue, - result_queue, - save_dir, - input_dir, - ), - ) - for _ in range(batch_size + 1) - ] - for p in worker_processes: - p.start() - - time.sleep(10) - gpu_manager_process = multiprocessing.Process( - target=gpu_manager, - args=(gpu_task_queue, result_queue, model, batch_size), - ) - gpu_manager_process.start() - - for p in worker_processes: - p.join() - - gpu_manager_process.join() - - -def sample_top_p(probs, p): - probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) - probs_sum = torch.cumsum(probs_sort, dim=-1) - mask = probs_sum - probs_sort > p - probs_sort[mask] = 0.0 - probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) - next_token = torch.multinomial(probs_sort, num_samples=1) - next_token = torch.gather(probs_idx, -1, next_token) - - return next_token diff --git a/amt/inference/__init__.py b/amt/inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/amt/inference/model.py b/amt/inference/model.py new file mode 100644 index 0000000..c302614 --- /dev/null +++ b/amt/inference/model.py @@ -0,0 +1,435 @@ +"""Contains code modified from https://github.com/openai/whisper""" + +import math +import torch +import torch.nn.functional as F + +from torch import Tensor, nn +from dataclasses import dataclass +from typing import Dict, Iterable, Optional + + +@dataclass +class ModelConfig: + n_mels: int + n_audio_ctx: int + n_audio_state: int + n_audio_head: int + n_audio_layer: int + n_text_ctx: int + n_text_state: int + n_text_head: int + n_text_layer: int + n_vocab: Optional[int] = None + + def set_vocab_size(self, vocab_size: int): + self.n_vocab = 
vocab_size + + +class KVCache(nn.Module): + def __init__( + self, + max_batch_size: int, + max_seq_length: int, + n_heads: int, + head_dim: int, + dtype=torch.bfloat16, + ): + super().__init__() + cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim) + self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype)) + self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype)) + + def update(self, input_pos, k_val, v_val): + # input_pos: [S], k_val, v_val: [B, H, L, D] + + k_out = self.k_cache + v_out = self.v_cache + k_out[:, :, input_pos] = k_val + v_out[:, :, input_pos] = v_val + + return k_out, v_out + + +def sinusoids( + length: int, channels: int, max_timescale: float = 10000 +) -> torch.Tensor: + """Returns sinusoids for positional embedding""" + if channels % 2 != 0: + raise ValueError( + f"Number of channels has to be divisible by 2 for sinusoidal positional embeddings, got {channels} channels." + ) + log_timescale_increment = math.log(max_timescale) / (channels // 2 - 1) + inv_timescales = torch.exp( + -log_timescale_increment * torch.arange(channels // 2) + ) + scaled_time = torch.arange(length).view(-1, 1) * inv_timescales.view(1, -1) + return torch.cat([scaled_time.sin(), scaled_time.cos()], dim=1) + + +class EncoderAttention(nn.Module): + def __init__(self, n_state: int, n_head: int): + super().__init__() + assert n_state % n_head == 0, "n_head does not evenly devide n_state" + + self.n_head = n_head + self.d_head = n_state // n_head + self.query = nn.Linear(n_state, n_state, bias=False) + self.key = nn.Linear(n_state, n_state, bias=False) + self.value = nn.Linear(n_state, n_state, bias=False) + self.out = nn.Linear(n_state, n_state, bias=False) + + def forward( + self, + xa: Tensor, + ): + q = self.query(xa) + k = self.key(xa) + v = self.value(xa) + + # Reshape for correct format + batch_size, source_seq_len, _ = k.shape + batch_size, target_seq_len, _ = q.shape + q = q.view( + batch_size, target_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + k = k.view( + batch_size, source_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + v = v.view( + batch_size, source_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + + wv = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + is_causal=False, + ) + wv = wv.transpose(1, 2).reshape( + batch_size, + target_seq_len, + self.n_head * self.d_head, + ) + + return self.out(wv) + + +class CrossAttention(nn.Module): + def __init__(self, n_state: int, n_head: int): + super().__init__() + assert n_state % n_head == 0, "n_head does not evenly devide n_state" + + self.n_head = n_head + self.d_head = n_state // n_head + self.query = nn.Linear(n_state, n_state, bias=False) + self.key = nn.Linear(n_state, n_state, bias=False) + self.value = nn.Linear(n_state, n_state, bias=False) + self.out = nn.Linear(n_state, n_state, bias=False) + self.kv_cache: KVCache | None = None + + def get_kv(self, xa: torch.Tensor, xa_input_pos: Tensor): + assert self.kv_cache is not None, "No kv_cache" + k = self.key(xa[:, xa_input_pos]) + v = self.value(xa[:, xa_input_pos]) + + # Reshape for correct format + batch_size, source_seq_len, _ = k.shape + k = k.view( + batch_size, source_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + v = v.view( + batch_size, source_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + + k, v = self.kv_cache.update(k_val=k, v_val=v, input_pos=xa_input_pos) + + return k, v + + def forward( + self, + x: Tensor, + xa: Tensor, + xa_input_pos: Tensor, + ): + q = self.query(x) + 
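+        # Cross-attention keys/values come from the cached encoder output: on
+        # the first decode step xa_input_pos spans the full audio context and
+        # get_kv fills the KV cache; on later steps it is empty, so the cached
+        # keys/values are reused unchanged.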
batch_size, target_seq_len, _ = q.shape + q = q.view( + batch_size, target_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + + k, v = self.get_kv(xa, xa_input_pos) + wv = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + is_causal=False, + ) + wv = wv.transpose(1, 2).reshape( + batch_size, + target_seq_len, + self.n_head * self.d_head, + ) + + return self.out(wv) + + +class CausalSelfAttention(nn.Module): + def __init__(self, n_state: int, n_head: int): + super().__init__() + assert n_state % n_head == 0, "n_head does not evenly devide n_state" + + self.n_state = n_state + self.n_head = n_head + self.d_head = n_state // n_head + self.out = nn.Linear(n_state, n_state, bias=False) + self.kv_cache: KVCache | None = None + + # Add this back after + self.combined_qkv = nn.Linear(n_state, 3 * n_state, bias=False) + self._register_load_state_dict_pre_hook(self.combined_qkv_hook) + + def get_kv(self, k: Tensor, v: Tensor, input_pos: Tensor): + k, v = self.kv_cache.update(k_val=k, v_val=v, input_pos=input_pos) + + return k, v + + def combined_qkv_hook(self, state_dict, prefix, *args): + if prefix + "query.weight" in state_dict: + wq = state_dict.pop(prefix + "query.weight") + wk = state_dict.pop(prefix + "key.weight") + wv = state_dict.pop(prefix + "value.weight") + state_dict[prefix + "combined_qkv.weight"] = torch.cat([wq, wk, wv]) + + def forward( + self, + x: Tensor, + mask: Optional[Tensor] = None, + input_pos: Optional[Tensor] = None, + ): + q, k, v = self.combined_qkv(x).split( + [self.n_state, self.n_state, self.n_state], dim=-1 + ) + + batch_size, target_seq_len, _ = q.shape + q = q.view( + batch_size, target_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + + batch_size, source_seq_len, _ = k.shape + k = k.view( + batch_size, source_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + v = v.view( + batch_size, source_seq_len, self.n_head, self.d_head + ).transpose(1, 2) + + k, v = self.get_kv(k, v, input_pos=input_pos) + wv = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + attn_mask=mask, + ) + + # (bz, nh, L, dh) -> (bz, L, nh, dh) -> (bz, L, d) + wv = wv.transpose(1, 2).reshape( + batch_size, target_seq_len, self.n_head * self.d_head + ) + + return self.out(wv) + + +class EncoderAttentionBlock(nn.Module): + def __init__( + self, n_state: int, n_head: int, cross_attention: bool = False + ): + super().__init__() + self.attn = EncoderAttention(n_state, n_head) + self.attn_ln = nn.LayerNorm(n_state) + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + nn.Linear(n_state, n_mlp, bias=False), + nn.GELU(), + nn.Linear(n_mlp, n_state, bias=False), + ) + self.mlp_ln = nn.LayerNorm(n_state) + + def forward( + self, + xa: Tensor, + ): + xa = xa + self.attn( + self.attn_ln(xa), + ) + xa = xa + self.mlp(self.mlp_ln(xa)) + + return xa + + +class DecoderAttentionBlock(nn.Module): + def __init__( + self, n_state: int, n_head: int, cross_attention: bool = False + ): + super().__init__() + self.attn = CausalSelfAttention(n_state, n_head) + self.attn_ln = nn.LayerNorm(n_state) + self.cross_attn = ( + CrossAttention(n_state, n_head) if cross_attention else None + ) + self.cross_attn_ln = nn.LayerNorm(n_state) if cross_attention else None + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + nn.Linear(n_state, n_mlp, bias=False), + nn.GELU(), + nn.Linear(n_mlp, n_state, bias=False), + ) + self.mlp_ln = nn.LayerNorm(n_state) + + def forward( + self, + x: Tensor, + xa: Tensor, + mask: Optional[Tensor] = None, + x_input_pos: Optional[Tensor] = None, + xa_input_pos: 
Optional[Tensor] = None, + ): + x = x + self.attn( + self.attn_ln(x), + mask=mask, + input_pos=x_input_pos, + ) + x = x + self.cross_attn(self.cross_attn_ln(x), xa, xa_input_pos) + x = x + self.mlp(self.mlp_ln(x)) + + return x + + +class AudioEncoder(nn.Module): + def __init__( + self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int + ): + super().__init__() + self.conv1 = nn.Conv1d(n_mels, n_state, kernel_size=3, padding=1) + self.conv2 = nn.Conv1d( + n_state, n_state, kernel_size=3, stride=2, padding=1 + ) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + + self.blocks: Iterable[EncoderAttentionBlock] = nn.ModuleList( + [EncoderAttentionBlock(n_state, n_head) for _ in range(n_layer)] + ) + self.ln_post = nn.LayerNorm(n_state) + + def forward(self, xa: Tensor): + xa = F.gelu(self.conv1(xa)) + xa = F.gelu(self.conv2(xa)) + xa = xa.permute(0, 2, 1) + + assert ( + xa.shape[1:] == self.positional_embedding.shape + ), f"incorrect audio shape: {xa.shape[1:]} != {self.positional_embedding.shape}" + xa = (xa + self.positional_embedding).to(xa.dtype) + + for block in self.blocks: + xa = block(xa) + + xa = self.ln_post(xa) + return xa + + +class TextDecoder(nn.Module): + def __init__( + self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int + ): + super().__init__() + self.token_embedding = nn.Embedding(n_vocab, n_state) + self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state)) + + self.blocks: Iterable[DecoderAttentionBlock] = nn.ModuleList( + [ + DecoderAttentionBlock(n_state, n_head, cross_attention=True) + for _ in range(n_layer) + ] + ) + self.ln = nn.LayerNorm(n_state) + self.register_buffer("causal_mask", None, persistent=False) + + def forward( + self, + x: Tensor, + xa: Tensor, + x_input_pos: Tensor, + xa_input_pos: Tensor, + ): + mask = self.causal_mask[None, None, x_input_pos] + x = self.token_embedding(x) + self.positional_embedding[x_input_pos] + + for block in self.blocks: + x = block( + x=x, + xa=xa, + mask=mask, + x_input_pos=x_input_pos, + xa_input_pos=xa_input_pos, + ) + + x = self.ln(x) + logits = ( + x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1) + ).float() + + return logits + + def setup_cache( + self, + batch_size, + max_seq_len=4096, + max_audio_len=1500, + ): + self.causal_mask = torch.tril( + torch.ones(max_seq_len, max_seq_len, dtype=torch.bool) + ) + # Init cache + for b in self.blocks: + b.attn.kv_cache = KVCache( + max_batch_size=batch_size, + max_seq_length=max_seq_len, + n_heads=8, + head_dim=64, + ).cuda() + b.cross_attn.kv_cache = KVCache( + max_batch_size=batch_size, + max_seq_length=max_audio_len, + n_heads=8, + head_dim=64, + ).cuda() + + +class AmtEncoderDecoder(nn.Module): + def __init__(self, dims: ModelConfig): + super().__init__() + self.dims = dims + self.encoder = AudioEncoder( + self.dims.n_mels, + self.dims.n_audio_ctx, + self.dims.n_audio_state, + self.dims.n_audio_head, + self.dims.n_audio_layer, + ) + self.decoder = TextDecoder( + self.dims.n_vocab, + self.dims.n_text_ctx, + self.dims.n_text_state, + self.dims.n_text_head, + self.dims.n_text_layer, + ) + + def forward(self, mel: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor: + _buff = self.encoder(mel) + return self.decoder(tokens, _buff) + + @property + def device(self): + return next(self.parameters()).device diff --git a/amt/inference/quantize.py b/amt/inference/quantize.py new file mode 100644 index 0000000..a54b4f7 --- /dev/null +++ b/amt/inference/quantize.py @@ -0,0 +1,153 @@ +"""Contains code 
modified from https://github.com/pytorch-labs/gpt-fast""" + +import torch + +from torch import nn as nn +from torch.nn import functional as F + + +def dynamically_quantize_per_channel(x, quant_min, quant_max, target_dtype): + # assumes symmetric quantization + # assumes axis == 0 + # assumes dense memory format + # TODO(future): relax ^ as needed + + # default setup for affine quantization of activations + eps = torch.finfo(torch.float32).eps + + # get min and max + min_val, max_val = torch.aminmax(x, dim=1) + + # calculate scales and zero_points based on min and max + # reference: https://fburl.com/code/srbiybme + min_val_neg = torch.min(min_val, torch.zeros_like(min_val)) + max_val_pos = torch.max(max_val, torch.zeros_like(max_val)) + device = min_val_neg.device + + # reference: https://fburl.com/code/4wll53rk + max_val_pos = torch.max(-min_val_neg, max_val_pos) + scales = max_val_pos / (float(quant_max - quant_min) / 2) + # ensure scales is the same dtype as the original tensor + scales = torch.clamp(scales, min=eps).to(x.dtype) + zero_points = torch.zeros( + min_val_neg.size(), dtype=torch.int64, device=device + ) + + # quantize based on qmin/qmax/scales/zp + # reference: https://www.internalfb.com/code/fbsource/[8edc275012b1]/fbcode/caffe2/torch/ao/quantization/fx/_decomposed.py?lines=63 + x_div = x / scales.unsqueeze(-1) + x_round = torch.round(x_div) + x_zp = x_round + zero_points.unsqueeze(-1) + quant = torch.clamp(x_zp, quant_min, quant_max).to(target_dtype) + + return quant, scales, zero_points + + +def replace_linear_weight_only_int8_per_channel(module): + for name, child in module.named_children(): + if isinstance(child, nn.Linear): + if child.bias is not None: + setattr( + module, + name, + WeightOnlyInt8LinearBias( + child.in_features, child.out_features + ), + ) + else: + setattr( + module, + name, + WeightOnlyInt8Linear(child.in_features, child.out_features), + ) + else: + replace_linear_weight_only_int8_per_channel(child) + + +class WeightOnlyInt8QuantHandler: + def __init__(self, mod: torch.nn.Module): + self.mod = mod + + @torch.no_grad() + def create_quantized_state_dict(self): + cur_state_dict = self.mod.state_dict() + for fqn, mod in self.mod.named_modules(): + if isinstance(mod, torch.nn.Linear): + int8_weight, scales, _ = dynamically_quantize_per_channel( + mod.weight.float(), -128, 127, torch.int8 + ) + cur_state_dict[f"{fqn}.weight"] = int8_weight.to("cpu") + cur_state_dict[f"{fqn}.scales"] = scales.to( + mod.weight.dtype + ).to("cpu") + + return cur_state_dict + + def convert_for_runtime(self): + replace_linear_weight_only_int8_per_channel(self.mod) + return self.mod + + +class WeightOnlyInt8Linear(torch.nn.Module): + __constants__ = ["in_features", "out_features"] + in_features: int + out_features: int + weight: torch.Tensor + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.register_buffer( + "weight", torch.empty((out_features, in_features), dtype=torch.int8) + ) + self.register_buffer( + "scales", torch.ones(out_features, dtype=torch.bfloat16) + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight.to(dtype=input.dtype)) * self.scales + + +# Kinda gross workaround - might not be fused by the compiler +class WeightOnlyInt8LinearBias(torch.nn.Module): + __constants__ = ["in_features", 
"out_features"] + in_features: int + out_features: int + weight: torch.Tensor + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.register_buffer( + "weight", torch.empty((out_features, in_features), dtype=torch.int8) + ) + self.register_buffer( + "bias", torch.empty(out_features, dtype=torch.bfloat16) + ) + self.register_buffer( + "scales", torch.ones(out_features, dtype=torch.bfloat16) + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return ( + F.linear(input, self.weight.to(dtype=input.dtype)) * self.scales + + self.bias + ) diff --git a/amt/inference/transcribe.py b/amt/inference/transcribe.py new file mode 100644 index 0000000..9109e6a --- /dev/null +++ b/amt/inference/transcribe.py @@ -0,0 +1,716 @@ +import os +import time +import random +import logging +import traceback +import threading +import torch +import torch.multiprocessing as multiprocessing +import torch._dynamo.config +import torch._inductor.config + +from torch.multiprocessing import Queue +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm +from functools import wraps +from torch.cuda import is_bf16_supported + +from amt.inference.model import AmtEncoderDecoder +from amt.tokenizer import AmtTokenizer +from amt.audio import AudioTransform +from amt.data import get_wav_mid_segments + +torch._inductor.config.coordinate_descent_tuning = True +torch._inductor.config.triton.unique_kernel_names = True +torch._inductor.config.fx_graph_cache = True + +MAX_SEQ_LEN = 4096 +MAX_BLOCK_LEN = 4096 +LEN_MS = 30000 +STRIDE_FACTOR = 3 +CHUNK_LEN_MS = LEN_MS // STRIDE_FACTOR + + +def _setup_logger(): + logger = logging.getLogger(__name__) + for h in logger.handlers[:]: + logger.removeHandler(h) + + logger.propagate = False + logger.setLevel(logging.DEBUG) + formatter = logging.Formatter( + "[%(asctime)s] %(process)d: [%(levelname)s] %(message)s", + ) + + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + logger.addHandler(ch) + + fh = logging.FileHandler("transcribe.log") + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + return logging.getLogger(__name__) + + +@torch.jit.script +def get_static_mask(): + # The values are hardcoded here for the pytorch jit - manually update + col_indices = torch.arange(3419, device="cuda").unsqueeze(0) + mask_a = col_indices >= 392 + mask_b = col_indices <= 3418 + return col_indices, mask_a & mask_b + + +@torch.jit.script +def recalculate_tok_ids( + logits: torch.Tensor, + tok_ids: torch.Tensor, +): + probs = torch.softmax(logits, dim=-1) + + # Mask out all non-onset/vel tok_ids + col_indices, interval_mask = get_static_mask() + + # Mask out tok_ids larger than 30ms from original tok_id + tok_ids_expanded = tok_ids.unsqueeze(1) + mask_c = col_indices <= tok_ids_expanded + 3 + mask_d = col_indices >= tok_ids_expanded - 3 + beam_mask = mask_c & mask_d + + # Don't mask out the original tok_id (required for non-onset/vel toks) + tok_id_mask = torch.zeros_like(probs, dtype=torch.bool) + tok_id_mask.scatter_(1, tok_ids_expanded, 1) + + # Combine and calculate probs + combined_mask = (interval_mask & beam_mask) | tok_id_mask + probs[~combined_mask] = 0 + + # Calculate expected value + weighted_idxs = probs * torch.arange( + probs.size(1), device=probs.device + 
).float().unsqueeze(0) + idx_evs = ( + (weighted_idxs.sum(dim=1) / (probs.sum(dim=1) + 1e-9)) + .round() + .to(torch.long) + ) + + return idx_evs + + +# Changes seq and eos_idxs in place - tok_ids hardcoded +@torch.jit.script +def update_seq_end_idxs_( + next_tok_ids: torch.Tensor, + seq: torch.Tensor, + eos_idxs: torch.Tensor, + prefix_lens: torch.Tensor, + idx: int, +): + # Update eos_idxs if next tok is eos_tok + eos_mask = next_tok_ids == 1 + eos_idxs[eos_mask] = idx + + # Update eos_idxs if next tok in onset > 20000 + offset_mask = next_tok_ids >= 2418 + eos_idxs[offset_mask] = idx - 2 + + # Don't update toks in prefix or after eos_idx + insert_mask = (prefix_lens <= idx) & (eos_idxs >= idx) + seq[insert_mask, idx] = next_tok_ids[insert_mask] + + +def optional_bf16_autocast(func): + @wraps(func) + def wrapper(*args, **kwargs): + if is_bf16_supported(): + with torch.autocast("cuda", dtype=torch.bfloat16): + return func(*args, **kwargs) + else: + with torch.autocast("cuda", dtype=torch.float32): + return func(*args, **kwargs) + + return wrapper + + +def decode_token( + model: AmtEncoderDecoder, + x: torch.Tensor, + xa: torch.Tensor, + x_input_pos: torch.Tensor, + xa_input_pos: torch.Tensor, +): + logits = model.decoder.forward( + x=x, + xa=xa, + x_input_pos=x_input_pos, + xa_input_pos=xa_input_pos, + )[:, -1] + next_tok_ids = torch.argmax(logits, dim=-1) + + return logits, next_tok_ids + + +@optional_bf16_autocast +@torch.no_grad() +def process_segments( + tasks: list, + model: AmtEncoderDecoder, + audio_transform: AudioTransform, + tokenizer: AmtTokenizer, + logger: logging.Logger, +): + audio_segs = torch.stack( + [audio_seg for (audio_seg, prefix), _ in tasks] + ).cuda() + log_mels = audio_transform.log_mel(audio_segs) + audio_features = model.encoder(xa=log_mels) + + raw_prefixes = [prefix for (audio_seg, prefix), _ in tasks] + prefix_lens = torch.tensor( + [len(prefix) for prefix in raw_prefixes], dtype=torch.int + ) + min_prefix_len = min(prefix_lens).item() + prefixes = [ + tokenizer.trunc_seq(prefix, MAX_BLOCK_LEN) for prefix in raw_prefixes + ] + seq = torch.stack([tokenizer.encode(prefix) for prefix in prefixes]).cuda() + eos_idxs = torch.tensor([MAX_BLOCK_LEN for _ in prefixes], dtype=torch.int) + + # for idx in ( + # pbar := tqdm( + # range(min_prefix_len, MAX_BLOCK_LEN - 1), + # total=MAX_BLOCK_LEN - (min_prefix_len + 1), + # leave=False, + # ) + # ): + for idx in range(min_prefix_len, MAX_BLOCK_LEN - 1): + with torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_mem_efficient=False, enable_math=True + ): + if idx == min_prefix_len: + logits, next_tok_ids = decode_token( + model, + x=seq[:, :idx], + xa=audio_features, + x_input_pos=torch.arange(0, idx, device=seq.device), + xa_input_pos=torch.arange( + 0, audio_features.shape[1], device=seq.device + ), + ) + else: + logits, next_tok_ids = decode_token( + model, + x=seq[:, idx - 1 : idx], + xa=audio_features, + x_input_pos=torch.tensor( + [idx - 1], device=seq.device, dtype=torch.int + ), + xa_input_pos=torch.tensor( + [], device=seq.device, dtype=torch.int + ), + ) + + next_tok_ids = recalculate_tok_ids( + logits=logits, + tok_ids=next_tok_ids, + ) + update_seq_end_idxs_( + next_tok_ids=next_tok_ids, + seq=seq, + eos_idxs=eos_idxs, + prefix_lens=prefix_lens, + idx=idx, + ) + + if all(_idx <= idx for _idx in eos_idxs): + break + + # If there is a context length overflow, we need to have some special logic + # to make sure that a sequence of the correct format is returned. 
Right now + # it messes things up somehow + if not all(_idx <= idx for _idx in eos_idxs): + logger.warning("Context length overflow when transcribing segment") + + results = [ + tokenizer.decode(seq[_idx, : eos_idxs[_idx] + 1]) + for _idx in range(seq.shape[0]) + ] + + return results + + +# There is a memory leak in here somewhere +def gpu_manager( + gpu_batch_queue: Queue, + result_queue: Queue, + model: AmtEncoderDecoder, + batch_size: int, + gpu_id: int | None = None, +): + logger = _setup_logger() + logger.info("Started GPU manager") + + if gpu_id is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + + global decode_token, recalculate_tok_ids + model.decoder.setup_cache(batch_size=batch_size, max_seq_len=MAX_BLOCK_LEN) + model.cuda() + model.eval() + if batch_size == 1: + recalculate_tok_ids = torch.compile( + recalculate_tok_ids, mode="max-autotune-no-cudagraphs" + ) + decode_token = torch.compile( + decode_token, + # mode="reduce-overhead", + mode="max-autotune", + fullgraph=True, + ) + + audio_transform = AudioTransform().cuda() + tokenizer = AmtTokenizer(return_tensors=True) + + try: + while True: + try: + batch = gpu_batch_queue.get(timeout=10) + except Exception as e: + logger.info(f"GPU timedout waiting for batch") + break + else: + try: + results = process_segments( + tasks=batch, + model=model, + audio_transform=audio_transform, + tokenizer=tokenizer, + logger=logger, + ) + except Exception as e: + logger.error( + f"Failed to process batch: {traceback.format_exc()}" + ) + raise e + else: + # pid = -1 when its a pad sequence + for result, (_, pid) in zip(results, batch): + if pid != -1: + result_queue.put({"result": result, "pid": pid}) + + except Exception as e: + logger.error(f"GPU manager failed with exception: {e}") + finally: + logger.info(f"GPU manager terminated") + + +def _find_min_diff_batch(tasks: list, batch_size: int): + prefix_lens = [ + (len(prefix), idx) for idx, ((audio_seg, prefix), _) in enumerate(tasks) + ] + prefix_lens.sort(key=lambda x: x[0]) + + min_diff = float("inf") + start_idx = 0 + + # Iterate through the array to find the batch with the min difference + for _idx in range(len(prefix_lens) - batch_size + 1): + current_diff = ( + prefix_lens[_idx + batch_size - 1][0] - prefix_lens[_idx][0] + ) + if current_diff < min_diff: + min_diff = current_diff + start_idx = _idx + + return [ + orig_idx + for prefix_lens, orig_idx in prefix_lens[ + start_idx : start_idx + batch_size + ] + ] + + +def gpu_batch_manager( + gpu_task_queue: Queue, + gpu_batch_queue: Queue, + batch_size: int, +): + logger = _setup_logger() + logger.info("Started batch manager") + try: + tasks = [] + while True: + try: + task, pid = gpu_task_queue.get(timeout=0.2) + except Exception as e: + pass + else: + tasks.append((task, pid)) + continue + + # No tasks in queue -> check gpu batch queue + if gpu_batch_queue.empty() is False: + continue + elif len(tasks) == 0: + continue + + # Get new batch and add to batch queue + if len(tasks) < batch_size: + logger.info("Not enough tasks - padding batch") + while len(tasks) < batch_size: + _pad_task, _pid = tasks[0] + tasks.append((_pad_task, -1)) + + assert len(tasks) >= batch_size, "batch error" + new_batch_idxs = _find_min_diff_batch( + tasks, + batch_size=batch_size, + ) + gpu_batch_queue.put([tasks[_idx] for _idx in new_batch_idxs]) + tasks = [ + task + for _idx, task in enumerate(tasks) + if _idx not in new_batch_idxs + ] + except Exception as e: + logger.error(f"GPU batch manager failed with exception: {e}") + finally: + 
logger.info(f"GPU batch manager terminated") + + +def _shift_onset(seq: list, shift_ms: int): + res = [] + for tok in seq: + if type(tok) is tuple and tok[0] == "onset": + res.append(("onset", tok[1] + shift_ms)) + else: + res.append(tok) + + return res + + +def _truncate_seq( + seq: list, + start_ms: int, + end_ms: int, + logger: logging.Logger, + tokenizer: AmtTokenizer = AmtTokenizer(), +): + # Truncates and shifts a sequence by retokenizing the underlying midi_dict + if start_ms == end_ms: + _mid_dict, unclosed_notes = tokenizer._detokenize_midi_dict( + seq, start_ms, return_unclosed_notes=True + ) + random.shuffle(unclosed_notes) + return [("prev", p) for p in unclosed_notes] + [tokenizer.bos_tok] + else: + try: + _mid_dict = tokenizer._detokenize_midi_dict(seq, LEN_MS) + res = tokenizer._tokenize_midi_dict(_mid_dict, start_ms, end_ms - 1) + except Exception as e: + logger.error(f"Truncate segment failed: {e}") + return [tokenizer.bos_tok] + else: + if res[-1] == tokenizer.eos_tok: + res.pop() + return res + + +def transcribe_file( + file_path, + gpu_task_queue: Queue, + result_queue: Queue, + pid: int, + tokenizer: AmtTokenizer = AmtTokenizer(), +): + logger = logging.getLogger(__name__) + + logger.info(f"Getting wav segments: {file_path}") + audio_segments = [ + f + for f, _ in get_wav_mid_segments( + audio_path=file_path, stride_factor=STRIDE_FACTOR + ) + ] + + res = [] + seq = [tokenizer.bos_tok] + concat_seq = [tokenizer.bos_tok] + for idx, audio_seg in enumerate(audio_segments): + init_idx = len(seq) + + # Add to gpu queue and wait for results + gpu_task_queue.put(((audio_seg, seq), pid)) + while True: + # Issue with this logic perhaps + gpu_result = result_queue.get(timeout=300) + if gpu_result["pid"] == pid: + seq = gpu_result["result"] + break + else: + result_queue.put(gpu_result) + + concat_seq += _shift_onset( + seq[init_idx:], + idx * CHUNK_LEN_MS, + ) + + if idx == len(audio_segments) - 1: + res.append(concat_seq) + elif concat_seq[-1] == tokenizer.eos_tok: + res.append(concat_seq) + seq = [tokenizer.bos_tok] + concat_seq = [tokenizer.bos_tok] + logger.info(f"Finished segment (eos_tok): {file_path}") + else: + # This might need it's logic adjusted + + seq = _truncate_seq( + seq, + CHUNK_LEN_MS, + LEN_MS - CHUNK_LEN_MS, + logger=logger, + ) + + if len(seq) == 1: + logger.error(f"Failed to transcribe segment: {file_path}") + if len(concat_seq) > 500: + res.append(concat_seq) + else: + pass + # logger.info(f"Sequence too short ({len(concat_seq)})") + + seq = [tokenizer.bos_tok] + concat_seq = [tokenizer.bos_tok] + + return res + + +def process_file( + file_path: str, + file_queue: Queue, + gpu_task_queue: Queue, + result_queue: Queue, + tokenizer: AmtTokenizer, + save_dir: str, + input_dir: str, + logger: logging.Logger, +): + def _save_seq(_seq: list, _save_path: str): + if os.path.exists(_save_path): + logger.info(f"Already exists {_save_path} - overwriting") + + for tok in _seq[::-1]: + if type(tok) is tuple and tok[0] == "onset": + last_onset = tok[1] + break + + try: + mid_dict = tokenizer._detokenize_midi_dict( + tokenized_seq=_seq, len_ms=last_onset + ) + mid = mid_dict.to_midi() + mid.save(_save_path) + except Exception as e: + logger.error(f"Failed to save {_save_path}") + + def _get_save_path(_file_path: str, _idx: int | str = ""): + if input_dir is None: + save_path = os.path.join( + save_dir, + os.path.splitext(os.path.basename(file_path))[0] + + f"{_idx}.mid", + ) + else: + input_rel_path = os.path.relpath(_file_path, input_dir) + save_path = os.path.join( + 
save_dir, os.path.splitext(input_rel_path)[0] + f"{_idx}.mid" + ) + if not os.path.isdir(os.path.dirname(save_path)): + os.makedirs(os.path.dirname(save_path), exist_ok=True) + + return save_path + + def remove_failures_from_queue_(_queue: Queue, _pid: int): + _buff = [] + while True: + try: + _buff.append(_queue(timout=5)) + except Exception: + break + + num_removed = 0 + for _task, __pid in _buff: + if _pid != __pid: + _queue.put((_task, __pid)) + else: + num_removed += 1 + + return num_removed + + pid = threading.get_ident() + try: + seqs = transcribe_file(file_path, gpu_task_queue, result_queue, pid=pid) + except Exception as e: + logger.error(f"Failed to process {file_path}: {traceback.format_exc()}") + task_rmv_cnt = remove_failures_from_queue_(gpu_task_queue, pid) + res_rmv_cnt = remove_failures_from_queue_(result_queue, pid) + logger.info(f"Removed {task_rmv_cnt} from task queue") + logger.info(f"Removed {res_rmv_cnt} from result queue") + return + + logger.info(f"Finished file: {file_path}") + _idx = 0 + for seq in seqs: + if len(seq) < 1000: + logger.info("Skipping seq - too short") + continue + _save_seq(seq, _get_save_path(file_path, _idx)) + _idx += 1 + + logger.info(f"Transcribed into {_idx} segment(s)") + logger.info(f"{file_queue.qsize()} file(s) remaining in queue") + + +def worker( + file_queue: Queue, + gpu_task_queue: Queue, + result_queue: Queue, + save_dir: str, + input_dir: str | None = None, + tasks_per_worker: int = 1, +): + logger = _setup_logger() + tokenizer = AmtTokenizer() + threads = [] + try: + while not file_queue.empty() or any(t.is_alive() for t in threads): + while len(threads) < tasks_per_worker and not file_queue.empty(): + logging.info("Starting worker") + file_path = file_queue.get() + t = threading.Thread( + target=process_file, + args=( + file_path, + file_queue, + gpu_task_queue, + result_queue, + tokenizer, + save_dir, + input_dir, + logger, + ), + ) + t.start() + threads.append(t) + + threads = [t for t in threads if t.is_alive()] + + time.sleep(0.1) + + for t in threads: + t.join() + + except Exception as e: + logger.error(f"File worker failed with exception: {e}") + finally: + logger.info(f"File worker terminated") + + +# Needs to test this for multi-gpu +def batch_transcribe( + file_paths: list, + model: AmtEncoderDecoder, + save_dir: str, + batch_size: int = 16, + input_dir: str | None = None, + gpu_ids: int | None = None, + quantize: bool = True, +): + torch.multiprocessing.set_start_method("spawn") + num_gpus = len(gpu_ids) if gpu_ids is not None else 1 + logger = _setup_logger() + + if os.path.isfile("transcribe.log"): + os.remove("transcribe.log") + + if quantize is True: + logger.info("Quantising weights to int8") + model = quantize_int8(model) + + gpu_task_queue = Queue() + gpu_batch_queue = Queue() + result_queue = Queue() + file_queue = Queue() + for file_path in file_paths: + file_queue.put(file_path) + + num_workers = min(batch_size * num_gpus, len(file_paths)) + logger.info(f"Creating {num_workers} file worker(s)") + worker_processes = [ + multiprocessing.Process( + target=worker, + args=( + file_queue, + gpu_task_queue, + result_queue, + save_dir, + input_dir, + # Wait for all threads to finish + 4, + ), + ) + for _ in range(num_workers) + ] + gpu_batch_manager_process = multiprocessing.Process( + target=gpu_batch_manager, + args=(gpu_task_queue, gpu_batch_queue, batch_size), + ) + + start_time = time.time() + if num_gpus == 1: + gpu_manager_processes = [ + multiprocessing.Process( + target=gpu_manager, + args=(gpu_batch_queue, 
result_queue, model, batch_size), + ) + ] + else: + gpu_manager_processes = [ + multiprocessing.Process( + target=gpu_manager, + args=(gpu_batch_queue, result_queue, model, batch_size, gpu_id), + ) + for gpu_id in gpu_ids + ] + + for p in worker_processes: + p.start() + time.sleep(5) + gpu_batch_manager_process.start() + for p in gpu_manager_processes: + p.start() + + # Watch for file workers to finish + for p in worker_processes: + p.join() + for p in gpu_manager_processes: + p.join() + gpu_batch_manager_process.terminate() + + print("Took", (time.time() - start_time) / 60, "mins to transcribe files") + + +def quantize_int8(model: torch.nn.Module): + from amt.inference.quantize import WeightOnlyInt8QuantHandler + + quantizer = WeightOnlyInt8QuantHandler(model) + int8_state_dict = quantizer.create_quantized_state_dict() + _model = quantizer.convert_for_runtime() + _model.load_state_dict(int8_state_dict) + + return _model diff --git a/amt/model.py b/amt/model.py index 9e8ccb2..1b60a46 100644 --- a/amt/model.py +++ b/amt/model.py @@ -50,67 +50,27 @@ def __init__(self, n_state: int, n_head: int): self.n_head = n_head self.d_head = n_state // n_head - self.query = nn.Linear(n_state, n_state) + self.query = nn.Linear(n_state, n_state, bias=False) self.key = nn.Linear(n_state, n_state, bias=False) - self.value = nn.Linear(n_state, n_state) - self.out = nn.Linear(n_state, n_state) + self.value = nn.Linear(n_state, n_state, bias=False) + self.out = nn.Linear(n_state, n_state, bias=False) def forward( self, x: Tensor, xa: Optional[Tensor] = None, mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, ): q = self.query(x) - if kv_cache is None: - # Normal forward - if xa is not None: - # Cross att - k = self.key(xa) - v = self.value(xa) - else: - # Self att in encoder/decoder - k = self.key(x) - v = self.value(x) + if xa is not None: + # Cross att + k = self.key(xa) + v = self.value(xa) else: - # Using cache - k_id = f"{id(self)}_k" - v_id = f"{id(self)}_v" - - if xa is not None: - # Cross att - calculate once and reuse - if kv_cache.get(k_id) is None: - # Not recorded yet, calculate and store - k = self.key(xa) - v = self.value(xa) - kv_cache[k_id] = k - kv_cache[v_id] = v - else: - # Already recorded, get - k = kv_cache[k_id] - v = kv_cache[v_id] - else: - # Decoder self att, append each time - if kv_cache.get(k_id) is None: - # Not recorded yet, calculate and store - k = self.key(x) - v = self.value(x) - kv_cache[k_id] = k - kv_cache[v_id] = v - else: - # Already recorded, get and append - k = torch.cat((kv_cache[k_id], self.key(x)), dim=1).detach() - v = torch.cat( - (kv_cache[v_id], self.value(x)), dim=1 - ).detach() - kv_cache[k_id] = k - kv_cache[v_id] = v - - # When using kv_cache for decoder self attention, we don't - # want to use a mask in the self attention calculation - mask = None + # Self att in encoder/decoder + k = self.key(x) + v = self.value(x) # Reshape and transpose for attention calculation batch_size, target_seq_len, _ = q.shape @@ -157,7 +117,9 @@ def __init__( n_mlp = n_state * 4 self.mlp = nn.Sequential( - nn.Linear(n_state, n_mlp), nn.GELU(), nn.Linear(n_mlp, n_state) + nn.Linear(n_state, n_mlp, bias=False), + nn.GELU(), + nn.Linear(n_mlp, n_state, bias=False), ) self.mlp_ln = nn.LayerNorm(n_state) @@ -166,16 +128,10 @@ def forward( x: Tensor, xa: Optional[Tensor] = None, mask: Optional[Tensor] = None, - kv_cache: Optional[dict] = None, ): - x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0] + x = x + self.attn(self.attn_ln(x), mask=mask)[0] if 
self.cross_attn: - x = ( - x - + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[ - 0 - ] - ) + x = x + self.cross_attn(self.cross_attn_ln(x), xa)[0] x = x + self.mlp(self.mlp_ln(x)) return x @@ -236,22 +192,18 @@ def __init__( mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1) self.register_buffer("mask", mask, persistent=False) - def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): + def forward(self, x: Tensor, xa: Tensor): """ x : torch.LongTensor, shape = (batch_size, <= n_ctx) the text tokens xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state) the encoded audio features to be attended on """ - offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 - x = ( - self.token_embedding(x) - + self.positional_embedding[offset : offset + x.shape[-1]] - ) + x = self.token_embedding(x) + self.positional_embedding[: x.shape[-1]] x = x.to(xa.dtype) for block in self.blocks: - x = block(x, xa, mask=self.mask, kv_cache=kv_cache) + x = block(x, xa, mask=self.mask) x = self.ln(x) logits = ( diff --git a/amt/run.py b/amt/run.py index 7e29ae0..1b2bcc5 100644 --- a/amt/run.py +++ b/amt/run.py @@ -38,6 +38,12 @@ def _add_transcribe_args(subparser): subparser.add_argument( "-multi_gpu", help="use all GPUs", action="store_true", default=False ) + subparser.add_argument( + "-q8", + help="apply int8 quantization on weights", + action="store_true", + default=False, + ) subparser.add_argument("-bs", help="batch size", type=int, default=16) @@ -88,19 +94,19 @@ def build_maestro( print(f"Building {train_file}") AmtDataset.build( - matched_load_paths=matched_paths_train, + load_paths=matched_paths_train, save_path=train_file, num_processes=num_procs, ) print(f"Building {val_file}") AmtDataset.build( - matched_load_paths=matched_paths_val, + load_paths=matched_paths_val, save_path=val_file, num_processes=num_procs, ) print(f"Building {test_file}") AmtDataset.build( - matched_load_paths=matched_paths_test, + load_paths=matched_paths_test, save_path=test_file, num_processes=num_procs, ) @@ -114,7 +120,6 @@ def transcribe( load_dir=None, batch_size=16, multi_gpu=False, - augment=None, ): """ Transcribe audio files to midi using the given model and checkpoint. @@ -138,13 +143,11 @@ def transcribe( augment : str Augment the audio files before transcribing. This is used for evaluation. This tests the robustness of the model. 
""" - import torch from torch.cuda import is_available as cuda_is_available - from torch.multiprocessing import Queue from amt.tokenizer import AmtTokenizer - from amt.infer import batch_transcribe + from amt.inference.transcribe import batch_transcribe from amt.config import load_model_config - from amt.model import ModelConfig, AmtEncoderDecoder + from amt.inference.model import ModelConfig, AmtEncoderDecoder from aria.utils import _load_weight assert cuda_is_available(), "CUDA device not found" @@ -176,7 +179,6 @@ def transcribe( _model_state[k] = v model_state = _model_state model.load_state_dict(model_state) - torch.multiprocessing.set_start_method("spawn") if trans_mode == "batch": found_wav = glob.glob( @@ -196,31 +198,14 @@ def transcribe( int(id) for id in os.getenv("CUDA_VISIBLE_DEVICES").split(",") ] print(f"Visible gpu_ids: {gpu_ids}") - - # Use shared file queue between gpu processes - file_queue = torch.multiprocessing.Queue() - for file_path in file_paths: - file_queue.put(file_path) - - processes = [] - for gpu_id in gpu_ids: - print(f"Starting process on cuda-{gpu_id}") - process = torch.multiprocessing.Process( - target=batch_transcribe, - args=( - file_queue, - model, - save_dir, - batch_size, - gpu_id, - load_dir, - ), - ) - process.start() - processes.append(process) - - for process in processes: - process.join() + batch_transcribe( + file_paths=file_paths, + model=model, + save_dir=save_dir, + batch_size=batch_size, + input_dir=load_dir, + gpu_ids=gpu_ids, + ) else: batch_transcribe( diff --git a/amt/tokenizer.py b/amt/tokenizer.py index d5416a7..c368673 100644 --- a/amt/tokenizer.py +++ b/amt/tokenizer.py @@ -11,10 +11,6 @@ from amt.config import load_config -# Instead of doing this, we could calculate beams at inference time, selecting -# the note with the first onset so that we don't miss notes. - - DEBUG = os.getenv("DEBUG") @@ -63,6 +59,12 @@ def __init__(self, return_tensors: bool = False): ) self.pad_id = self.tok_to_id[self.pad_tok] + def _get_inference_ids(self): + return [ + self.tok_to_id[tok] + for tok in self.velocity_tokens + self.onset_tokens + ] + def _quantize_onset(self, time: int): # This function will return values res >= 0 (inc. 0) return self._find_closest_int(time, self.onset_time_quantizations) @@ -86,13 +88,16 @@ def _tokenize_midi_dict( midi_dict: MidiDict, start_ms: int, end_ms: int, + max_pedal_len_ms: int | None = None, ): assert ( end_ms - start_ms <= self.max_onset ), "Invalid values for start_ms, end_ms" - midi_dict.resolve_pedal() # Important !! + if midi_dict.pedal_resolved is False: + midi_dict.resolve_pedal() # Important !! 
pedal_intervals = midi_dict._build_pedal_intervals() + if len(pedal_intervals.keys()) > 1: print("Warning: midi_dict has more than one pedal channel") if len(midi_dict.instrument_msgs) > 1: @@ -179,6 +184,9 @@ def _tokenize_midi_dict( ticks_per_beat=midi_dict.ticks_per_beat, ) + if max_pedal_len_ms is not None: + pedal_off_ms = min(pedal_off_ms, pedal_on_ms + max_pedal_len_ms) + rel_on_ms_q = self._quantize_onset(pedal_on_ms - start_ms) rel_off_ms_q = self._quantize_onset(pedal_off_ms - start_ms) @@ -307,8 +315,7 @@ def _detokenize_midi_dict( if tok_1_type == "prev": notes_to_close[tok_1_data] = (0, self.default_velocity) print("Unexpected token order: 'prev' seen after ''") - if DEBUG: - raise Exception + raise ValueError elif tok_1_type == "pedal": _pedal_data = tok_1_data _tick = tok_2_data @@ -323,8 +330,7 @@ def _detokenize_midi_dict( elif tok_1_type == "on": if (tok_2_type, tok_3_type) != ("onset", "vel"): print("Unexpected token order:", tok_1, tok_2, tok_3) - if DEBUG: - raise Exception + raise ValueError else: notes_to_close[tok_1_data] = (tok_2_data, tok_3_data) elif tok_1_type == "off": diff --git a/amt/train.py b/amt/train.py index 10a3952..eee1b8e 100644 --- a/amt/train.py +++ b/amt/train.py @@ -266,7 +266,6 @@ def rolling_average(prev_avg: float, x_n: float, n: int): return ((prev_avg * (n - 1)) / n) + (x_n / n) -# TODO: Test that loss/backprop is working correctly (look at shapes) def _train( epochs: int, accelerator: accelerate.Accelerator, @@ -281,34 +280,6 @@ def _train( resume_epoch: int | None = None, project_dir: str | None = None, ): - def profile_flops(dataloader: DataLoader): - def _bench(): - for batch in dataloader: - wav, src, tgt, pitch_shift = batch - with torch.no_grad(): - mel = audio_transform.forward(wav, shift=pitch_shift) - logits = model(mel, src) # (b_sz, s_len, v_sz) - logits = logits.transpose(1, 2) - loss = loss_fn(logits, tgt) - - # Backwards step - omit optimizer.step() - accelerator.backward(loss) - optimizer.zero_grad() - break - - logger.info( - f"Model has " - f"{'{:,}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad))} " - "parameters" - ) - logger.info("Compiling model...") - _bench() - - # with flop_counter: - # _bench() - # total_flop = sum(flop_counter.get_flop_counts()["Global"].values()) - # logger.info(f"Forwards & backwards FLOP: {total_flop / 1e12} TF") - def make_checkpoint(_accelerator, _epoch: int, _step: int): checkpoint_dir = os.path.join( project_dir, @@ -327,7 +298,6 @@ def train_loop( dataloader: DataLoader, _epoch: int, _resume_step: int = 0, - overfit: bool = False, ): avg_train_loss = 0 trailing_loss = 0 @@ -409,7 +379,7 @@ def train_loop( return avg_train_loss - def val_loop(dataloader, _epoch: int): + def val_loop(dataloader, _epoch: int, aug: bool): avg_val_loss = 0 model.eval() for step, batch in ( @@ -421,10 +391,21 @@ def val_loop(dataloader, _epoch: int): ): wav, src, tgt = batch with torch.no_grad(): - mel = audio_transform.log_mel(wav) + if aug == False: + mel = audio_transform.log_mel(wav) + elif aug == True: + # Apply aug without distortion or spec-augment + mel = audio_transform.log_mel( + audio_transform.aug_wav(wav), detune=True + ) + else: + raise TypeError + logits = model(mel, src) - logits = logits.transpose(1, 2) # Transpose for CrossEntropyLoss - loss = loss_fn(logits, tgt) + logits = logits.transpose( + 1, 2 + ) # Transpose for CrossEntropyLoss + loss = loss_fn(logits, tgt) # Logging avg_val_loss = rolling_average(avg_val_loss, loss.item(), step) @@ -432,7 +413,8 @@ def 
val_loop(dataloader, _epoch: int): # EPOCH logger.info( - f"EPOCH {_epoch}/{epochs + start_epoch}: Finished evaluation - " + f"EPOCH {_epoch}/{epochs + start_epoch}: Finished evaluation " + f"{'(aug)' if aug is True else ''} - " f"average_loss={round(avg_val_loss, 4)}" ) @@ -447,7 +429,11 @@ def val_loop(dataloader, _epoch: int): PAD_ID = train_dataloader.dataset.tokenizer.pad_id logger = get_logger(__name__) # Accelerate logger loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID) - profile_flops(dataloader=train_dataloader) + logger.info( + f"Model has " + f"{'{:,}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad))} " + "parameters" + ) if accelerator.is_main_process: loss_csv = open(os.path.join(project_dir, "loss.csv"), "w") @@ -455,7 +441,9 @@ def val_loop(dataloader, _epoch: int): loss_writer.writerow(["epoch", "step", "loss"]) epoch_csv = open(os.path.join(project_dir, "epoch.csv"), "w") epoch_writer = csv.writer(epoch_csv) - epoch_writer.writerow(["epoch", "avg_train_loss", "avg_val_loss"]) + epoch_writer.writerow( + ["epoch", "avg_train_loss", "avg_val_loss", "avg_val_loss_aug"] + ) if resume_epoch is not None: start_epoch = resume_epoch + 1 @@ -477,9 +465,16 @@ def val_loop(dataloader, _epoch: int): _epoch=resume_epoch, _resume_step=resume_step, ) - avg_val_loss = val_loop(dataloader=val_dataloader, _epoch=resume_epoch) + avg_val_loss = val_loop( + dataloader=val_dataloader, _epoch=resume_epoch, aug=False + ) + avg_val_loss_aug = val_loop( + dataloader=val_dataloader, _epoch=resume_epoch, aug=True + ) if accelerator.is_main_process: - epoch_writer.writerow([resume_epoch, avg_train_loss, avg_val_loss]) + epoch_writer.writerow( + [resume_epoch, avg_train_loss, avg_val_loss, avg_val_loss_aug] + ) epoch_csv.flush() make_checkpoint( _accelerator=accelerator, _epoch=start_epoch, _step=0 @@ -487,9 +482,16 @@ def val_loop(dataloader, _epoch: int): for epoch in range(start_epoch, epochs + start_epoch): avg_train_loss = train_loop(dataloader=train_dataloader, _epoch=epoch) - avg_val_loss = val_loop(dataloader=val_dataloader, _epoch=epoch) + avg_val_loss = val_loop( + dataloader=val_dataloader, _epoch=epoch, aug=False + ) + avg_val_loss_aug = val_loop( + dataloader=val_dataloader, _epoch=epoch, aug=True + ) if accelerator.is_main_process: - epoch_writer.writerow([epoch, avg_train_loss, avg_val_loss]) + epoch_writer.writerow( + [epoch, avg_train_loss, avg_val_loss, avg_val_loss_aug] + ) epoch_csv.flush() make_checkpoint(_accelerator=accelerator, _epoch=epoch + 1, _step=0) @@ -565,6 +567,7 @@ def resume_train( model = torch.compile(model) audio_transform = AudioTransform().to(accelerator.device) logger.info(f"Loaded model with config: {load_model_config(model_name)}") + logger.info(f"Loaded transform with config: {audio_transform.get_params()}") train_dataloader, val_dataloader = get_dataloaders( train_data_path=train_data_path, @@ -682,6 +685,7 @@ def train( model = torch.compile(model) audio_transform = AudioTransform().to(accelerator.device) logger.info(f"Loaded model with config: {load_model_config(model_name)}") + logger.info(f"Loaded transform with config: {audio_transform.get_params()}") if mode == "finetune": try: model.load_state_dict(_load_weight(finetune_cp_path)) diff --git a/config/config.json b/config/config.json index 2fa9fd4..9da2e4e 100644 --- a/config/config.json +++ b/config/config.json @@ -17,7 +17,7 @@ "n_mels": 256 }, "data": { - "stride_factor": 6, + "stride_factor": 12, "max_seq_len": 4096 } } \ No newline at end of file diff --git 
a/config/models/small.json b/config/models/small.json deleted file mode 100644 index 1c87733..0000000 --- a/config/models/small.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "n_mels": 256, - "n_audio_ctx": 1500, - "n_audio_state": 384, - "n_audio_head": 6, - "n_audio_layer": 8, - "n_text_ctx": 4096, - "n_text_state": 384, - "n_text_head": 6, - "n_text_layer": 8 -} \ No newline at end of file diff --git a/config/models/test.json b/config/models/test.json deleted file mode 100644 index 93c0f16..0000000 --- a/config/models/test.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "n_mels": 256, - "n_audio_ctx": 1500, - "n_audio_state": 64, - "n_audio_head": 4, - "n_audio_layer": 4, - "n_text_ctx": 4096, - "n_text_state": 64, - "n_text_head": 4, - "n_text_layer": 4 -} \ No newline at end of file diff --git a/tests/test_data.py b/tests/test_data.py index 1437472..f69117e 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,7 +1,8 @@ import unittest import logging import os -import time +import cProfile +import pstats import torch import torchaudio import matplotlib.pyplot as plt @@ -9,6 +10,7 @@ from amt.data import get_wav_mid_segments, AmtDataset from amt.tokenizer import AmtTokenizer from amt.audio import AudioTransform, log_mel_spectrogram +from amt.train import get_dataloaders from aria.data.midi import MidiDict @@ -16,7 +18,17 @@ if os.path.isdir("tests/test_results") is False: os.mkdir("tests/test_results") -MAESTRO_PATH = "/weka/proj-aria/aria-amt/data/maestro/val.jsonl" +MAESTRO_PATH = "/weka/proj-aria/aria-amt/data/train.jsonl" + + +def plot_spec(mel: torch.Tensor, name: str | int): + plt.figure(figsize=(10, 4)) + plt.imshow(mel, aspect="auto", origin="lower", cmap="viridis") + plt.colorbar(format="%+2.0f dB") + plt.title("(mel)-Spectrogram") + plt.tight_layout() + plt.savefig(f"tests/test_results/{name}.png") + plt.close() # Need to test this properly, have issues turning mel_spec back into audio @@ -32,14 +44,14 @@ def test_wav_mid_segments(self): class TestAmtDataset(unittest.TestCase): def test_build(self): matched_paths = [ - ("tests/test_data/147.wav", "tests/test_data/147.mid") + ("tests/test_data/maestro.wav", "tests/test_data/maestro1.mid") for _ in range(3) ] if os.path.isfile("tests/test_results/dataset.jsonl"): os.remove("tests/test_results/dataset.jsonl") AmtDataset.build( - matched_load_paths=matched_paths, + load_paths=matched_paths, save_path="tests/test_results/dataset.jsonl", ) @@ -61,6 +73,7 @@ def test_maestro(self): return tokenizer = AmtTokenizer() + audio_transform = AudioTransform() dataset = AmtDataset(load_path=MAESTRO_PATH) print(f"Dataset length: {len(dataset)}") for idx, (wav, src, tgt) in enumerate(dataset): @@ -74,8 +87,13 @@ def test_maestro(self): ) src_mid = src_mid_dict.to_midi() - if idx % 10 == 0: - src_mid.save(f"tests/test_results/dataset_{idx}.mid") + src_mid.save(f"tests/test_results/dataset_{idx}.mid") + torchaudio.save( + f"tests/test_results/wav_{idx}.wav", wav.unsqueeze(0), 16000 + ) + plot_spec( + audio_transform(wav.unsqueeze(0)).squeeze(0), f"mel_{idx}" + ) self.assertTrue(tokenizer.unk_tok not in src_dec) self.assertTrue(tokenizer.unk_tok not in tgt_dec) @@ -84,15 +102,6 @@ def test_maestro(self): class TestAug(unittest.TestCase): - def plot_spec(self, mel: torch.Tensor, name: str | int): - plt.figure(figsize=(10, 4)) - plt.imshow(mel, aspect="auto", origin="lower", cmap="viridis") - plt.colorbar(format="%+2.0f dB") - plt.title("(mel)-Spectrogram") - plt.tight_layout() - plt.savefig(f"tests/test_results/{name}.png") - plt.close() - def 
test_spec(self): SAMPLE_RATE, CHUNK_LEN = 16000, 30 audio_transform = AudioTransform() @@ -115,11 +124,11 @@ def test_spec(self): torchaudio.save("tests/test_results/shift.wav", shift_wav, SAMPLE_RATE) log_mel = log_mel_spectrogram(wav) - self.plot_spec(log_mel.squeeze(0), "orig") + plot_spec(log_mel.squeeze(0), "orig") _mel = audio_transform.mel_transform(spec) _log_mel = audio_transform.norm_mel(_mel) - self.plot_spec(_log_mel.squeeze(0), "new") + plot_spec(_log_mel.squeeze(0), "new") def test_pitch_aug(self): tokenizer = AmtTokenizer(return_tensors=True) @@ -147,6 +156,36 @@ def test_pitch_aug(self): for src_tok, tgt_tok in zip(src_aug_dec[1:], tgt_aug_dec): self.assertEqual(src_tok, tgt_tok) + def test_detune(self): + SAMPLE_RATE, CHUNK_LEN = 16000, 30 + audio_transform = AudioTransform() + wav, sr = torchaudio.load("tests/test_data/maestro.wav") + wav = torchaudio.functional.resample(wav, sr, SAMPLE_RATE).mean( + 0, keepdim=True + )[:, : SAMPLE_RATE * CHUNK_LEN] + + griffin_lim = torchaudio.transforms.GriffinLim( + n_fft=2048, + hop_length=160, + power=1, + n_iter=64, + ) + + spec = audio_transform.spec_transform(wav) + shift_spec = audio_transform.detune_spec(spec) + shift_wav = griffin_lim(shift_spec) + gl_wav = griffin_lim(spec) + torchaudio.save("tests/test_results/orig.wav", wav, SAMPLE_RATE) + torchaudio.save("tests/test_results/orig_gl.wav", gl_wav, SAMPLE_RATE) + torchaudio.save("tests/test_results/detune.wav", shift_wav, SAMPLE_RATE) + + log_mel = log_mel_spectrogram(wav) + plot_spec(log_mel.squeeze(0), "orig") + + _mel = audio_transform.mel_transform(spec) + _log_mel = audio_transform.norm_mel(_mel) + plot_spec(_log_mel.squeeze(0), "new") + def test_mels(self): SAMPLE_RATE, CHUNK_LEN = 16000, 30 audio_transform = AudioTransform() @@ -163,7 +202,7 @@ def test_mels(self): wavs = torch.stack((wav[0], wav[0], wav[0])) mels = audio_transform(wavs) for idx in range(mels.shape[0]): - self.plot_spec(mels[idx], idx) + plot_spec(mels[idx], idx) def test_distortion(self): SAMPLE_RATE, CHUNK_LEN = 16000, 30 @@ -226,5 +265,27 @@ def test_noise(self): torchaudio.save("tests/test_results/noise.wav", res, SAMPLE_RATE) +class TestDataLoader(unittest.TestCase): + def load_data(self, dataloader, num_batches=100): + for idx, data in enumerate(dataloader): + if idx >= num_batches: + break + + def test_profile_dl(self): + train_dataloader, val_dataloader = get_dataloaders( + train_data_path="/weka/proj-aria/aria-amt/data/train.jsonl", + val_data_path="/weka/proj-aria/aria-amt/data/train.jsonl", + batch_size=16, + num_workers=0, + ) + + profiler = cProfile.Profile() + profiler.enable() + self.load_data(train_dataloader, num_batches=10) + profiler.disable() + stats = pstats.Stats(profiler).sort_stats("cumulative") + stats.print_stats() + + if __name__ == "__main__": unittest.main()
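
For reference, a minimal sketch of how the reworked inference entry point introduced in this patch might be driven directly. It assumes `model` is an AmtEncoderDecoder with weights already loaded and that the audio paths below exist (see amt/run.py above for the real CLI wiring); the directory names are placeholders, not part of the patch.

    # Sketch only: `model` is assumed to be a loaded AmtEncoderDecoder.
    import glob

    from amt.inference.transcribe import batch_transcribe

    if __name__ == "__main__":
        wav_paths = glob.glob("audio/**/*.wav", recursive=True)  # placeholder dir
        batch_transcribe(
            file_paths=wav_paths,
            model=model,
            save_dir="transcriptions/",   # placeholder output dir
            batch_size=16,
            input_dir="audio/",           # mirrors the input tree under save_dir
            gpu_ids=None,                 # or e.g. [0, 1] to fan out over GPUs
            quantize=True,                # int8 weight-only quantization
        )

With quantize=True the weights are converted via amt.inference.quantize before the GPU manager processes are spawned, as shown in batch_transcribe above.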