diff --git a/.gitignore b/.gitignore
index f722a8b1..40392acb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,6 +60,3 @@ dataset/*
 /notebooks
 /local_scripts
 /notes
-
-Untitled.ipynb
-magnet_demo.ipynb
\ No newline at end of file
diff --git a/audiocraft/modules/conditioners.py b/audiocraft/modules/conditioners.py
index 599c17cb..19525d2e 100644
--- a/audiocraft/modules/conditioners.py
+++ b/audiocraft/modules/conditioners.py
@@ -708,8 +708,7 @@ class FeatureExtractor(WaveformConditioner):
     Then, we feed this excerpt to a feature extractor.
 
     Args:
-        model_name (str): 'encodec', 'musicfm' or 'mert'. For now 'musicfm'
-        is not supported.
+        model_name (str): 'encodec' or 'mert'.
         sample_rate (str): sample rate of the input audio. (32000)
         encodec_checkpoint (str): if encodec is used as a feature extractor, checkpoint of the model.
             ('//pretrained/facebook/encodec_32khz' is the default)
@@ -736,13 +735,10 @@ def __init__(
         use_middle_of_segment: bool = False, ds_rate_compression: int = 640,
         num_codebooks_lm: int = 4
     ):
-        assert model_name in ['encodec', 'musicfm', 'mert']
+        assert model_name in ['encodec', 'mert']
         if model_name == 'encodec':
             from ..solvers.compression import CompressionSolver
             feat_extractor = CompressionSolver.model_from_checkpoint(encodec_checkpoint, device)
-        # elif model_name == 'musicfm':
-        #     from ..musicfmbis.model.musicfm_25hz import MusicFM25Hz
-        #     feat_extractor = MusicFM25Hz()
         elif model_name == 'mert':
             from transformers import AutoModel
             feat_extractor = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True)
@@ -759,9 +755,6 @@ def __init__(
             self.__dict__["feat_extractor"] = feat_extractor.to(device)
             self.encodec_n_q = encodec_n_q
             self.embed = nn.ModuleList([nn.Embedding(feat_extractor.cardinality, dim) for _ in range(encodec_n_q)])
-        elif model_name == 'musicfm':
-            self.__dict__["feat_extractor"] = feat_extractor.eval().to(device)
-            self.embed = nn.Linear(1024, dim)  # hardcoded
         if model_name == 'mert':
             self.__dict__["feat_extractor"] = feat_extractor.eval().to(device)
             self.embed = nn.Linear(768, dim)  # hardcoded
@@ -787,9 +780,6 @@ def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
             self.temp_mask = self._get_mask_wav(x, start)
         if self.model_name == 'encodec':
             tokens = self.feat_extractor.encode(wav)[0]  # type: ignore
-        elif self.model_name == 'musicfm':
-            wav = convert_audio(wav, from_rate=x.sample_rate[0], to_rate=24000, to_channels=1)
-            embeds = self.feat_extractor.get_latent(wav, layer_ix=6)  # type: ignore
         elif self.model_name == 'mert':
             wav = convert_audio(wav, from_rate=x.sample_rate[0], to_rate=24000, to_channels=1)
             embeds = self.feat_extractor(wav.squeeze(-2)).last_hidden_state
@@ -804,8 +794,6 @@ def _get_wav_embedding(self, x: WavCondition) -> torch.Tensor:
     def _downsampling_factor(self):
         if self.model_name == 'encodec':
             return self.sample_rate / self.feat_extractor.frame_rate
-        elif self.model_name == 'musicfm':
-            return self.sample_rate / 25
         elif self.model_name == 'mert':
             return self.sample_rate / 75
 
diff --git a/config/solver/musicgen/musicgen_style_32khz.yaml b/config/solver/musicgen/musicgen_style_32khz.yaml
index a4dd85f7..35dc4927 100644
--- a/config/solver/musicgen/musicgen_style_32khz.yaml
+++ b/config/solver/musicgen/musicgen_style_32khz.yaml
@@ -1,7 +1,7 @@
 # @package __global__
 
 # This is the training loop solver
-# for the base MusicGen model (text-to-music)
+# for MusicGen-Style model (text-and-style-to-music)
 # on monophonic audio sampled at 32 kHz
 defaults:
   - musicgen/default
diff --git a/config/teams/labs.yaml b/config/teams/labs.yaml
index bc95fb4d..fe662d8a 100644
--- a/config/teams/labs.yaml
+++ b/config/teams/labs.yaml
@@ -1,29 +1,19 @@
 aws:
-  dora_dir: /fsx-codegen/${oc.env:USER}/experiments/audiocraft/outputs
+  dora_dir: /fsx-audio-craft-llm/${oc.env:USER}/experiments/audiocraft/outputs
   partitions:
     global: learnlab
     team: learnlab
-  reference_dir: /fsx-codegen/shared/audiocraft/reference
+  reference_dir: /fsx-audio-craft-llm/shared/audiocraft/reference
   dataset_mappers:
-    "^/checkpoint/jadecopet/datasets/mmi/mmi_11k_32khz": "/fsx-shutterstock-music-resampled/dataset/mmi_11k_32khz"
-    # "^/fsx-audio-craft-llm/datasets/mmi_nv/mmi_11k"
-    "^/fsx-audio-craft-llm/datasets/shutterstock_nv/p5": "/fsx-shutterstock-music-original/dataset/p5"
-    "^/fsx-audio-craft-llm/datasets/shutterstock_nv/shutterstock": "/fsx-shutterstock-music-original/dataset/shutterstock"
+    "^/checkpoint/[a-z]+": "/fsx-audio-craft-llm"
 fair:
   dora_dir: /checkpoint/${oc.env:USER}/experiments/audiocraft/outputs
   partitions:
     global: learnlab
     team: learnlab
-  reference_dir: /large_experiments/audiocraft/audiocraft_magma/reference
+  reference_dir: /large_experiments/audiocraft/reference
   dataset_mappers:
     "^/datasets01/datasets01": "/datasets01"
-    "^/datasets01/shutterstock-music-resampled/shutterstock_32khz_wav": "/large_experiments/audiocraft/datasets/datasets_32khz_mono/shutterstock"
-    "^/datasets01/shutterstock-music-resampled/p5_32khz_wav": "/large_experiments/audiocraft/datasets/datasets_32khz_mono/p5"
-    "^/checkpoint/jadecopet/datasets/mmi/mmi_11k_32khz": "/large_experiments/audiocraft/datasets/datasets_32khz_mono/mmi_11k"
-    "^/fsx-shutterstock-music-resampled/dataset/p5_32khz_wav": "/large_experiments/audiocraft/datasets/datasets_32khz_mono/p5"
-    "^/fsx-shutterstock-music-resampled/dataset/shutterstock_32khz_wav": "/large_experiments/audiocraft/datasets/datasets_32khz_mono/shutterstock"
-    "^/large_experiments/audiocraft/datasets/datasets_32khz_mono/shutterstock/db/e54/c55/f6f3/5329/14443.wav": "/datasets01/shutterstock-music-resampled/shutterstock_32khz_wav/db/e54/c55/f6f3/5329/14443.wav"
-    # last line is to fix the missing file '/large_experiments/audiocraft/datasets/datasets_32khz_mono/shutterstock/db/e54/c55/f6f3/5329/14443.wav'
 darwin:
   dora_dir: /tmp/audiocraft_${oc.env:USER}
   partitions:
diff --git a/docs/MUSICGEN_STYLE.md b/docs/MUSICGEN_STYLE.md
index 6814dcbe..e945571a 100644
--- a/docs/MUSICGEN_STYLE.md
+++ b/docs/MUSICGEN_STYLE.md
@@ -196,37 +196,3 @@ dora run solver=musicgen/musicgen_style_32khz model/lm/model_scale=medium contin
 to change some parts, like the conditioning or some other parts of the model, you are responsible for manually crafting a checkpoint file from which we can safely run `load_state_dict`.
 If you decide to do so, make sure your checkpoint is saved with `torch.save` and contains a dict `{'best_state': {'model': model_state_dict_here}}`.
 Directly give the path to `continue_from` without a `//pretrained/` prefix.
-
-
-## FAQ
-
-#### What are top-k, top-p, temperature and classifier-free guidance?
-
-Check out [@FurkanGozukara tutorial](https://github.com/FurkanGozukara/Stable-Diffusion/blob/main/Tutorials/AI-Music-Generation-Audiocraft-Tutorial.md#more-info-about-top-k-top-p-temperature-and-classifier-free-guidance-from-chatgpt).
-
-#### Should I use FSDP or autocast ?
-
-The two are mutually exclusive (because FSDP does autocast on its own).
-You can use autocast up to 1.5B (medium), if you have enough RAM on your GPU.
-FSDP makes everything more complex but will free up some memory for the actual
-activations by sharding the optimizer state.
-
-## Citation
-```
-@misc{rouard2024audioconditioningmusicgeneration,
-      title={Audio Conditioning for Music Generation via Discrete Bottleneck Features},
-      author={Simon Rouard and Yossi Adi and Jade Copet and Axel Roebel and Alexandre Défossez},
-      year={2024},
-      eprint={2407.12563},
-      archivePrefix={arXiv},
-      primaryClass={cs.SD},
-      url={https://arxiv.org/abs/2407.12563},
-}
-```
-
-## License
-
-See license information in the [model card](../model_cards/MUSICGEN_STYLE_MODEL_CARD.md).
-
-[arxiv]: https://arxiv.org/abs/2407.12563
-[musicgen_style_samples]: https://musicgenstyle.github.io/
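
For reference (not part of the patch): a minimal sketch of the 'mert' feature-extraction path that this diff keeps, pulled out of `FeatureExtractor` into a hypothetical standalone helper. The function name `extract_mert_features` and the `dim` default are illustrative only; the real logic lives in `audiocraft/modules/conditioners.py` and expects audio already converted to 24 kHz mono (see `convert_audio` in `_get_wav_embedding`).

```python
# Illustrative sketch only -- mirrors the retained 'mert' branch of FeatureExtractor
# after this diff; names and defaults here are hypothetical.
import torch
from torch import nn
from transformers import AutoModel


def extract_mert_features(wav: torch.Tensor, dim: int = 512) -> torch.Tensor:
    """wav: [B, 1, T] mono audio at 24 kHz."""
    feat_extractor = AutoModel.from_pretrained("m-a-p/MERT-v1-95M", trust_remote_code=True).eval()
    # MERT-v1-95M hidden size is 768, as hardcoded in the diff; the projection is
    # randomly initialised here, whereas in the conditioner it is trained.
    embed = nn.Linear(768, dim)
    with torch.no_grad():
        # Drop the channel dim and take the last hidden state: [B, T', 768],
        # at roughly 75 frames per second (hence the /75 downsampling factor).
        embeds = feat_extractor(wav.squeeze(-2)).last_hidden_state
    return embed(embeds)  # [B, T', dim]
```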