update autocap
MoayedHajiAli committed Oct 14, 2024
1 parent 7503e62 commit 0268089
Showing 12 changed files with 73 additions and 53 deletions.
32 changes: 24 additions & 8 deletions AutoCap/README.md
@@ -3,7 +3,7 @@
# AutoCap inference, training and evaluation
- [Inference](#inference)
* [Audio to text script](#audio-to-text)
* [Gradio demo](#gradio-demo)
<!-- * [Gradio demo](#gradio-demo) -->
* [Caption a list of audio files](#caption-list-of-audio-files)
* [Caption your custom dataset](#caption-a-dataset)
- [Training](#training)
@@ -24,14 +24,14 @@ python scripts/audio_to_text.py --wav_path <path-to-wav-file>
# Example inference
python scripts/audio_to_text.py --wav_path samples/ood_samples/loudwhistle-91003.wav
```
- This will automatically download the `TODO` model and run inference with the default parameters. You may change these parameters or provide your custom model config file and checkpoint path.
- This will automatically download the `autocap-full` model and run inference with the default parameters. You may change these parameters or provide your custom model config file and checkpoint path.
- For more accurate captioning, provide metadata using the `--title`, `--description`, and `--video_caption` arguments, as in the example below.
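
A hypothetical invocation with metadata; the flag names come from the bullet above, while the title and description values are purely illustrative:

```shell
python scripts/audio_to_text.py \
    --wav_path samples/ood_samples/loudwhistle-91003.wav \
    --title "Loud whistle" \
    --description "A short recording of a person whistling loudly" \
    --video_caption "A man whistles twice in an open field"
```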

## Gradio Demo
<!-- ## Gradio Demo
A local Gradio demo is also available by running
```shell
python app_audio2text.py
```
``` -->

## Caption list of audio files
- Prepare all target audio files in a single folder
@@ -54,7 +54,7 @@ python scripts/inference_folder.py --folder_path samples/ood_samples --meta_data
## Caption your custom dataset

If you want to caption a large dataset, we provide a script that works with multiple GPUs for faster inference.
- Prepare your custom dataset by following the instructions in the dataset preparation README (TODO) and run
- Prepare your custom dataset by following the instructions in the [dataset preparation README](../dataset_preperation/README.md) and run

```shell
python scripts/caption_dataset.py \
@@ -64,16 +64,23 @@ python scripts/caption_dataset.py \
--end_idx 1000000 \
--dataset_keys "dataset_1" "dataset_2" ...

# Example
python scripts/caption_dataset.py \
--caption_store_key autocap_caption \
--beam_size 2 \
--start_idx 0 \
--end_idx 100 \
    --dataset_keys "wavcaps_soundbible"
```
- Provide your dataset keys as registered in the dataset preparation (TODO)
- Provide your dataset keys as registered during the [dataset preparation](../dataset_preperation/README.md) process
- Captions will be generated and stored in each file's JSON file under the specified `caption_store_key`
- The `start_idx` and `end_idx` arguments can be used to resume or distribute captioning experiments
- Add your `caption_store_key` under `keys_synonyms:gt_audio_caption` in the target YAML config file so it is selected when the ground-truth caption is not available in your audio captioning or audio generation experiments; see the sketch after this list.
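
As a sketch, the corresponding YAML entry might look like the following; the surrounding structure is an assumption, since only the `keys_synonyms:gt_audio_caption` path is given above:

```yaml
# Hypothetical config excerpt: list your caption_store_key as a fallback
# synonym for the ground-truth caption key.
keys_synonyms:
  gt_audio_caption:
    - gt_audio_caption
    - autocap_caption   # your caption_store_key
```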


# Training
### Dataset
Please refer to the dataset README (TODO) for instructions on downloading our dataset or preparing your own dataset.
Please refer to the [dataset preparation README](../dataset_preperation/README.md) for instructions on downloading our dataset or preparing your own dataset.

### Stage 1 (pretraining)
- Specify your model parameters in a config yaml file. A sample yaml file is given under `settings/pretraining.yaml`
@@ -105,7 +112,16 @@ python evaluate.py -c <path-to-config> -ckpt <path-to-checkpoint>
# Cite this work
If you found this useful, please consider citing our work

```TODO
```
@misc{hajiali2024tamingdatatransformersaudio,
title={Taming Data and Transformers for Audio Generation},
author={Moayed Haji-Ali and Willi Menapace and Aliaksandr Siarohin and Guha Balakrishnan and Sergey Tulyakov and Vicente Ordonez},
year={2024},
eprint={2406.19388},
archivePrefix={arXiv},
primaryClass={cs.SD},
url={https://arxiv.org/abs/2406.19388},
}
```

# Acknowledgements
Empty file added AutoCap/__init__.py
5 changes: 0 additions & 5 deletions AutoCap/scripts/audio_to_text.py
@@ -1,8 +1,3 @@
#!/usr/bin/env python3
# coding: utf-8
# @Author : Xinhao Mei @CVSSP, University of Surrey
# @E-mail : [email protected]

import __init__
import argparse

7 changes: 1 addition & 6 deletions AutoCap/scripts/caption_dataset.py
@@ -1,8 +1,3 @@
#!/usr/bin/env python3
# coding: utf-8
# @Author : Xinhao Mei @CVSSP, University of Surrey
# @E-mail : [email protected]

import __init__
import os
import argparse
@@ -84,7 +79,7 @@ def main():
setup_seed(config['training']["seed"])
seed_everything(config['training']["seed"])

config["inference_exp_name"] = f"inference_{os.path.basename(args.config)[:-5]}" # remove .yaml
config["inference_exp_name"] = f"inference_{os.path.basename(args.config)[:-5]}"
config['beam_size'] = args.beam_size
config['caption_key'] = args.caption_store_key

5 changes: 0 additions & 5 deletions AutoCap/scripts/inference_folder.py
@@ -1,8 +1,3 @@
#!/usr/bin/env python3
# coding: utf-8
# @Author : Xinhao Mei @CVSSP, University of Surrey
# @E-mail : [email protected]

import __init__
import os
import argparse
15 changes: 12 additions & 3 deletions GenAU/README.md
@@ -103,22 +103,31 @@ python train/1d_vae.py -c settings/simple_runs/1d_vae.yaml
## Evaluation
- We follow [audioldm](https://github.com/haoheliu/AudioLDM-training-finetuning) to perform our evaluations.
- By default, the models will be evaluated periodically during training as specified in the config file. For each evaluation, a folder with the generated audio will be saved under `run_logs/train` at the same level as the specified config file.
- The code identifies the test dataset in an already existing folder according to the number of samples. If you would like to test on a new test dataset, register it in `scripts/generate_and_eval`
- The code identifies the test dataset in an already existing folder according to the number of samples. If you would like to test on a new test dataset, register it in `scripts/generate_and_eval` or provide the `--evaluation_dataset` name.

```shell

# Evaluate on an existing generated folder
python scripts/evaluate.py --log_path <path-to-the-experiment-folder>

# Generate test audio from a pre-trained checkpoint and run evaluation
python scripts/generate_and_eval.py -c <path-to-config> -ckpt <path-to-pretrained-ckpt>
python scripts/generate_and_eval.py -c <path-to-config> -ckpt <path-to-pretrained-ckpt> --evaluation_dataset audiocaps
```
The evaluation result will be saved in a JSON file at the same level as the generated audio folder.

# Cite this work
If you found this useful, please consider citing our work

```TODO
```
@misc{hajiali2024tamingdatatransformersaudio,
title={Taming Data and Transformers for Audio Generation},
author={Moayed Haji-Ali and Willi Menapace and Aliaksandr Siarohin and Guha Balakrishnan and Sergey Tulyakov and Vicente Ordonez},
year={2024},
eprint={2406.19388},
archivePrefix={arXiv},
primaryClass={cs.SD},
url={https://arxiv.org/abs/2406.19388},
}
```

# Acknowledgements
29 changes: 23 additions & 6 deletions GenAU/scripts/generate_and_eval.py
@@ -57,7 +57,7 @@ def locate_validation_output(path):
return folders


def evaluate_exp_performance(exp_name):
def evaluate_exp_performance(exp_name, evaluation_dataset=None):
abs_path_exp = os.path.join(latent_diffusion_model_log_path, exp_name)
config_yaml_path = locate_yaml_file(abs_path_exp)

@@ -68,6 +68,8 @@ def evaluate_exp_performance(exp_name):
folders_todo = locate_validation_output(abs_path_exp)

for folder in folders_todo:
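# an explicitly provided evaluation dataset takes precedence over the size-based detection below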
if evaluation_dataset is not None:
test_dataset = evaluation_dataset
elif len(os.listdir(folder)) == 964:
test_dataset = "audiocaps"
elif len(os.listdir(folder)) > 5000:
@@ -82,7 +84,10 @@ def evaluate_exp_performance(exp_name):
evaluator.main(folder, test_audio_data_folder)

@torch.no_grad()
def generate_test_audio(configs, config_yaml_path, exp_group_name, exp_name, use_wav_cond=False, strategy='wo_ema', batch_size=244, n_candidates_per_samples=1, ckpt=None):
def generate_test_audio(configs, config_yaml_path, exp_group_name,
exp_name, use_wav_cond=False, strategy='wo_ema',
batch_size=244, n_candidates_per_samples=1, ckpt=None,
evaluation_dataset=None):
if "seed" in configs.keys():
seed_everything(configs["seed"])
else:
@@ -186,6 +191,8 @@ def generate_test_audio(configs, config_yaml_path, exp_group_name, exp_name, use


# copy test data if it does not exist
if evaluation_dataset is not None:
assert evaluation_dataset == val_dataset.dataset_name, f"[ERROR, generate_and_eval.py] the given evaluation dataset {evaluation_dataset} and the specified dataset_name of the test dataset {val_dataset.dataset_name} do not match."
test_data_subset_folder = os.path.join(
os.path.dirname(configs['logging']["log_directory"]),
"testset_data",
@@ -195,10 +202,10 @@ def generate_test_audio(configs, config_yaml_path, exp_group_name, exp_name, use
copy_test_subset_data(val_dataset, test_data_subset_folder)


def eval(exps):
def eval(exps, evaluation_dataset=None):
for exp in exps:
try:
evaluate_exp_performance(exp)
evaluate_exp_performance(exp, evaluation_dataset=evaluation_dataset)
except Exception as e:
print(exp, e)

@@ -215,6 +222,13 @@ def eval(exps):
required=True,
help="path to config .yaml file",
)
parser.add_argument(
"--evaluation_dataset",
type=str,
default=None,
required=False,
help="target dataset to run the evaluation on",
)

parser.add_argument(
"-s",
@@ -260,7 +274,10 @@ def eval(exps):
configs = configuration.get_config()

# generate audio
generate_test_audio(configs, config_yaml_path, exp_group_name, exp_name, strategy=args.strategy, batch_size=args.batch_size, n_candidates_per_samples=args.n_candidates_per_samples, ckpt=args.ckpt)
generate_test_audio(configs, config_yaml_path, exp_group_name, exp_name,
strategy=args.strategy, batch_size=args.batch_size,
n_candidates_per_samples=args.n_candidates_per_samples, ckpt=args.ckpt,
evaluation_dataset=args.evaluation_dataset)

test_audio_path = os.path.join(
os.path.dirname(configs['logging']["log_directory"]),
Expand All @@ -271,4 +288,4 @@ def eval(exps):
# copy config path
shutil.copy(config_yaml_path, os.path.join(configs['logging']["log_directory"], exp_group_name, exp_name))

eval([exp_name])
eval([exp_name], evaluation_dataset=args.evaluation_dataset)
2 changes: 1 addition & 1 deletion GenAU/src/models/genau_ddpm.py
@@ -2268,7 +2268,7 @@ def generate_sample(
mel = self.decode_first_stage(samples)

if self.log_melspectrogran:
mel_grid = make_grid(mel, nrows=2) # TODO: decide on the number of rows
mel_grid = make_grid(mel, nrows=2)
self.log_spectrograms(mel[:4].exp(), "val/mel_spectrogram", self.global_step)
self.logger.experiment.log({"val/mel_spectrogram": [wandb.Image(mel_grid.permute(1, 2, 0).detach().cpu().numpy(), caption="Spectrograms")]}, step=self.global_step, commit=False)

9 changes: 0 additions & 9 deletions GenAU/src/utilities/data/videoaudio_dataset.py
@@ -711,23 +711,14 @@ def run(self):


def read_wav_file(self, filename, random_start=None):

# waveform, sr = librosa.load(filename, sr=None, mono=True) # 4 times slower
# waveform = torch.from_numpy(waveform)
# print("waveform shape", waveform.shape)
waveform, sr = self.load_audio_with_timeout(filename, timeout=10)
if waveform is None:
print("[INFO] timeout when loading the audio")
# # # TODO Important, dummy audio
waveform = torch.zeros(1, int(self.sampling_rate * self.duration))
sr = 16000

# waveform = torch.zeros(1, int(self.sampling_rate * self.duration))
# sr = 16000
# waveform, sr = torchaudio.load(filename)
# # # TODO Important, dummy audio
# waveform = torch.zeros(1, int(self.sampling_rate * self.duration))

waveform, random_start = self.random_segment_wav(
waveform, target_length=int(sr * self.duration), random_start=random_start
)
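
The hunk above depends on a `load_audio_with_timeout` helper that is not shown in this diff. As a rough illustration, such a guard could be built with a worker thread; this is a hypothetical sketch rather than the repository's implementation, assuming torchaudio and the `(None, None)` failure contract used by the calling code:

```python
import concurrent.futures

import torchaudio


def load_audio_with_timeout(filename, timeout=10):
    """Load audio in a worker thread; return (None, None) on timeout."""
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = pool.submit(torchaudio.load, filename)  # returns (waveform, sample_rate)
    try:
        return future.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        # A stuck worker thread cannot be force-killed; it is abandoned and the
        # caller falls back to a dummy silent waveform, as in the hunk above.
        return None, None
    finally:
        pool.shutdown(wait=False)
```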
10 changes: 9 additions & 1 deletion README.md
@@ -57,5 +57,13 @@ See [GenAU](./GenAU/README.md) README for details on inference, training, finetu
## Citation
If you find this paper useful in your research, please consider citing our work:
```
TODO
@misc{hajiali2024tamingdatatransformersaudio,
title={Taming Data and Transformers for Audio Generation},
author={Moayed Haji-Ali and Willi Menapace and Aliaksandr Siarohin and Guha Balakrishnan and Sergey Tulyakov and Vicente Ordonez},
year={2024},
eprint={2406.19388},
archivePrefix={arXiv},
primaryClass={cs.SD},
url={https://arxiv.org/abs/2406.19388},
}
```
10 changes: 2 additions & 8 deletions dataset_preperation/download_external_datasets.py
@@ -75,7 +75,7 @@ def download_and_organize_wavcaps(dataset_root, subset_key, autocap_captions_fil
snapshot_download(repo_id="cvssp/WavCaps", local_dir=dataset_dir, repo_type="dataset", allow_patterns=['Zip_files/SoundBible/*', 'json_files/*'])
os.system(f"unzip {os.path.join(dataset_dir, 'Zip_files/SoundBible/SoundBible.zip')} -d {os.path.join(dataset_dir, 'wavcaps_soundbible')}")
base_path = os.path.join(dataset_dir, "wavcaps_soundbible", "mnt/fast/nobackup/scratch4weeks/xm00178/WavCaps/data/waveforms/SoundBible_flac")
central_json_path = os.path.join(dataset_dir, "json_files/SoundBibl0e/sb_final.json")
central_json_path = os.path.join(dataset_dir, "json_files/SoundBible/sb_final.json")
update_wavcaps_json_files(base_path, central_json_path, autocap_captions=autocap_captions['Soundbible'])

# bbc
@@ -137,13 +137,7 @@ def download_and_organize_wavcaps(dataset_root, subset_key, autocap_captions_fil
args = parser.parse_args()

for dataset_key in args.dataset_names:
if dataset_key == 'clotho':
download_and_organize_clotho(args.save_root)

elif dataset_key == 'audiocaps':
download_and_organize_audiocaps(args.save_root)

elif dataset_key.startswith('wavcaps'):
if dataset_key.startswith('wavcaps'):
download_and_organize_wavcaps(args.save_root, dataset_key)

else:
2 changes: 1 addition & 1 deletion dataset_preperation/organize_dataset.py
@@ -86,7 +86,7 @@ def find_json_files(directory):

parser.add_argument("--overwrite",
default=False,
action="store_true"
action="store_true",
help="Overwrite dataset metadata")

args = parser.parse_args()
