From 47dbd5c42aea5d1919244d6adb82178ac90338bb Mon Sep 17 00:00:00 2001 From: Hao Hao Tan Date: Tue, 17 Oct 2023 22:14:28 +0800 Subject: [PATCH] [chore] bump to 0.2.0 (#22) --- README.md | 17 ++++++++++++++++- frechet_audio_distance/fad.py | 3 --- pyproject.toml | 11 +++++++---- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 89f0d16..5bb3b76 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,8 @@ A lightweight library of Frechet Audio Distance calculation. Currently, we support embedding from: - `VGGish` by [S. Hershey et al.](https://arxiv.org/abs/1812.08466) -- `PANN` by [Kong et al.](https://arxiv.org/abs/1912.10211). +- `PANN` by [Kong et al.](https://arxiv.org/abs/1912.10211) +- `CLAP` by [Wu et al.](https://arxiv.org/abs/2211.06687) ### Installation @@ -18,6 +19,7 @@ from frechet_audio_distance import FrechetAudioDistance # to use `vggish` frechet = FrechetAudioDistance( model_name="vggish", + sample_rate=16000, use_pca=False, use_activation=False, verbose=False @@ -25,14 +27,27 @@ frechet = FrechetAudioDistance( # to use `PANN` frechet = FrechetAudioDistance( model_name="pann", + sample_rate=16000, use_pca=False, use_activation=False, verbose=False ) +# to use `CLAP` +frechet = FrechetAudioDistance( + model_name="clap", + sample_rate=48000, + submodel_name="630k-audioset", # for CLAP only + verbose=False, + enable_fusion=False, # for CLAP only +) fad_score = frechet.score("/path/to/background/set", "/path/to/eval/set", dtype="float32") ``` +You can also have a look at [this notebook](https://github.com/gudgud96/frechet-audio-distance/blob/main/test/test_all.ipynb) for a better understanding of how each model is used. + +### Save pre-computed embeddings + When computing the Frechet Audio Distance, you can choose to save the embeddings for future use. This capability not only ensures consistency across evaluations but can also significantly reduce computation time, especially if you're evaluating multiple times using the same dataset. diff --git a/frechet_audio_distance/fad.py b/frechet_audio_distance/fad.py index 8e260ea..d07c068 100644 --- a/frechet_audio_distance/fad.py +++ b/frechet_audio_distance/fad.py @@ -19,11 +19,8 @@ from .models.pann import Cnn14, Cnn14_8k, Cnn14_16k -# SAMPLE_RATE = 16000 - def load_audio_task(fname, sample_rate, dtype="float32"): - # print("LOAD AUDIO TASK") if dtype not in ['float64', 'float32', 'int32', 'int16']: raise ValueError(f"dtype not supported: {dtype}") diff --git a/pyproject.toml b/pyproject.toml index c44c04a..fddf827 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "frechet_audio_distance" -version = "0.1.2" +version = "0.2.0" authors = [ { name="Hao Hao Tan", email="helloharry66@gmail.com" }, ] @@ -18,13 +18,16 @@ classifiers = [ "Operating System :: OS Independent", ] dependencies = [ - 'numpy', + 'numpy==1.23.4', 'torch', - 'scipy', + 'scipy==1.10.1', 'tqdm', 'soundfile', 'resampy', - 'torchlibrosa' + 'torchlibrosa', + 'laion_clap', + 'transformers<=4.30.2', + 'torchaudio', ] [project.urls]