diff --git a/requirements.txt b/requirements.txt index 04f663c..5e680d4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ pandas scipy hfutils>=0.4.1 soundfile>=0.12 -fastdtw \ No newline at end of file +fastdtw +librosa>=0.10 \ No newline at end of file diff --git a/soundutils/data/sound.py b/soundutils/data/sound.py index 3aecc89..76db31f 100644 --- a/soundutils/data/sound.py +++ b/soundutils/data/sound.py @@ -48,7 +48,7 @@ def resample(self, sample_rate) -> 'Sound': if sample_rate == self._sample_rate: return self - resampled_length = int(self.samples * (sample_rate / self._sample_rate)) + resampled_length = int(round(self.samples * (sample_rate / self._sample_rate))) resampled_data = signal.resample(self._data, resampled_length) return Sound(data=resampled_data, sample_rate=sample_rate) @@ -141,7 +141,7 @@ def _fmt_time(x, pos): ax.set_xlabel('Time [hh:mm:ss.mmm]') ax.set_ylabel('Amplitude') - ax.set_title(f'{title or "Audio Signal"}\n' - f'Channels: {self.channels}, Sample Rate: {self._sample_rate}\n' - f'Time: {self.time:.3f}s ({plural_word(self.samples, "frame")})\n') + ax.set_title(f'{title or "Audio Signal"}, ' + f'Channels: {self.channels}, Sample Rate: {self._sample_rate}, ' + f'Time: {self.time:.3f}s ({plural_word(self.samples, "frame")})') ax.legend() diff --git a/soundutils/similarity/__init__.py b/soundutils/similarity/__init__.py index abff1fe..d172ae5 100644 --- a/soundutils/similarity/__init__.py +++ b/soundutils/similarity/__init__.py @@ -1,5 +1,6 @@ from .base import SoundAlignError, SoundLengthNotMatch, SoundResampleRateNotMatch, SoundChannelsNotMatch from .correlation import sound_pearson_similarity from .dtw import sound_fastdtw +from .mfcc import sound_mfcc_similarity from .mse import sound_mse, sound_rmse from .spectral import sound_spectral_centroid_distance diff --git a/soundutils/similarity/mfcc.py b/soundutils/similarity/mfcc.py new file mode 100644 index 0000000..e9c28ba --- /dev/null +++ 
def sound_mfcc_similarity(
        sound1: SoundTyping, sound2: SoundTyping,
        n_mfcc: int = 13, mode: Literal['flat', 'mean'] = 'flat',
        resample_rate_align: Literal['max', 'min', 'none'] = 'none',
        time_align: Literal['none', 'pad', 'prefix', 'resample_max', 'resample_min'] = 'none',
        channels_align: Literal['none'] = 'none',
) -> float:
    """
    Compute an MFCC-based cosine similarity between two sounds.

    Both sounds are first passed through ``_align_sounds``. For every index
    along the first axis of the aligned data (iterated here as channels), the
    MFCC matrix of each sound is computed with ``librosa.feature.mfcc``,
    reduced to a 1-D feature vector according to ``mode``, and compared with
    ``1 - cosine_distance``. The per-channel scores are then averaged.

    :param sound1: First sound, forwarded unchanged to ``_align_sounds``.
    :param sound2: Second sound, forwarded unchanged to ``_align_sounds``.
    :param n_mfcc: Number of MFCC coefficients requested from librosa.
    :param mode: How an MFCC matrix becomes a feature vector. ``'flat'``
        flattens the whole matrix; ``'mean'`` averages over its last axis.
    :param resample_rate_align: Sample-rate alignment strategy, forwarded to
        ``_align_sounds``.
    :param time_align: Length alignment strategy, forwarded to
        ``_align_sounds``.
    :param channels_align: Channel alignment strategy, forwarded to
        ``_align_sounds``.
    :return: Mean of the per-channel similarities as a plain Python float.
    :raises ValueError: If ``mode`` is neither ``'flat'`` nor ``'mean'``.
        This is checked before any audio is loaded or aligned.

    .. note::
        Any error raised by ``_align_sounds`` (for example when the two
        sounds cannot be aligned with the chosen strategies) propagates
        unchanged.
    """
    # Fail fast: pick the reducer once, before the (potentially expensive)
    # alignment and MFCC extraction, instead of re-checking ``mode`` on every
    # channel after the work has already been done.
    if mode == 'flat':
        def _to_feature(mfcc):
            return mfcc.flatten()
    elif mode == 'mean':
        def _to_feature(mfcc):
            return np.mean(mfcc, axis=-1)
    else:
        raise ValueError(f'Invalid mode for MFCC - {mode!r}.')

    (data1, sr1), (data2, sr2) = _align_sounds(
        sound1=sound1,
        sound2=sound2,
        resample_rate_align=resample_rate_align,
        time_align=time_align,
        channels_align=channels_align,
    )

    similarities = []
    for ch in range(data1.shape[0]):
        feat1 = _to_feature(librosa.feature.mfcc(y=data1[ch], sr=sr1, n_mfcc=n_mfcc))
        feat2 = _to_feature(librosa.feature.mfcc(y=data2[ch], sr=sr2, n_mfcc=n_mfcc))
        # scipy's ``cosine`` is a distance; convert it to a similarity.
        similarities.append(1 - cosine(feat1, feat2))

    # ``.item()`` unwraps the numpy scalar into a built-in float.
    return np.mean(similarities).item()
@pytest.mark.unittest
class TestSimilarityMFCC:
    """
    Parametrised checks for ``sound_mfcc_similarity``.

    Every case names two files under the ``assets`` test directory and an
    expectation that is either a float (compared through ``pytest.approx``)
    or an exception class (which the call must raise).
    """

    @staticmethod
    def _check(file1, file2, expected, **options):
        """
        Run ``sound_mfcc_similarity`` on two asset files and verify the outcome.

        ``options`` are forwarded as keyword arguments to the function under
        test. ``expected`` is either an exception class or a number.
        """
        path1 = get_testfile('assets', file1)
        path2 = get_testfile('assets', file2)
        expects_error = isinstance(expected, type) and issubclass(expected, BaseException)
        if not expects_error:
            assert sound_mfcc_similarity(path1, path2, **options) == pytest.approx(expected)
            return

        with pytest.raises(expected):
            _ = sound_mfcc_similarity(path1, path2, **options)

    @pytest.mark.parametrize(('file1', 'file2', 'v'), [
        ('texas_short.wav', 'texas_short.wav', 1.0),
        ('texas_short.wav', 'texas_long.wav', SoundLengthNotMatch),
        ('texas_long.wav', 'texas_long.wav', 1.0),
        ('texas_long.wav', 'texas_long.flac', 1.0),
        ('texas_long.wav', 'texas_long.mp3', 0.9987497286333322),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', SoundResampleRateNotMatch),
        ('texas_short.wav', 'stereo_sine_wave_44100.wav', SoundChannelsNotMatch),
    ])
    def test_sound_mfcc_similarity(self, file1, file2, v):
        # Default arguments: no alignment of any kind is requested.
        self._check(file1, file2, v)

    @pytest.mark.parametrize(('file1', 'file2', 'v'), [
        ('texas_short.wav', 'texas_short.wav', 1.0),
        ('texas_short.wav', 'texas_long.wav', 0.8899687300652068),
        ('texas_long.wav', 'texas_short.wav', 0.8899687300652068),
        ('texas_long.wav', 'texas_long.wav', 1.0),
        ('texas_long.wav', 'texas_long.flac', 1.0),
        ('texas_long.wav', 'texas_long.mp3', 0.9987497286333322),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', SoundResampleRateNotMatch),
        ('texas_short.wav', 'stereo_sine_wave_44100.wav', SoundChannelsNotMatch),
    ])
    def test_sound_mfcc_similarity_pad(self, file1, file2, v):
        # Same pairs as above, but with ``time_align='pad'``.
        self._check(file1, file2, v, time_align='pad')

    @pytest.mark.parametrize(('file1', 'file2', 'time_align', 'v'), [
        ('texas_short.wav', 'texas_short.wav', 'none', 1.0),
        ('texas_short.wav', 'texas_short.wav', 'pad', 1.0),
        ('texas_short.wav', 'texas_short.wav', 'prefix', 1.0),
        ('texas_short.wav', 'texas_short.wav', 'resample_max', 1.0),
        ('texas_short.wav', 'texas_short.wav', 'resample_min', 1.0),
        ('texas_short.wav', 'texas_short.wav', 'bullshit', ValueError),

        ('texas_short.wav', 'texas_long.wav', 'none', SoundLengthNotMatch),
        ('texas_short.wav', 'texas_long.wav', 'pad', 0.8899687300652068),
        ('texas_long.wav', 'texas_short.wav', 'pad', 0.8899687300652068),
        ('texas_short.wav', 'texas_long.wav', 'prefix', 0.8728770796805216),
        ('texas_short.wav', 'texas_long.wav', 'resample_max', 0.8713947848528786),
        ('texas_long.wav', 'texas_short.wav', 'resample_max', 0.8713947848528786),
        ('texas_short.wav', 'texas_long.wav', 'resample_min', 0.6971805796646183),
        ('texas_long.wav', 'texas_short.wav', 'resample_min', 0.6971805796646183),

        ('texas_long.wav', 'texas_long.wav', 'none', 1.0),
        ('texas_long.wav', 'texas_long.flac', 'none', 1.0),
        ('texas_long.wav', 'texas_long.mp3', 'none', 0.9987497286333322),
    ])
    def test_sound_mfcc_similarity_time_align(self, file1, file2, time_align, v):
        # Sweep the ``time_align`` argument, including an invalid value.
        self._check(file1, file2, v, time_align=time_align)

    @pytest.mark.parametrize(('file1', 'file2', 'sr_align', 'v'), [
        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'none', 1.0),
        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'min', 1.0),
        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'max', 1.0),
        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'bullshit', ValueError),

        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'none', SoundResampleRateNotMatch),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'min', 0.9999701859565737),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'max', 0.9999913655328565),

        ('stereo_sine_wave.wav', 'stereo_sine_wave_3x_40_900.wav', 'none', SoundResampleRateNotMatch),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_3x_40_900.wav', 'min', 0.9623946880540923),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_3x_40_900.wav', 'max', 0.9517846942566877),
    ])
    def test_sound_mfcc_similarity_sr_align(self, file1, file2, sr_align, v):
        # Sweep ``resample_rate_align`` while always padding in time.
        self._check(file1, file2, v, time_align='pad', resample_rate_align=sr_align)

    @pytest.mark.parametrize(('file1', 'file2', 'mode', 'v'), [
        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'flat', 1.0),
        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'mean', 1.0),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'flat', 0.9999701859565737),
        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'mean', 0.9999993147811599),

        ('texas_long.wav', 'texas_long.wav', 'flat', 1.0),
        ('texas_long.wav', 'texas_long.wav', 'mean', 1.0),
        ('texas_long.wav', 'texas_long_sr8000.wav', 'flat', 0.999767841519668),
        ('texas_long.wav', 'texas_long_sr16000.wav', 'flat', 0.9998774575084983),
        ('texas_long.wav', 'texas_long.wav', 'bullshit', ValueError),
    ])
    def test_sound_mfcc_similarity_pad_min(self, file1, file2, mode, v):
        # Sweep ``mode`` with time padding and the lower sample rate as target.
        self._check(file1, file2, v, mode=mode, time_align='pad', resample_rate_align='min')