dev(narugo): save the newest features

deepghs · Aug 27, 2024 · 88aeda7
1 parent 8dbedef
commit 88aeda7
Show file tree

Hide file tree

Showing 6 changed files with 180 additions and 11 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -7,4 +7,5 @@ pandas
 scipy
 hfutils>=0.4.1
 soundfile>=0.12
-fastdtw
+fastdtw
+librosa>=0.10
diff --git a/soundutils/data/sound.py b/soundutils/data/sound.py
@@ -48,7 +48,7 @@ def resample(self, sample_rate) -> 'Sound':
         if sample_rate == self._sample_rate:
             return self
 
-        resampled_length = int(self.samples * (sample_rate / self._sample_rate))
+        resampled_length = int(round(self.samples * (sample_rate / self._sample_rate)))
         resampled_data = signal.resample(self._data, resampled_length)
 
         return Sound(data=resampled_data, sample_rate=sample_rate)
@@ -141,7 +141,7 @@ def _fmt_time(x, pos):
 
         ax.set_xlabel('Time [hh:mm:ss.mmm]')
         ax.set_ylabel('Amplitude')
-        ax.set_title(f'{title or "Audio Signal"}\n'
-                     f'Channels: {self.channels}, Sample Rate: {self._sample_rate}\n'
-                     f'Time: {self.time:.3f}s ({plural_word(self.samples, "frame")})\n')
+        ax.set_title(f'{title or "Audio Signal"}, '
+                     f'Channels: {self.channels}, Sample Rate: {self._sample_rate}, '
+                     f'Time: {self.time:.3f}s ({plural_word(self.samples, "frame")})')
         ax.legend()
diff --git a/soundutils/similarity/__init__.py b/soundutils/similarity/__init__.py
@@ -1,5 +1,6 @@
 from .base import SoundAlignError, SoundLengthNotMatch, SoundResampleRateNotMatch, SoundChannelsNotMatch
 from .correlation import sound_pearson_similarity
 from .dtw import sound_fastdtw
+from .mfcc import sound_mfcc_similarity
 from .mse import sound_mse, sound_rmse
 from .spectral import sound_spectral_centroid_distance
diff --git a/soundutils/similarity/mfcc.py b/soundutils/similarity/mfcc.py
@@ -0,0 +1,40 @@
+from typing import Literal
+
+import librosa
+import numpy as np
+from scipy.spatial.distance import cosine
+
+from .base import _align_sounds
+from ..data import SoundTyping
+
+
+def sound_mfcc_similarity(  # Mean per-channel cosine similarity between the MFCC features of two sounds.
+        sound1: SoundTyping, sound2: SoundTyping,
+        n_mfcc: int = 13, mode: Literal['flat', 'mean'] = 'flat',  # n_mfcc is forwarded to librosa.feature.mfcc; mode selects the feature vector built below
+        resample_rate_align: Literal['max', 'min', 'none'] = 'none',
+        time_align: Literal['none', 'pad', 'prefix', 'resample_max', 'resample_min'] = 'none',
+        channels_align: Literal['none'] = 'none',  # the three *_align options are passed to _align_sounds unchanged
+) -> float:
+    (data1, sr1), (data2, sr2) = _align_sounds(  # helper from .base; unpacked as one (data, sample_rate) pair per sound
+        sound1=sound1,
+        sound2=sound2,
+        resample_rate_align=resample_rate_align,
+        time_align=time_align,
+        channels_align=channels_align,
+    )
+
+    similarities = []  # one similarity value per index along axis 0
+    for ch in range(data1.shape[0]):  # axis 0 is presumably the channel axis - confirm against _align_sounds
+        mfcc1 = librosa.feature.mfcc(y=data1[ch], sr=sr1, n_mfcc=n_mfcc)
+        mfcc2 = librosa.feature.mfcc(y=data2[ch], sr=sr2, n_mfcc=n_mfcc)
+        if mode == 'flat':  # compare the whole MFCC matrix, flattened to one vector
+            mfcc1_feat = mfcc1.flatten()
+            mfcc2_feat = mfcc2.flatten()
+        elif mode == 'mean':  # compare the MFCCs averaged over the last axis
+            mfcc1_feat = np.mean(mfcc1, axis=-1)
+            mfcc2_feat = np.mean(mfcc2, axis=-1)
+        else:  # note: mode is only validated here, after both MFCCs have already been computed
+            raise ValueError(f'Invalid mode for MFCC - {mode!r}.')
+        similarities.append(1 - cosine(mfcc1_feat, mfcc2_feat))  # scipy's cosine() is a distance, so 1 - distance is the similarity
+
+    return np.mean(similarities).item()  # average of the collected values as a plain Python float
diff --git a/test/data/test_sound.py b/test/data/test_sound.py
@@ -1,13 +1,13 @@
-import os
 import re
 
 import numpy as np
 import pytest
+import torch
 import torchaudio
 from hbutils.testing import isolated_directory, tmatrix
 
 from soundutils.data import Sound
-from soundutils.similarity import sound_spectral_centroid_distance
+from soundutils.similarity import sound_spectral_centroid_distance, sound_mfcc_similarity
 from ..testings import get_testfile
 
 
@@ -129,9 +129,15 @@ def test_repr(self, file, regex):
     def test_resample(self, file, sample_rate):
         sound_file = get_testfile('assets', file)
         sound = Sound.open(sound_file)
+        resampler = torchaudio.transforms.Resample(sound.sample_rate, sample_rate)
+        data, _ = sound.to_numpy()
+        r_data = resampler(torch.from_numpy(data).type(torch.float32)).numpy()
+        expected_new_sound = Sound.from_numpy(r_data, sample_rate)
 
-        body, ext = os.path.splitext(file)
-        expected_sound_file = get_testfile('assets', f'{body}_sr{sample_rate}{ext}')
         new_sound = sound.resample(sample_rate)
-        new_sound.save(expected_sound_file)
-        assert sound_spectral_centroid_distance(new_sound, expected_sound_file) < 2
+        assert new_sound.sample_rate == sample_rate
+        assert sound_mfcc_similarity(
+            new_sound, expected_new_sound,
+            time_align='pad',
+            resample_rate_align='min',
+        ) >= 0.98
diff --git a/test/similarity/test_mfcc.py b/test/similarity/test_mfcc.py
@@ -0,0 +1,121 @@
+import pytest
+
+from soundutils.similarity import SoundLengthNotMatch, SoundChannelsNotMatch, SoundResampleRateNotMatch, \
+    sound_mfcc_similarity
+from test.testings import get_testfile
+
+
+@pytest.mark.unittest
+class TestSimilarityMFCC:  # Tests for sound_mfcc_similarity: each case expects either a pinned similarity value or an exception class.
+    @pytest.mark.parametrize(['file1', 'file2', 'v'], [
+        ('texas_short.wav', 'texas_short.wav', 1.0),
+        ('texas_short.wav', 'texas_long.wav', SoundLengthNotMatch),
+        ('texas_long.wav', 'texas_long.wav', 1.0),
+        ('texas_long.wav', 'texas_long.flac', 1.0),
+        ('texas_long.wav', 'texas_long.mp3', 0.9987497286333322),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', SoundResampleRateNotMatch),
+        ('texas_short.wav', 'stereo_sine_wave_44100.wav', SoundChannelsNotMatch),
+    ])
+    def test_sound_mfcc_similarity(self, file1, file2, v):  # default arguments, i.e. every alignment option left at 'none'
+        file1 = get_testfile('assets', file1)
+        file2 = get_testfile('assets', file2)
+        if isinstance(v, type) and issubclass(v, BaseException):  # v is an exception class: the call must raise it
+            with pytest.raises(v):
+                _ = sound_mfcc_similarity(file1, file2)
+        else:  # v is the expected similarity, compared through pytest.approx
+            assert sound_mfcc_similarity(file1, file2) == pytest.approx(v)
+
+    @pytest.mark.parametrize(['file1', 'file2', 'v'], [
+        ('texas_short.wav', 'texas_short.wav', 1.0),
+        ('texas_short.wav', 'texas_long.wav', 0.8899687300652068),
+        ('texas_long.wav', 'texas_short.wav', 0.8899687300652068),
+        ('texas_long.wav', 'texas_long.wav', 1.0),
+        ('texas_long.wav', 'texas_long.flac', 1.0),
+        ('texas_long.wav', 'texas_long.mp3', 0.9987497286333322),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', SoundResampleRateNotMatch),
+        ('texas_short.wav', 'stereo_sine_wave_44100.wav', SoundChannelsNotMatch),
+    ])
+    def test_sound_mfcc_similarity_pad(self, file1, file2, v):  # same files as above, but with time_align='pad'
+        file1 = get_testfile('assets', file1)
+        file2 = get_testfile('assets', file2)
+        if isinstance(v, type) and issubclass(v, BaseException):
+            with pytest.raises(v):
+                _ = sound_mfcc_similarity(file1, file2, time_align='pad')
+        else:
+            assert sound_mfcc_similarity(file1, file2, time_align='pad') == pytest.approx(v)
+
+    @pytest.mark.parametrize(['file1', 'file2', 'time_align', 'v'], [
+        ('texas_short.wav', 'texas_short.wav', 'none', 1.0),
+        ('texas_short.wav', 'texas_short.wav', 'pad', 1.0),
+        ('texas_short.wav', 'texas_short.wav', 'prefix', 1.0),
+        ('texas_short.wav', 'texas_short.wav', 'resample_max', 1.0),
+        ('texas_short.wav', 'texas_short.wav', 'resample_min', 1.0),
+        ('texas_short.wav', 'texas_short.wav', 'bullshit', ValueError),
+
+        ('texas_short.wav', 'texas_long.wav', 'none', SoundLengthNotMatch),
+        ('texas_short.wav', 'texas_long.wav', 'pad', 0.8899687300652068),
+        ('texas_long.wav', 'texas_short.wav', 'pad', 0.8899687300652068),
+        ('texas_short.wav', 'texas_long.wav', 'prefix', 0.8728770796805216),
+        ('texas_short.wav', 'texas_long.wav', 'resample_max', 0.8713947848528786),
+        ('texas_long.wav', 'texas_short.wav', 'resample_max', 0.8713947848528786),
+        ('texas_short.wav', 'texas_long.wav', 'resample_min', 0.6971805796646183),
+        ('texas_long.wav', 'texas_short.wav', 'resample_min', 0.6971805796646183),
+
+        ('texas_long.wav', 'texas_long.wav', 'none', 1.0),
+        ('texas_long.wav', 'texas_long.flac', 'none', 1.0),
+        ('texas_long.wav', 'texas_long.mp3', 'none', 0.9987497286333322),
+    ])
+    def test_sound_mfcc_similarity_time_align(self, file1, file2, time_align, v):  # sweeps the time_align values, including an invalid one
+        file1 = get_testfile('assets', file1)
+        file2 = get_testfile('assets', file2)
+        if isinstance(v, type) and issubclass(v, BaseException):
+            with pytest.raises(v):
+                _ = sound_mfcc_similarity(file1, file2, time_align=time_align)
+        else:
+            assert sound_mfcc_similarity(file1, file2, time_align=time_align) == pytest.approx(v)
+
+    @pytest.mark.parametrize(['file1', 'file2', 'sr_align', 'v'], [
+        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'none', 1.0),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'min', 1.0),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'max', 1.0),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'bullshit', ValueError),
+
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'none', SoundResampleRateNotMatch),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'min', 0.9999701859565737),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'max', 0.9999913655328565),
+
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_3x_40_900.wav', 'none', SoundResampleRateNotMatch),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_3x_40_900.wav', 'min', 0.9623946880540923),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_3x_40_900.wav', 'max', 0.9517846942566877),
+    ])
+    def test_sound_mfcc_similarity_sr_align(self, file1, file2, sr_align, v):  # sweeps resample_rate_align while time_align stays 'pad'
+        file1 = get_testfile('assets', file1)
+        file2 = get_testfile('assets', file2)
+        if isinstance(v, type) and issubclass(v, BaseException):
+            with pytest.raises(v):
+                _ = sound_mfcc_similarity(file1, file2, time_align='pad', resample_rate_align=sr_align)
+        else:
+            assert sound_mfcc_similarity(file1, file2, time_align='pad', resample_rate_align=sr_align) \
+                   == pytest.approx(v)
+
+    @pytest.mark.parametrize(['file1', 'file2', 'mode', 'v'], [
+        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'flat', 1.0),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave.wav', 'mean', 1.0),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'flat', 0.9999701859565737),
+        ('stereo_sine_wave.wav', 'stereo_sine_wave_44100.wav', 'mean', 0.9999993147811599),
+
+        ('texas_long.wav', 'texas_long.wav', 'flat', 1.0),
+        ('texas_long.wav', 'texas_long.wav', 'mean', 1.0),
+        ('texas_long.wav', 'texas_long_sr8000.wav', 'flat', 0.999767841519668),
+        ('texas_long.wav', 'texas_long_sr16000.wav', 'flat', 0.9998774575084983),
+        ('texas_long.wav', 'texas_long.wav', 'bullshit', ValueError),
+    ])
+    def test_sound_mfcc_similarity_pad_min(self, file1, file2, mode, v):  # sweeps mode with time_align='pad' and resample_rate_align='min'
+        file1 = get_testfile('assets', file1)
+        file2 = get_testfile('assets', file2)
+        if isinstance(v, type) and issubclass(v, BaseException):
+            with pytest.raises(v):
+                _ = sound_mfcc_similarity(file1, file2, mode=mode, time_align='pad', resample_rate_align='min')
+        else:
+            assert sound_mfcc_similarity(file1, file2, mode=mode, time_align='pad', resample_rate_align='min') \
+                   == pytest.approx(v)