Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RASR compatible feature extraction #44

Merged
Merged
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0f5a148
add rasr compatible feature extraction
kuacakuaca Dec 15, 2023
e8c5dde
remove f_min and f_max from config
kuacakuaca Dec 15, 2023
5fa4bf3
fix
kuacakuaca Jan 3, 2024
33dd047
add preemphasis, use amplitude instead of power spectrum, additive lo…
kuacakuaca Jan 29, 2024
3071b44
small change
kuacakuaca Jan 29, 2024
687854a
make alpha a parameter
kuacakuaca Jan 30, 2024
e7c850d
fix errors
kuacakuaca Feb 1, 2024
482f560
fix window broadcasting
albertz Feb 13, 2024
9e0cde3
test_rasr_compatible
albertz Feb 13, 2024
f45f01e
test_rasr_compatible more
albertz Feb 13, 2024
71f9e6a
test_rasr_compatible_raw_audio_samples (passing)
albertz Feb 13, 2024
259d7f3
test_rasr_compatible_preemphasis (failing)
albertz Feb 13, 2024
f41a60c
fix preemphasize
albertz Feb 13, 2024
acefd99
test_rasr_compatible_window (failing)
albertz Feb 13, 2024
ba47a78
testing custom hanning window implementations
albertz Feb 13, 2024
44aba81
cleanup, fix windowing (WIP)
albertz Feb 13, 2024
d845650
fix last Hanning window
albertz Feb 14, 2024
e2cda8b
fix device
albertz Feb 14, 2024
6925acc
simplify
albertz Feb 14, 2024
848c886
test_rasr_compatible_fft (failing)
albertz Feb 14, 2024
990a977
FFT test more direct (still failing)
albertz Feb 14, 2024
6556b3c
tests deterministic
albertz Feb 14, 2024
79a043e
copy RASR C++ FFT code for testing
albertz Feb 14, 2024
3476557
FFT fixes
albertz Feb 14, 2024
467f828
FFT becomes more exact
albertz Feb 14, 2024
69cd90d
FFT cleanup
albertz Feb 14, 2024
aca1eb3
test_rasr_compatible_amplitude_spectrum (failing)
albertz Feb 14, 2024
3ead58f
add fft scaling, updata test and remove spaces
kuacakuaca Mar 12, 2024
1af10fc
black
kuacakuaca Mar 12, 2024
fff39f0
adjust last window for different sequence lengths
kuacakuaca Mar 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
test_rasr_compatible_fft (failing)
albertz committed Feb 14, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 848c88642cf024fc3306d083d17c934cbbee5316
90 changes: 90 additions & 0 deletions tests/test_feature_extraction.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import sys
import copy
import math
import numpy as np
import torch
import unittest
@@ -365,6 +366,95 @@ def _get_length_naive(audio_len: int) -> int:
torch.testing.assert_close(smoothed, rasr_feat, rtol=1e-30, atol=1e-30)


def test_rasr_compatible_fft():
    """Compare a torch amplitude spectrum against the RASR feature-extraction binary.

    A random wav file is run through a RASR flow network
    (demultiplex -> s16-to-f32 convert -> preemphasis -> Hanning window -> real FFT
    -> scaling -> amplitude), and the same steps are re-implemented here with
    torch. The two results are then compared with ``torch.testing.assert_close``.

    Raises:
        unittest.SkipTest: if ``i6_core`` or ``soundfile`` cannot be imported,
            or if the RASR feature-extraction binary does not exist at the
            hard-coded path.
    """
    try:
        from i6_core.lib.rasr_cache import FileArchive
    except ImportError:
        raise unittest.SkipTest("i6_core not available")
    try:
        import soundfile
    except ImportError:
        raise unittest.SkipTest("soundfile not available")
    rasr_feature_extractor_bin_path = (
        "/work/tools22/asr/rasr/rasr_onnx_haswell_0623/arch/linux-x86_64-standard/"
        "feature-extraction.linux-x86_64-standard"
    )
    if not os.path.exists(rasr_feature_extractor_bin_path):
        raise unittest.SkipTest("RASR feature-extraction binary not found")

    # NOTE(review): tempfile.mktemp is deprecated (name can be claimed by another process
    # before the file is created). Kept as-is because the wav writer below is expected to
    # create the file itself; consider mkstemp/TemporaryDirectory once that is confirmed.
    wav_file_path = tempfile.mktemp(suffix=".wav", prefix="tmp-i6models-random-audio")
    atexit.register(os.remove, wav_file_path)
    generate_random_speech_like_audio_wav(wav_file_path)
    rasr_feature_cache_path = generate_rasr_feature_cache_from_wav_and_flow(
        rasr_feature_extractor_bin_path,
        wav_file_path,
        # Plain (non-f) string: "$(track)" is a literal placeholder passed through to RASR.
        textwrap.dedent(
            """\
            <node filter="generic-vector-s16-demultiplex" name="demultiplex" track="$(track)"/>
            <link from="samples" to="demultiplex"/>
            <node filter="generic-convert-vector-s16-to-vector-f32" name="convert"/>
            <link from="demultiplex" to="convert"/>
            <node alpha="1.0" filter="signal-preemphasis" name="preemphasis"/>
            <link from="convert" to="preemphasis"/>
            <node filter="signal-window" length="0.025" name="window" shift="0.01" type="hanning"/>
            <link from="preemphasis" to="window"/>
            <node filter="signal-real-fast-fourier-transform" maximum-input-size="0.025" name="fft"/>
            <link from="window" to="fft"/>
            <node filter="generic-vector-f32-multiplication" name="scaling" value="16000"/>
            <link from="fft" to="scaling"/>
            <node filter="signal-vector-alternating-complex-f32-amplitude" name="amplitude-spectrum"/>
            <link from="scaling" to="amplitude-spectrum"/>
            """
        ),
        flow_output_name="amplitude-spectrum",
    )

    rasr_cache = FileArchive(rasr_feature_cache_path, must_exists=True)
    time_, rasr_feat = rasr_cache.read("corpus/recording/1", "feat")
    assert len(time_) == len(rasr_feat)
    rasr_feat = torch.tensor(np.stack(rasr_feat, axis=0), dtype=torch.float32)
    print("RASR:", _torch_repr(rasr_feat))

    # Close the wav file handle deterministically instead of leaking it.
    with open(wav_file_path, "rb") as wav_file:
        audio, sample_rate = soundfile.read(wav_file, dtype="int16")
    audio = torch.tensor(audio.astype(np.float32))  # [-2**15, 2**15-1]

    # preemphasize (alpha=1.0, as in the flow above); the right-hand side is materialized
    # before the in-place subtraction, so every sample sees its unmodified predecessor.
    audio[..., 1:] -= 1.0 * audio[..., :-1]
    audio[..., 0] = 0.0

    # windowing (same length/shift as the "signal-window" node in the flow)
    win_size = 0.025
    hop_size = 0.01
    hop_length = int(hop_size * sample_rate)
    win_length = int(win_size * sample_rate)

    # Number of frames when the last (possibly partial) window is kept as well.
    res_len = max(audio.shape[0] - win_length + hop_length - 1, 0) // hop_length + 1
    assert res_len == len(rasr_feat)

    last_win_size = audio.shape[0] - (res_len - 1) * hop_length
    last_pad = win_length - last_win_size
    padded = torch.nn.functional.pad(audio, (0, last_pad))  # zero pad for the last frame

    # Assumes mono audio, i.e. `audio` is 1-D, so unfolding dim 0 yields a 2-D tensor.
    windowed = padded.unfold(0, size=win_length, step=hop_length)  # [T', W=win_length]
    assert len(windowed) == res_len
    window = torch.hann_window(win_length, periodic=False, dtype=torch.float64).to(torch.float32)
    smoothed = windowed[:-1] * window[None, :]  # [T'-1, W]

    # The last window might be shorter. Will use a shorter Hanning window then. Need to fix that.
    last_win = torch.hann_window(last_win_size, periodic=False, dtype=torch.float64).to(torch.float32)
    last_win = torch.nn.functional.pad(last_win, (0, last_pad))
    smoothed = torch.cat([smoothed, (windowed[-1] * last_win)[None, :]], dim=0)  # [T', W]

    # FFT size is the next power of two >= win_length; transform over the last (window) axis only.
    n_fft = 2 ** math.ceil(math.log2(win_length))
    amplitude_spectrum = torch.abs(torch.fft.rfftn(smoothed, s=(n_fft,)))  # [T', F=n_fft//2+1]

    print("i6_models", _torch_repr(amplitude_spectrum))

    # Tolerances this small make the float32 comparison effectively exact.
    torch.testing.assert_close(amplitude_spectrum, rasr_feat, rtol=1e-30, atol=1e-30)


def generate_rasr_feature_cache_from_wav_and_flow(
rasr_feature_extractor_bin_path: str,
wav_file_path: str,