From a5745c2eb360f4d3ff3cb95965a5bd001b894b95 Mon Sep 17 00:00:00 2001 From: Yosshi999 Date: Wed, 27 Nov 2024 07:02:21 +0900 Subject: [PATCH] Streaming decoder for compatible engine (#875) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit この本文は @qryxip が記述している。 ストリーミング処理を`compatible_engine`に実装する。testcaseとして一括変 換・ストリーム変換の二つの生成結果を追加し、元の生成音声と比較して十分近 いことを確かめた。 `render_audio_segment`には将来の互換性のため、未使用引数 `int64_t margin_width`を入れる。 https://github.com/VOICEVOX/voicevox_core/pull/875#discussion_r1854519379 またRust APIの出力サイズをチェックしてパニックする仕組みも入れる。これに ついては他の関数にも後で導入することとする。 https://github.com/VOICEVOX/voicevox_core/pull/875#discussion_r1856958417 Refs: #866 --- crates/test_util/build.rs | 29 +++++ crates/test_util/compatible_engine.h | 7 ++ crates/test_util/src/typing.rs | 11 ++ crates/voicevox_core_c_api/Cargo.toml | 1 + .../src/compatible_engine.rs | 105 ++++++++++++++++++ .../tests/e2e/testcases/compatible_engine.rs | 76 ++++++++++++- 6 files changed, 228 insertions(+), 1 deletion(-) diff --git a/crates/test_util/build.rs b/crates/test_util/build.rs index 3cdc88d78..700321bd3 100644 --- a/crates/test_util/build.rs +++ b/crates/test_util/build.rs @@ -202,6 +202,35 @@ fn generate_example_data_json(dist: &Path) -> anyhow::Result<()> { phoneme.to_vec() }, }, + intermediate: typing::IntermediateExampleData { + f0_length: 69, + phoneme_size: 45, + feature_dim: 80, + margin_width: 14, + f0_vector: { + let mut f0 = [0.; 69]; + f0[9..24].fill(5.905218); + f0[37..60].fill(5.565851); + f0.to_vec() + }, + phoneme_vector: { + let mut phoneme = [0.; 45 * 69]; + let mut set_one = |index, range| { + for i in range { + phoneme[(i * 45 + index) as usize] = 1.; + } + }; + set_one(0, 0..9); + set_one(37, 9..13); + set_one(14, 13..24); + set_one(35, 24..30); + set_one(6, 30..37); + set_one(37, 37..45); + set_one(30, 45..60); + set_one(0, 60..69); + phoneme.to_vec() + }, + }, }; fs_err::write( diff --git a/crates/test_util/compatible_engine.h b/crates/test_util/compatible_engine.h index 254fd8161..2cd58b971 100644 --- a/crates/test_util/compatible_engine.h +++ b/crates/test_util/compatible_engine.h @@ -25,4 +25,11 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list, bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phoneme, int64_t *speaker_id, float *output); +bool generate_full_intermediate(int64_t length, int64_t phoneme_size, + float *f0, float *phoneme, int64_t *speaker_id, + float *output); + +bool render_audio_segment(int64_t length, int64_t margin_width, int64_t feature_size, + float *audio_feature, int64_t *speaker_id, float *output); + const char *last_error_message(); diff --git a/crates/test_util/src/typing.rs b/crates/test_util/src/typing.rs index 1d10c9cb9..ed0b6b40c 100644 --- a/crates/test_util/src/typing.rs +++ b/crates/test_util/src/typing.rs @@ -31,6 +31,16 @@ pub struct DecodeExampleData { pub phoneme_vector: Vec, } +#[derive(Debug, Serialize, Deserialize)] +pub struct IntermediateExampleData { + pub f0_length: i64, + pub phoneme_size: i64, + pub feature_dim: i64, + pub margin_width: i64, + pub f0_vector: Vec, + pub phoneme_vector: Vec, +} + #[derive(Debug, Serialize, Deserialize)] pub struct ExampleData { pub speaker_id: i64, @@ -38,4 +48,5 @@ pub struct ExampleData { pub duration: DurationExampleData, pub intonation: IntonationExampleData, pub decode: DecodeExampleData, + pub intermediate: IntermediateExampleData, } diff --git a/crates/voicevox_core_c_api/Cargo.toml b/crates/voicevox_core_c_api/Cargo.toml index c9600c7a6..4c4001b4d 100644 --- a/crates/voicevox_core_c_api/Cargo.toml +++ b/crates/voicevox_core_c_api/Cargo.toml @@ -30,6 +30,7 @@ easy-ext.workspace = true educe.workspace = true itertools.workspace = true libc.workspace = true +ndarray.workspace = true parking_lot = { workspace = true, features = ["arc_lock"] } process_path.workspace = true ref-cast.workspace = true diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs index 75c69816d..5bcbed165 100644 --- a/crates/voicevox_core_c_api/src/compatible_engine.rs +++ b/crates/voicevox_core_c_api/src/compatible_engine.rs @@ -359,6 +359,111 @@ pub unsafe extern "C" fn decode_forward( } } +/// # Safety +/// +/// - `f0`はRustの`&[f32; length as usize]`として解釈できなければならない。 +/// - `phoneme`はRustの`&[f32; phoneme_size * length as usize]`として解釈できなければならない。 +/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。 +/// - `output`はRustの`&mut [MaybeUninit; ((length + 2 * 14) * 80) as usize]`として解釈できなければならない。 +#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない +pub unsafe extern "C" fn generate_full_intermediate( + length: i64, + phoneme_size: i64, + f0: *mut f32, + phoneme: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + init_logger_once(); + assert_aligned(f0); + assert_aligned(phoneme); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let phoneme_size = phoneme_size as usize; + const MARGIN_WIDTH: usize = 14; + const FEATURE_SIZE: usize = 80; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).generate_full_intermediate( + length, + phoneme_size, + // SAFETY: The safety contract must be upheld by the caller. + unsafe { std::slice::from_raw_parts(f0, length) }, + unsafe { std::slice::from_raw_parts(phoneme, phoneme_size * length) }, + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = (length + 2 * MARGIN_WIDTH) * FEATURE_SIZE; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + +/// # Safety +/// +/// - `audio_feature`はRustの`&[f32; (length * feature_size) as usize]`として解釈できなければならない。 +/// - `speaker_id`はRustの`&[i64; 1]`として解釈できなければならない。 +/// - `output`はRustの`&mut [MaybeUninit; length as usize * 256]`として解釈できなければならない。 +#[unsafe(no_mangle)] // SAFETY: voicevox_core_c_apiを構成するライブラリの中に、これと同名のシンボルは存在しない +pub unsafe extern "C" fn render_audio_segment( + length: i64, + _margin_width: i64, + feature_size: i64, + audio_feature: *mut f32, + speaker_id: *mut i64, + output: *mut f32, +) -> bool { + init_logger_once(); + assert_aligned(audio_feature); + assert_aligned(speaker_id); + assert_aligned(output); + let length = length as usize; + let feature_size = feature_size as usize; + let synthesizer = &*lock_synthesizer(); + let result = ensure_initialized!(synthesizer).render_audio_segment( + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + ndarray::ArrayView2::from_shape_ptr([length, feature_size], audio_feature).to_owned() + }, + StyleId::new(unsafe { *speaker_id as u32 }), + ); + match result { + Ok(output_arr) => { + let output_len = length * 256; + if output_arr.len() != output_len { + panic!("expected {}, got {}", output_len, output_arr.len()); + } + let output_arr = output_arr.as_standard_layout(); + // SAFETY: The safety contract must be upheld by the caller. + unsafe { + output_arr + .as_ptr() + .copy_to_nonoverlapping(output, output_len); + } + true + } + Err(err) => { + set_message(&format!("{err}")); + false + } + } +} + #[track_caller] fn assert_aligned(ptr: *mut impl Sized) { assert!( diff --git a/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs b/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs index e69ad68fd..badf165fd 100644 --- a/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs +++ b/crates/voicevox_core_c_api/tests/e2e/testcases/compatible_engine.rs @@ -1,7 +1,7 @@ // エンジンを起動してyukarin_s・yukarin_sa・decodeの推論を行う -use std::ffi::CStr; use std::sync::LazyLock; +use std::{cmp::min, ffi::CStr}; use assert_cmd::assert::AssertResult; use libloading::Library; @@ -83,12 +83,86 @@ impl assert_cdylib::TestCase for TestCase { wave }; + // 中間生成物を経由した場合の生成音声 + let wave2 = { + let length_with_margin = + EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width; + let mut audio_feature = + vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize]; + let mut wave = vec![0.; 256 * length_with_margin as usize]; + assert!(lib.generate_full_intermediate( + EXAMPLE_DATA.intermediate.f0_length, + EXAMPLE_DATA.intermediate.phoneme_size, + EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32, + EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + audio_feature.as_mut_ptr(), + )); + assert!(lib.render_audio_segment( + length_with_margin, + EXAMPLE_DATA.intermediate.margin_width, + EXAMPLE_DATA.intermediate.feature_dim, + audio_feature.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + wave.as_mut_ptr(), + )); + wave[256 * EXAMPLE_DATA.intermediate.margin_width as usize + ..wave.len() - 256 * EXAMPLE_DATA.intermediate.margin_width as usize] + .to_vec() + }; + + // 中間生成物を経由し、さらにチャンクごとに変換した場合の生成音声 + let wave3 = { + let length_with_margin = + EXAMPLE_DATA.intermediate.f0_length + 2 * EXAMPLE_DATA.intermediate.margin_width; + let mut audio_feature = + vec![0.; (length_with_margin * EXAMPLE_DATA.intermediate.feature_dim) as usize]; + let mut wave = vec![0.; 256 * EXAMPLE_DATA.intermediate.f0_length as usize]; + assert!(lib.generate_full_intermediate( + EXAMPLE_DATA.intermediate.f0_length, + EXAMPLE_DATA.intermediate.phoneme_size, + EXAMPLE_DATA.intermediate.f0_vector.as_ptr() as *mut f32, + EXAMPLE_DATA.intermediate.phoneme_vector.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + audio_feature.as_mut_ptr(), + )); + let full_length = EXAMPLE_DATA.intermediate.f0_length as usize; + let pitch = EXAMPLE_DATA.intermediate.feature_dim as usize; + for render_start in (0..full_length).step_by(10) { + // render_start .. render_end の音声を取得する + let render_end = min(render_start + 10, full_length); + let slice_start = render_start; + let slice_end = render_end + 2 * EXAMPLE_DATA.intermediate.margin_width as usize; + let feature_segment = &audio_feature[slice_start * pitch..slice_end * pitch]; + let slice_length = slice_end - slice_start; + let mut wave_segment_with_margin = vec![0.; 256 * slice_length]; + assert!(lib.render_audio_segment( + slice_length as i64, + EXAMPLE_DATA.intermediate.margin_width, + pitch as i64, + feature_segment.as_ptr() as *mut f32, + &mut { EXAMPLE_DATA.speaker_id } as *mut i64, + wave_segment_with_margin.as_mut_ptr(), + )); + let wave_segment = &wave_segment_with_margin[256 + * EXAMPLE_DATA.intermediate.margin_width as usize + ..wave_segment_with_margin.len() + - 256 * EXAMPLE_DATA.intermediate.margin_width as usize]; + wave[render_start * 256..render_end * 256].clone_from_slice(wave_segment); + } + wave + }; + std::assert_eq!(SNAPSHOTS.metas, metas_json); float_assert::close_l1(&phoneme_length, &EXAMPLE_DATA.duration.result, 0.01); float_assert::close_l1(&intonation_list, &EXAMPLE_DATA.intonation.result, 0.01); assert!(wave.iter().copied().all(f32::is_normal)); + assert!(wave2.iter().copied().all(f32::is_normal)); + assert!(wave3.iter().copied().all(f32::is_normal)); + float_assert::close_l1(&wave2, &wave, 0.001); + float_assert::close_l1(&wave3, &wave, 0.001); lib.finalize(); Ok(())