diff --git a/.gitignore b/.gitignore
index 9e8eea619..63a996ab7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,9 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
+
+# Guided Synthesis temp files
+/voicevox_engine/experimental/dictation-kit*
+first_pass*
+second_pass*
+tmp.wav
\ No newline at end of file
diff --git a/README.md b/README.md
index f228b39cd..a397c6f5a 100644
--- a/README.md
+++ b/README.md
@@ -167,6 +167,37 @@ curl -s \
> audio.wav
```
+### Guided Synthesis
+Currently there are two APIs that accept an uploaded audio file and return the corresponding synthesis information.
+For both of them, it is recommended to set `is_kana` to `true` and to use the `kana` field of `AudioQuery` for the best results.
+The kana text uses the AquesTalk-style notation (see the AquesTalk section of this README).
+```bash
+# Returns an audio file synthesised with reference to the uploaded audio.
+# This example needs a recording whose content is
+# "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"
+
+curl -L -X POST 'localhost:50021/guided_synthesis' \
+ -F 'kana="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
+ -F 'speaker_id="5"' \
+ -F 'audio_file=@"/full_path_to_your_recording"' \
+ -F 'normalize="true"' \
+ -F 'stereo="true"' \
+ -F 'sample_rate="24000"' \
+ -F 'volume_scale="1"' \
+ -F 'pitch_scale="0"' \
+ -F 'speed_scale="1"'
+
+# Returns a list of AccentPhrases
+
+curl -L -X POST 'localhost:50021/guided_accent_phrase' \
+ -F 'text="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
+ -F 'speaker="5"' \
+ -F 'audio_file=@"/full_path_to_your_recording"' \
+ -F 'normalize="true"' \
+ -F 'is_kana="true"' \
+ -F 'enable_interrogative="false"'
+```
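+
+For reference, here is a minimal Python sketch of the same `guided_synthesis` request using the third-party `requests` library. The host, speaker, kana string and file path are placeholders — substitute the values from the curl example above.
+
+```python
+import requests
+
+# AquesTalk-style kana, same as in the curl example above (shortened here)
+kana = "マ'タ、ト'オジノヨオニ、…"
+
+with open("/full_path_to_your_recording", "rb") as recording:
+    response = requests.post(
+        "http://localhost:50021/guided_synthesis",
+        data={
+            "kana": kana,
+            "speaker_id": "5",
+            "normalize": "true",
+            "stereo": "true",
+            "sample_rate": "24000",
+            "volume_scale": "1",
+            "pitch_scale": "0",
+            "speed_scale": "1",
+        },
+        files={"audio_file": recording},
+    )
+response.raise_for_status()
+
+with open("audio.wav", "wb") as out:
+    out.write(response.content)
+```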
+
### 話者の追加情報を取得するサンプルコード
追加情報の中の portrait.png を取得するコードです。
diff --git a/run.py b/run.py
index 3dafb4816..d166827a7 100644
--- a/run.py
+++ b/run.py
@@ -17,7 +17,7 @@
import soundfile
import uvicorn
-from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.params import Query
from pydantic import ValidationError
@@ -215,6 +215,76 @@ def accent_phrases(
else:
return engine.create_accent_phrases(text, speaker_id=speaker)
+ @app.post(
+ "/guided_accent_phrase",
+ response_model=List[AccentPhrase],
+ tags=["クエリ編集"],
+ summary="Create Accent Phrase from External Audio",
+ )
+ def guided_accent_phrase(
+ text: str = Form(...), # noqa:B008
+ speaker: int = Form(...), # noqa:B008
+ is_kana: bool = Form(...), # noqa:B008
+ audio_file: UploadFile = File(...), # noqa: B008
+ normalize: bool = Form(...), # noqa:B008
+ core_version: Optional[str] = None,
+ ):
+ """
+        Extracts f0 and aligned phonemes, then calculates the average f0 for every phoneme.
+        Returns a list of AccentPhrase.
+        **This API works at phoneme resolution.**
+ """
+ if not args.enable_guided_synthesis:
+ raise HTTPException(
+ status_code=404,
+ detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。",
+ )
+ engine = get_engine(core_version)
+ if is_kana:
+ try:
+ accent_phrases = parse_kana(text)
+ except ParseKanaError as err:
+ raise HTTPException(
+ status_code=400,
+ detail=ParseKanaBadRequest(err).dict(),
+ )
+ else:
+ accent_phrases = engine.create_accent_phrases(
+ text,
+ speaker_id=speaker,
+ )
+
+ try:
+ return engine.guided_accent_phrases(
+ accent_phrases=accent_phrases,
+ speaker=speaker,
+ audio_file=audio_file.file,
+ normalize=normalize,
+ )
+ except ParseKanaError as err:
+ raise HTTPException(
+ status_code=422,
+ detail=ParseKanaBadRequest(err).dict(),
+ )
+ except StopIteration:
+ print(traceback.format_exc())
+ raise HTTPException(
+ status_code=500,
+ detail="Failed in Forced Alignment",
+ )
+ except Exception as e:
+ print(traceback.format_exc())
+ if str(e) == "Decode Failed":
+ raise HTTPException(
+ status_code=500,
+ detail="Failed in Forced Alignment",
+ )
+ else:
+ raise HTTPException(
+ status_code=500,
+ detail="Internal Server Error",
+ )
+
@app.post(
"/mora_data",
response_model=List[AccentPhrase],
@@ -366,7 +436,7 @@ def multi_synthesis(
format="WAV",
)
wav_file.seek(0)
- zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read())
+ zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())
return FileResponse(f.name, media_type="application/zip")
@@ -420,6 +490,90 @@ def _synthesis_morphing(
return FileResponse(f.name, media_type="audio/wav")
+ @app.post(
+ "/guided_synthesis",
+ responses={
+ 200: {
+ "content": {
+ "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+ },
+ }
+ },
+ tags=["音声合成"],
+ summary="Audio synthesis guided by external audio and phonemes",
+ )
+ def guided_synthesis(
+ kana: str = Form(...), # noqa: B008
+ speaker_id: int = Form(...), # noqa: B008
+ normalize: bool = Form(...), # noqa: B008
+ audio_file: UploadFile = File(...), # noqa: B008
+ stereo: bool = Form(...), # noqa: B008
+ sample_rate: int = Form(...), # noqa: B008
+ volume_scale: float = Form(...), # noqa: B008
+ pitch_scale: float = Form(...), # noqa: B008
+ speed_scale: float = Form(...), # noqa: B008
+ core_version: Optional[str] = None,
+ ):
+ """
+        Extracts the f0 and aligned phonemes and passes them to the engine.
+        Returns the synthesized audio.
+        **This API works at frame resolution.**
+ """
+ if not args.enable_guided_synthesis:
+ raise HTTPException(
+ status_code=404,
+ detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。",
+ )
+ engine = get_engine(core_version)
+ try:
+ accent_phrases = parse_kana(kana)
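+            # intonationScale and the pre/post phoneme lengths below are fixed placeholders;
+            # guided synthesis takes pitch and timing from the uploaded audio instead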
+ query = AudioQuery(
+ accent_phrases=accent_phrases,
+ speedScale=speed_scale,
+ pitchScale=pitch_scale,
+ intonationScale=1,
+ volumeScale=volume_scale,
+ prePhonemeLength=0.1,
+ postPhonemeLength=0.1,
+ outputSamplingRate=sample_rate,
+ outputStereo=stereo,
+ kana=kana,
+ )
+ wave = engine.guided_synthesis(
+ audio_file=audio_file.file,
+ query=query,
+ speaker=speaker_id,
+ normalize=normalize,
+ )
+
+ with NamedTemporaryFile(delete=False) as f:
+ soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV")
+
+ return FileResponse(f.name, media_type="audio/wav")
+ except ParseKanaError as err:
+ raise HTTPException(
+ status_code=400,
+ detail=ParseKanaBadRequest(err).dict(),
+ )
+ except StopIteration:
+ print(traceback.format_exc())
+ raise HTTPException(
+ status_code=500,
+ detail="Failed in Forced Alignment.",
+ )
+ except Exception as e:
+ print(traceback.format_exc())
+ if str(e) == "Decode Failed":
+ raise HTTPException(
+ status_code=500,
+ detail="Failed in Forced Alignment.",
+ )
+ else:
+ raise HTTPException(
+ status_code=500,
+ detail="Internal Server Error.",
+ )
+
@app.post(
"/connect_waves",
response_class=FileResponse,
@@ -665,6 +819,7 @@ def supported_devices(
parser.add_argument("--runtime_dir", type=Path, default=None, action="append")
parser.add_argument("--enable_mock", action="store_true")
parser.add_argument("--enable_cancellable_synthesis", action="store_true")
+ parser.add_argument("--enable_guided_synthesis", action="store_true")
parser.add_argument("--init_processes", type=int, default=2)
# 引数へcpu_num_threadsの指定がなければ、環境変数をロールします。
diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py
index ed710274f..d291aeea2 100644
--- a/voicevox_engine/dev/synthesis_engine/mock.py
+++ b/voicevox_engine/dev/synthesis_engine/mock.py
@@ -1,5 +1,6 @@
from logging import getLogger
from typing import Any, Dict, List, Optional
+from typing import IO
import numpy as np
from pyopenjtalk import tts
@@ -130,3 +131,50 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
wave, sr = tts(text)
wave = resample(wave, 24000 * len(wave) // 48000)
return wave
+
+ def guided_synthesis(
+ self,
+ query: AudioQuery,
+ speaker: int,
+ audio_file: IO,
+ normalize: int,
+ ) -> np.ndarray:
+ """
+        OpenJTalk does not provide guided synthesis, so this simply calls the mock synthesis [Mock]
+
+        Parameters
+        ----------
+        query
+        speaker
+        audio_file
+            not used by the mock
+        normalize
+            not used by the mock
+
+        Returns
+        -------
+        wave : np.ndarray
+            the mock synthesis result
+ """
+ return self.synthesis(query=query, speaker_id=speaker)
+
+ def guided_accent_phrases(
+ self,
+ accent_phrases: List[AccentPhrase],
+ speaker: int,
+ audio_file: IO,
+ normalize: int,
+ ) -> List[AccentPhrase]:
+ """
+ guided_accent_phrases 入力accent_phrasesを変更せずにそのまま返します [Mock]
+
+ Parameters
+ ----------
+        accent_phrases
+        speaker
+        audio_file
+            未使用
+        normalize
+            未使用
+
+        Returns
+        -------
+        accent_phrases : List[AccentPhrase]
+            入力をそのまま返します
+ """
+ return accent_phrases
diff --git a/voicevox_engine/experimental/__init__.py b/voicevox_engine/experimental/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/voicevox_engine/experimental/guided_extractor.py b/voicevox_engine/experimental/guided_extractor.py
new file mode 100644
index 000000000..8e4c280d2
--- /dev/null
+++ b/voicevox_engine/experimental/guided_extractor.py
@@ -0,0 +1,206 @@
+import os
+import re
+import tarfile
+from os.path import exists
+from pathlib import PurePath
+from typing import IO
+from urllib.request import urlretrieve
+
+import numpy as np
+import pkg_resources
+import pyworld as pw
+from scipy.io import wavfile
+from scipy.signal import resample
+
+from voicevox_engine.experimental.julius4seg import converter, sp_inserter
+from voicevox_engine.experimental.julius4seg.sp_inserter import ModelType, space_symbols
+from voicevox_engine.kana_parser import parse_kana
+
+JULIUS_SAMPLE_RATE = 16000
+FRAME_PERIOD = 1.0
+PUNCTUATION = ["_", "'", "/", "、"]
+SIL_SYMBOL = ["silB", "silE", "sp"]
+TMP_PATH = "tmp.wav"
+UUT_ID = "tmp"
+TEMP_FILE_LIST = [
+ "first_pass.dfa",
+ "first_pass.dict",
+ "second_pass.dfa",
+ "second_pass.dict",
+ "tmp.wav",
+]
+
+_JULIUS_DICTATION_URL = "https://github.com/julius-speech/dictation-kit/archive/refs/tags/dictation-kit-v4.3.1.tar.gz" # noqa: B950
+JULIUS_DICTATION_DIR = os.environ.get(
+ "JULIUS_DICTATION_DIR",
+    # the extracted folder name contains "dictation-kit" twice
+ pkg_resources.resource_filename(__name__, "dictation-kit-dictation-kit-v4.3.1"),
+)
+
+sp_inserter.JULIUS_ROOT = PurePath(JULIUS_DICTATION_DIR)
+
+
+class PhraseInfo:
+ def __init__(self, pitch: float, length: float, phoneme: str):
+ self.pitch = pitch
+ self.length = length
+ self.phoneme = phoneme
+
+
+def _lazy_init():
+ if not exists(JULIUS_DICTATION_DIR):
+ print("Julius not found, Downloading")
+ _extract_julius()
+
+
+def _extract_julius():
+ global JULIUS_DICTATION_DIR
+ filename = pkg_resources.resource_filename(__name__, "dictation-kit.tar.gz")
+ print("Downloading Julius...", _JULIUS_DICTATION_URL)
+ urlretrieve(_JULIUS_DICTATION_URL, filename)
+ print("Extracting Julius...", JULIUS_DICTATION_DIR)
+ with tarfile.open(filename, mode="r|gz") as f:
+ f.extractall(path=pkg_resources.resource_filename(__name__, ""))
+ JULIUS_DICTATION_DIR = pkg_resources.resource_filename(
+ __name__, "dictation-kit-dictation-kit-v4.3.1"
+ )
+ sp_inserter.JULIUS_ROOT = PurePath(JULIUS_DICTATION_DIR)
+ os.remove(filename)
+
+
+def resample_ts(timestamp: str):
+ """
+    Convert a julius4seg timestamp (in units of 10 ms) into a frame index at 24 kHz with a hop size of 256.
+    0.9375 = 24000 / 256 / 1000 * 10, where the factor 10 is because julius4seg produces timestamps in 10 ms units.
+ """
+ return int((float(timestamp) * 0.9375))
+
+
+def get_normalize_diff(engine, kana: str, f0: np.ndarray, speaker_id: int):
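+    """Return the difference between the engine-predicted average log-F0 for the kana and the recording's average log-F0."""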
+ f0_avg = _no_nan(np.average(f0[f0 != 0]))
+ predicted_phrases = parse_kana(kana)
+ engine.replace_mora_data(predicted_phrases, speaker_id=speaker_id)
+ pitch_list = []
+ for phrase in predicted_phrases:
+ for mora in phrase.moras:
+ pitch_list.append(mora.pitch)
+ pitch_list = np.array(pitch_list, dtype=np.float64)
+ predicted_avg = _no_nan(np.average(pitch_list[pitch_list != 0]))
+ return predicted_avg - f0_avg
+
+
+def _no_nan(num):
+ return 0.0 if np.isnan(num) else num
+
+
+def extract_guided_feature(audio_file: IO, kana: str):
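+    """Return the log-F0 contour and the Julius forced-alignment segments for the recording and kana."""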
+ _lazy_init()
+ sr, wave = wavfile.read(audio_file)
+ # stereo to mono
+ if len(wave.shape) == 2:
+ wave = wave.sum(axis=1) / 2
+
+ f0 = extract_f0(wave, sr, 256 / 24000 * 1000)
+
+ julius_wave = resample(wave, JULIUS_SAMPLE_RATE * len(wave) // sr)
+
+    # normalize samples from different WAV sample formats into the int16 range expected by Julius
+ if julius_wave.dtype == "float32":
+ julius_wave *= 32767
+ if julius_wave.dtype == "int32":
+ julius_wave = np.floor_divide(julius_wave, 2147483392 / 32767)
+ if julius_wave.dtype == "uint8":
+ # floor of 32767 / 255
+ julius_wave *= 128
+
+ julius_wave = julius_wave.astype(np.int16)
+
+ julius_kana = re.sub(
+ "|".join(PUNCTUATION), "", kana.replace("/", "").replace("、", " ")
+ )
+
+ phones = forced_align(julius_wave, julius_kana)
+ return f0, phones
+
+
+def forced_align(julius_wave: np.ndarray, base_kata_text: str):
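+    """Two-pass Julius alignment: the first pass inserts short pauses (sp),
+    the second pass aligns phonemes; returns (start, end, phoneme) tuples in 10 ms frames."""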
+ model_type = ModelType.gmm
+ hmm_model = os.path.join(
+ JULIUS_DICTATION_DIR, "model/phone_m/jnas-mono-16mix-gid.binhmm"
+ )
+ options = []
+
+ base_kata_text = sp_inserter.kata2hira(base_kata_text)
+
+ julius_phones = [converter.conv2openjtalk(hira) for hira in base_kata_text.split()]
+
+ base_kan_text = ["sym_{}".format(i) for i in range(len(julius_phones))]
+
+ assert len(base_kan_text) == len(julius_phones), f"{base_kan_text}\n{julius_phones}"
+
+ dict_1st = sp_inserter.gen_julius_dict_1st(base_kan_text, julius_phones, model_type)
+ dfa_1st = sp_inserter.gen_julius_dfa(dict_1st.count("\n"))
+
+ with open("first_pass.dict", "w", encoding="utf-8") as f:
+ f.write(dict_1st)
+
+ with open("first_pass.dfa", "w", encoding="utf-8") as f:
+ f.write(dfa_1st)
+ wavfile.write(TMP_PATH, JULIUS_SAMPLE_RATE, julius_wave)
+
+ raw_first_output = sp_inserter.julius_sp_insert(
+ TMP_PATH,
+ "first_pass",
+ hmm_model,
+ model_type,
+ options,
+ )
+
+ forced_phones_with_sp = []
+ try:
+ _, sp_position = sp_inserter.get_sp_inserted_text(raw_first_output)
+
+ for j, (_t, p) in enumerate(zip(base_kan_text, julius_phones)):
+ forced_phones_with_sp.append(p)
+ if j in sp_position:
+ forced_phones_with_sp.append(space_symbols[model_type])
+
+ forced_phones_with_sp = " ".join(forced_phones_with_sp)
+ except Exception:
+ pass
+
+ phones_with_sp = sp_inserter.get_sp_inserterd_phone_seqence(
+ raw_first_output, model_type
+ )
+ if len(phones_with_sp) < 2:
+ forced_phones_with_sp = phones_with_sp
+
+ dict_2nd = sp_inserter.gen_julius_dict_2nd(forced_phones_with_sp, model_type)
+ dfa_2nd = sp_inserter.gen_julius_aliment_dfa(dict_2nd.count("\n"))
+
+ with open("second_pass.dict", "w") as f:
+ f.write(dict_2nd)
+
+ with open("second_pass.dfa", "w") as f:
+ f.write(dfa_2nd)
+
+ raw_second_output = sp_inserter.julius_phone_alignment(
+ TMP_PATH, "second_pass", hmm_model, model_type, options
+ )
+ time_alimented_list = sp_inserter.get_time_alimented_list(raw_second_output)
+
+ assert len(time_alimented_list) > 0, raw_second_output
+
+ for file in TEMP_FILE_LIST:
+ os.remove(file)
+
+ return time_alimented_list
+
+
+def extract_f0(wave: np.ndarray, sr: int, frame_period: float):
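+    """Extract a log-F0 contour with WORLD (harvest); unvoiced frames are left as 0."""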
+ w = wave.astype(np.float64)
+ f0, t = pw.harvest(w, sr, frame_period=frame_period)
+ vuv = f0 != 0
+ f0_log = np.zeros_like(f0)
+ f0_log[vuv] = np.log(f0[vuv])
+ return f0_log
diff --git a/voicevox_engine/experimental/julius4seg/__init__.py b/voicevox_engine/experimental/julius4seg/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/voicevox_engine/experimental/julius4seg/converter.py b/voicevox_engine/experimental/julius4seg/converter.py
new file mode 100644
index 000000000..efc596cad
--- /dev/null
+++ b/voicevox_engine/experimental/julius4seg/converter.py
@@ -0,0 +1,319 @@
+import re
+
+
+def conv2julius(s: str) -> str:
+ """入力の単語の読み(ひらがな)をJuliusの音素列に変換
+ args:
+        s(str): ひらがな文字列
+            "やきにく"
+    returns:
+        (str): Juliusの音素列
+ " y a k i n i k u"
+ """
+ s = s.replace("あぁ", " a a")
+ s = s.replace("いぃ", " i i")
+ s = s.replace("いぇ", " i e")
+ s = s.replace("いゃ", " y a")
+ s = s.replace("うぁ", " u a")
+ s = s.replace("うぃ", " w i")
+ s = s.replace("うぅ", " u:")
+ s = s.replace("うぇ", " w e")
+ s = s.replace("うぉ", " w o")
+ s = s.replace("えぇ", " e e")
+ s = s.replace("おぉ", " o:")
+ s = s.replace("かぁ", " k a:")
+ s = s.replace("がぁ", " g a:")
+ s = s.replace("きぃ", " k i:")
+ s = s.replace("きぇ", " ky e")
+ s = s.replace("きゃ", " ky a")
+ s = s.replace("きゅ", " ky u")
+ s = s.replace("きょ", " ky o")
+ s = s.replace("ぎぃ", " g i:")
+ s = s.replace("ぎぇ", " gy e")
+ s = s.replace("ぎゃ", " gy a")
+ s = s.replace("ぎゅ", " gy u")
+ s = s.replace("ぎょ", " gy o")
+ s = s.replace("くぅ", " k u:")
+ s = s.replace("くゃ", " ky a")
+ s = s.replace("くゅ", " ky u")
+ s = s.replace("くょ", " ky o")
+ s = s.replace("ぐぅ", " g u:")
+ s = s.replace("ぐゃ", " gy a")
+ s = s.replace("ぐゅ", " gy u")
+ s = s.replace("ぐょ", " gy o")
+ s = s.replace("けぇ", " k e:")
+ s = s.replace("げぇ", " g e:")
+ s = s.replace("こぉ", " k o:")
+ s = s.replace("ごぉ", " g o:")
+ s = s.replace("さぁ", " s a:")
+ s = s.replace("ざぁ", " z a:")
+ s = s.replace("しぃ", " sh i:")
+ s = s.replace("しぇ", " sh e")
+ s = s.replace("しゃ", " sh a")
+ s = s.replace("しゅ", " sh u")
+ s = s.replace("しょ", " sh o")
+ s = s.replace("じぃ", " j i:")
+ s = s.replace("じぇ", " j e")
+ s = s.replace("じゃ", " j a")
+ s = s.replace("じゅ", " j u")
+ s = s.replace("じょ", " j o")
+ s = s.replace("すぃ", " s i")
+ s = s.replace("すぅ", " s u:")
+ s = s.replace("すゃ", " sh a")
+ s = s.replace("すゅ", " sh u")
+ s = s.replace("すょ", " sh o")
+ s = s.replace("ずぁ", " z u a")
+ s = s.replace("ずぃ", " z i")
+ s = s.replace("ずぅ", " z u")
+ s = s.replace("ずぅ", " z u:")
+ s = s.replace("ずぇ", " z e")
+ s = s.replace("ずぉ", " z o")
+ s = s.replace("ずゃ", " zy a")
+ s = s.replace("ずゃ", " zy a")
+ s = s.replace("ずゅ", " zy u")
+ s = s.replace("ずゅ", " zy u")
+ s = s.replace("ずょ", " zy o")
+ s = s.replace("ずょ", " zy o")
+ s = s.replace("せぇ", " s e:")
+ s = s.replace("ぜぇ", " z e:")
+ s = s.replace("そぉ", " s o:")
+ s = s.replace("ぞぉ", " z o:")
+ s = s.replace("たぁ", " t a:")
+ s = s.replace("だぁ", " d a:")
+ s = s.replace("ちぃ", " ch i:")
+ s = s.replace("ちぇ", " ch e")
+ s = s.replace("ちゃ", " ch a")
+ s = s.replace("ちゅ", " ch u")
+ s = s.replace("ちょ", " ch o")
+ s = s.replace("ぢぃ", " j i:")
+ s = s.replace("ぢぇ", " j e")
+ s = s.replace("ぢゃ", " j a")
+ s = s.replace("ぢゅ", " j u")
+ s = s.replace("ぢょ", " j o")
+ s = s.replace("つぁ", " ts a")
+ s = s.replace("つぃ", " ts i")
+ s = s.replace("つぅ", " ts u:")
+ s = s.replace("つぇ", " ts e")
+ s = s.replace("つぉ", " ts o")
+ s = s.replace("つゃ", " ch a")
+ s = s.replace("つゅ", " ch u")
+ s = s.replace("つょ", " ch o")
+ s = s.replace("づぅ", " d u:")
+ s = s.replace("づゃ", " zy a")
+ s = s.replace("づゅ", " zy u")
+ s = s.replace("づょ", " zy o")
+ s = s.replace("てぃ", " t i")
+ s = s.replace("てぇ", " t e:")
+ s = s.replace("てぇ", " t e:")
+ s = s.replace("てゃ", " t a")
+ s = s.replace("てゅ", " t u")
+ s = s.replace("てょ", " t o")
+ s = s.replace("でぃ", " d i")
+ s = s.replace("でぇ", " d e:")
+ s = s.replace("でぇ", " d e:")
+ s = s.replace("でゃ", " d a")
+ s = s.replace("でゅ", " d u")
+ s = s.replace("でょ", " d o")
+ s = s.replace("とぅ", " t u")
+ s = s.replace("とぉ", " t o:")
+ s = s.replace("とゃ", " t a")
+ s = s.replace("とゅ", " t u")
+ s = s.replace("とょ", " t o")
+ s = s.replace("どぁ", " d o a")
+ s = s.replace("どぅ", " d u")
+ s = s.replace("どぉ", " d o:")
+ s = s.replace("どぉ", " d o:")
+ s = s.replace("どゃ", " d a")
+ s = s.replace("どゅ", " d u")
+ s = s.replace("どょ", " d o")
+ s = s.replace("なぁ", " n a:")
+ s = s.replace("にぃ", " n i:")
+ s = s.replace("にぇ", " ny e")
+ s = s.replace("にゃ", " ny a")
+ s = s.replace("にゅ", " ny u")
+ s = s.replace("にょ", " ny o")
+ s = s.replace("ぬぅ", " n u:")
+ s = s.replace("ぬゃ", " ny a")
+ s = s.replace("ぬゅ", " ny u")
+ s = s.replace("ぬょ", " ny o")
+ s = s.replace("ねぇ", " n e:")
+ s = s.replace("のぉ", " n o:")
+ s = s.replace("はぁ", " h a:")
+ s = s.replace("ばぁ", " b a:")
+ s = s.replace("ぱぁ", " p a:")
+ s = s.replace("ひぃ", " h i:")
+ s = s.replace("ひぇ", " hy e")
+ s = s.replace("ひゃ", " hy a")
+ s = s.replace("ひゅ", " hy u")
+ s = s.replace("ひょ", " hy o")
+ s = s.replace("びぃ", " b i:")
+ s = s.replace("びぇ", " by e")
+ s = s.replace("びゃ", " by a")
+ s = s.replace("びゅ", " by u")
+ s = s.replace("びょ", " by o")
+ s = s.replace("ぴぃ", " p i:")
+ s = s.replace("ぴぇ", " py e")
+ s = s.replace("ぴゃ", " py a")
+ s = s.replace("ぴゅ", " py u")
+ s = s.replace("ぴょ", " py o")
+ s = s.replace("ふぁ", " f a")
+ s = s.replace("ふぃ", " f i")
+ s = s.replace("ふぅ", " f u")
+ s = s.replace("ふぅ", " f u:")
+ s = s.replace("ふぇ", " f e")
+ s = s.replace("ふぉ", " f o")
+ s = s.replace("ふゃ", " hy a")
+ s = s.replace("ふゃ", " hy a")
+ s = s.replace("ふゃ", " hy a")
+ s = s.replace("ふゅ", " hy u")
+ s = s.replace("ふゅ", " hy u")
+ s = s.replace("ふょ", " hy o")
+ s = s.replace("ふょ", " hy o")
+ s = s.replace("ふょ", " hy o")
+ s = s.replace("ぶぅ", " b u:")
+ s = s.replace("ぶゅ", " by u")
+ s = s.replace("ぷぅ", " p u:")
+ s = s.replace("ぷゃ", " py a")
+ s = s.replace("ぷゅ", " py u")
+ s = s.replace("ぷょ", " py o")
+ s = s.replace("へぇ", " h e:")
+ s = s.replace("べぇ", " b e:")
+ s = s.replace("ぺぇ", " p e:")
+ s = s.replace("ほぉ", " h o:")
+ s = s.replace("ぼぉ", " b o:")
+ s = s.replace("ぽぉ", " p o:")
+ s = s.replace("まぁ", " m a:")
+ s = s.replace("みぃ", " m i:")
+ s = s.replace("みぇ", " my e")
+ s = s.replace("みゃ", " my a")
+ s = s.replace("みゅ", " my u")
+ s = s.replace("みょ", " my o")
+ s = s.replace("むぅ", " m u:")
+ s = s.replace("むゃ", " my a")
+ s = s.replace("むゅ", " my u")
+ s = s.replace("むょ", " my o")
+ s = s.replace("めぇ", " m e:")
+ s = s.replace("もぉ", " m o:")
+ s = s.replace("やぁ", " y a:")
+ s = s.replace("ゆぅ", " y u:")
+ s = s.replace("ゆゃ", " y a:")
+ s = s.replace("ゆゅ", " y u:")
+ s = s.replace("ゆょ", " y o:")
+ s = s.replace("よぉ", " y o:")
+ s = s.replace("らぁ", " r a:")
+ s = s.replace("りぃ", " r i:")
+ s = s.replace("りぇ", " ry e")
+ s = s.replace("りゃ", " ry a")
+ s = s.replace("りゅ", " ry u")
+ s = s.replace("りょ", " ry o")
+ s = s.replace("るぅ", " r u:")
+ s = s.replace("るゃ", " ry a")
+ s = s.replace("るゅ", " ry u")
+ s = s.replace("るょ", " ry o")
+ s = s.replace("れぇ", " r e:")
+ s = s.replace("ろぉ", " r o:")
+ s = s.replace("わぁ", " w a:")
+ s = s.replace("をぉ", " o:")
+ s = s.replace("ゔぁ", " b a")
+ s = s.replace("ゔぃ", " b i")
+ s = s.replace("ゔぇ", " b e")
+ s = s.replace("ゔぉ", " b o")
+ s = s.replace("ゔゅ", " by u")
+
+ # 1音からなる変換規則
+ s = s.replace("あ", " a")
+ s = s.replace("い", " i")
+ s = s.replace("う", " u")
+ s = s.replace("え", " e")
+ s = s.replace("お", " o")
+ s = s.replace("か", " k a")
+ s = s.replace("き", " k i")
+ s = s.replace("く", " k u")
+ s = s.replace("け", " k e")
+ s = s.replace("こ", " k o")
+ s = s.replace("さ", " s a")
+ s = s.replace("し", " sh i")
+ s = s.replace("す", " s u")
+ s = s.replace("せ", " s e")
+ s = s.replace("そ", " s o")
+ s = s.replace("た", " t a")
+ s = s.replace("ち", " ch i")
+ s = s.replace("つ", " ts u")
+ s = s.replace("て", " t e")
+ s = s.replace("と", " t o")
+ s = s.replace("な", " n a")
+ s = s.replace("に", " n i")
+ s = s.replace("ぬ", " n u")
+ s = s.replace("ね", " n e")
+ s = s.replace("の", " n o")
+ s = s.replace("は", " h a")
+ s = s.replace("ひ", " h i")
+ s = s.replace("ふ", " f u")
+ s = s.replace("へ", " h e")
+ s = s.replace("ほ", " h o")
+ s = s.replace("ま", " m a")
+ s = s.replace("み", " m i")
+ s = s.replace("む", " m u")
+ s = s.replace("め", " m e")
+ s = s.replace("も", " m o")
+ s = s.replace("ら", " r a")
+ s = s.replace("り", " r i")
+ s = s.replace("る", " r u")
+ s = s.replace("れ", " r e")
+ s = s.replace("ろ", " r o")
+ s = s.replace("が", " g a")
+ s = s.replace("ぎ", " g i")
+ s = s.replace("ぐ", " g u")
+ s = s.replace("げ", " g e")
+ s = s.replace("ご", " g o")
+ s = s.replace("ざ", " z a")
+ s = s.replace("じ", " j i")
+ s = s.replace("ず", " z u")
+ s = s.replace("ぜ", " z e")
+ s = s.replace("ぞ", " z o")
+ s = s.replace("だ", " d a")
+ s = s.replace("ぢ", " j i")
+ s = s.replace("づ", " z u")
+ s = s.replace("で", " d e")
+ s = s.replace("ど", " d o")
+ s = s.replace("ば", " b a")
+ s = s.replace("び", " b i")
+ s = s.replace("ぶ", " b u")
+ s = s.replace("べ", " b e")
+ s = s.replace("ぼ", " b o")
+ s = s.replace("ぱ", " p a")
+ s = s.replace("ぴ", " p i")
+ s = s.replace("ぷ", " p u")
+ s = s.replace("ぺ", " p e")
+ s = s.replace("ぽ", " p o")
+ s = s.replace("や", " y a")
+ s = s.replace("ゆ", " y u")
+ s = s.replace("よ", " y o")
+ s = s.replace("わ", " w a")
+ s = s.replace("を", " o")
+ s = s.replace("ん", " N")
+ s = s.replace("っ", " q")
+ s = s.replace("ー", ":")
+ s = s.replace("ゔ", " b u")
+
+ s = s.replace("ぁ", " a")
+ s = s.replace("ぃ", " i")
+ s = s.replace("ぅ", " u")
+ s = s.replace("ぇ", " e")
+ s = s.replace("ぉ", " o")
+ s = s.replace("ゎ", " w a")
+
+ s = s[1:]
+
+ s = re.sub(r":+", ":", s)
+
+ return s
+
+
+def conv2openjtalk(s: str) -> str:
+ """入力の単語の読み(ひらがな)をOpenJTalkのような音素列に変換"""
+ s = conv2julius(s)
+
+ s = re.sub(r"(.):", r"\1 \1", s)
+
+ return s
diff --git a/voicevox_engine/experimental/julius4seg/sp_inserter.py b/voicevox_engine/experimental/julius4seg/sp_inserter.py
new file mode 100644
index 000000000..cbc94dd35
--- /dev/null
+++ b/voicevox_engine/experimental/julius4seg/sp_inserter.py
@@ -0,0 +1,353 @@
+import re
+import subprocess
+import sys
+from enum import Enum
+from itertools import chain
+from typing import List, Optional, Tuple
+
+
+class ModelType(str, Enum):
+ gmm = "gmm"
+ dnn = "dnn"
+
+
+JULIUS_ROOT = "."
+
+begin_silent_symbols = {ModelType.gmm: "silB", ModelType.dnn: "sp_B"}
+end_silent_symbols = {ModelType.gmm: "silE", ModelType.dnn: "sp_E"}
+space_symbols = {ModelType.gmm: "sp", ModelType.dnn: "sp_S"}
+
+
+def get_os_dependent_directory() -> str:
+ """Juluis Segmentaion-Kitのディレクトリ名をOSの種類から取得
+ returns:
+ (str): OS依存のパスの一部
+ """
+ if sys.platform.startswith("win") or sys.platform.startswith("cygwin"):
+ return "windows"
+ elif sys.platform.startswith("darwin"):
+ return "osx"
+ elif sys.platform.startswith("linux"):
+ return "linux"
+
+
+def get_os_dependent_exec() -> str:
+ """Juliusの実行ファイル名を取得
+ returns:
+ (str): Juliusの実行ファイル名
+ """
+ if sys.platform.startswith("win") or sys.platform.startswith("cygwin"):
+ return "julius.exe"
+ else:
+ return "julius"
+
+
+def get_os_dependent_echo(filename: str) -> list:
+ """Get parameters of echo referencing platforms
+ Returns:
+ list[str]: echo parameters
+ """
+ if sys.platform.startswith("win") or sys.platform.startswith("cygwin"):
+ return ["cmd.exe", "/c", "echo " + filename]
+ else:
+ return ["echo", filename]
+
+
+def kata2hira(kana: str) -> str:
+ """ヵ,ヶ以外のカタカナをひらがなに変換
+ args:
+ kana(str): カタカナ文字列
+ "ヤキニク"
+ returns:
+ (str): ひらがな文字列
+ "やきにく"
+ """
+ return "".join(
+ [
+ chr(ord(c) + ord("あ") - ord("ア")) if ord("ァ") <= ord(c) <= ord("ヴ") else c
+ for c in kana
+ ]
+ )
+
+
+def gen_julius_dict_1st(
+ text_symbols: List[str], word_phones: List[str], model_type: ModelType
+) -> str:
+ """テキストのシンボルと読みの音素のJulius dictファイルの中身を生成
+ args:
+ text_symbols ([str]): 単語のシンボル
+ ['今回', 'は']
+ word_phones ([str]): 単語の音素系列
+ ['k o N k a i', 'w a']
+ returns:
+ (str): Juliusのdictファイルの中身
+ """
+ tmp = []
+ finit = len(text_symbols)
+
+ for i, zipped in enumerate(zip(text_symbols, word_phones)):
+ tmp.append("{}\t[{}]\t{}".format(i * 2, *zipped))
+ if i + 1 != finit:
+ tmp.append(
+ "{}\t[{}]\t{}".format(
+ i * 2 + 1, "sp_{}".format(i), space_symbols[model_type]
+ )
+ )
+
+ # append sp and Start, End symbol
+ tmp.append(
+ "{}\t[{}]\t{}".format(i * 2 + 1, "", begin_silent_symbols[model_type])
+ )
+ tmp.append(
+ "{}\t[{}]\t{}".format((i + 1) * 2, "", end_silent_symbols[model_type])
+ )
+
+ return "\n".join(tmp) + "\n"
+
+
+def gen_julius_dfa(number_of_words: int) -> str:
+ """単語数から遷移のためのJuliusのdfaファイルの中身を生成
+ args:
+ number_of_words (int): 遷移する単語の単語数
+ returns:
+ (str): Juliusのdfaファイルの中身
+ """
+ i = 0
+ current_word = number_of_words - 3
+ isLast = False
+ tmp = []
+ while True:
+ if i == 0:
+ tmp.append("{} {} {} {} {}".format(i, number_of_words - 1, i + 1, 0, 1))
+ i += 1
+ elif i > 0 and not isLast:
+ tmp.append("{} {} {} {} {}".format(i, current_word, i + 1, 0, 0))
+ current_word -= 1
+ isLast = current_word == -1
+ i += 1
+ elif i > 0 and isLast:
+ tmp.append("{} {} {} {} {}".format(i, i - 1, i + 1, 0, 0))
+ tmp.append("{} {} {} {} {}".format(i + 1, -1, -1, 1, 0))
+ break
+
+ return "\n".join(tmp) + "\n"
+
+
+def gen_julius_dict_2nd(phone_seqence: str, model_type: ModelType) -> str:
+ """音素系列から強制アライメントのためのdictファイルの中身を生成
+ args:
+ phone_seqence (str):
+ 'k o N k a i w a '
+ returns:
+ (str): Juliusのdictファイルの中身
+ """
+ phone_seqences = phone_seqence.split(f" {space_symbols[model_type]} ")
+ return (
+ "\n".join(
+ [
+ f"{i}\t[w_{i}]\t"
+ + phone_seqence
+ + (
+ f" {space_symbols[model_type]}"
+ if i != len(phone_seqences) - 1
+ else ""
+ )
+ for i, phone_seqence in enumerate(phone_seqences)
+ ]
+ + [
+ f"{len(phone_seqences)}\t[w_{len(phone_seqences)}]\t"
+ + begin_silent_symbols[model_type]
+ ]
+ + [
+ f"{len(phone_seqences) + 1}\t[w_{len(phone_seqences) + 1}]\t"
+ + end_silent_symbols[model_type]
+ ]
+ )
+ + "\n"
+ )
+
+
+def gen_julius_aliment_dfa(number_of_words: int) -> str:
+ """強制アライメント用のdfaファイルの中身を生成
+ returns:
+ (str): Juliusのdfaファイルの中身
+ """
+ return gen_julius_dfa(number_of_words)
+
+
+def julius_sp_insert(
+ target_wav_file: str,
+ aliment_file_signiture: str,
+ model_path: str,
+ model_type: ModelType,
+ options: Optional[List[str]],
+) -> List[str]:
+ if options is None:
+ options = []
+
+ julius_args = {
+ "-h": model_path,
+ "-input": "file",
+ "-debug": "",
+ "-gram": aliment_file_signiture,
+ "-nostrip": "",
+ "-spmodel": space_symbols[model_type],
+ }
+
+ file_echo_p = subprocess.Popen(
+ get_os_dependent_echo(target_wav_file),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.DEVNULL,
+ )
+ julius_p = subprocess.Popen(
+ " ".join(
+ [
+ str(
+ JULIUS_ROOT
+ / "bin"
+ / get_os_dependent_directory()
+ / get_os_dependent_exec()
+ ),
+ *list(chain.from_iterable([[k, v] for k, v in julius_args.items()])),
+ ]
+ + options
+ ).split(),
+ stdin=file_echo_p.stdout,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.DEVNULL,
+ )
+ file_echo_p.stdout.close()
+ return julius_p.communicate()[0].decode("utf-8").split("\n")
+
+
+def get_sp_inserted_text(raw_output: List[str]) -> Tuple[str, List[int]]:
+ """デコード結果からsp挿入後のテキストとspのインデックスを取得する
+ args:
+ raw_output: `julius_sp_insert`の出力
+ returns:
+ Tuple(str, [int]): デコード結果とspのindex
+ """
+ r = re.compile(" (.*) ")
+ pass1_best = next(s for s in raw_output if s.startswith("pass1_best"))
+ matched = r.search(pass1_best)
+ if matched is None:
+ raise Exception("Decode Failed")
+
+ return (
+ re.sub(r"sp_[\d+]", "", matched.group(1)),
+ [int(s.split("_")[1]) for s in matched.group().split() if "sp_" in s],
+ )
+
+
+def get_sp_inserterd_phone_seqence(raw_output: List[str], model_type: ModelType) -> str:
+    pass1_best_phonemeseq = next(
+        s.rstrip("\r") for s in raw_output if s.startswith("pass1_best_phonemeseq")
+    )
+
+ complete_re = re.compile(
+ begin_silent_symbols[model_type]
+ + r" \| (.*) \| "
+ + end_silent_symbols[model_type]
+ )
+ failed_re_1 = re.compile(
+ end_silent_symbols[model_type]
+ + r" \| (.*) \| "
+ + begin_silent_symbols[model_type]
+ )
+ failed_re_2 = re.compile(end_silent_symbols[model_type] + r" \| (.*)")
+
+ if complete_re.search(pass1_best_phonemeseq) is not None:
+ matched = complete_re.search(pass1_best_phonemeseq)
+ elif failed_re_1.search(pass1_best_phonemeseq) is not None:
+ matched = failed_re_1.search(pass1_best_phonemeseq)
+ elif failed_re_2.search(pass1_best_phonemeseq) is not None:
+ matched = failed_re_2.search(pass1_best_phonemeseq)
+ else:
+ raise Exception("Decode Failed")
+
+ tmp = matched.group(1)
+ return " ".join([s.strip() for s in tmp.split("|")])
+
+
+def julius_phone_alignment(
+ target_wav_file: str,
+ aliment_file_signiture: str,
+ model_path: str,
+ model_type: ModelType,
+ options: Optional[List[str]],
+) -> List[str]:
+ if options is None:
+ options = []
+
+ julius_args = {
+ "-h": model_path,
+ "-palign": "",
+ "-input": "file",
+ "-gram": aliment_file_signiture,
+ "-nostrip": "",
+ "-n": "10",
+ "-s": "10000",
+ "-sb": "5000",
+ "-spmodel": space_symbols[model_type],
+ }
+
+ file_echo_p = subprocess.Popen(
+ ["echo", target_wav_file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL
+ )
+ julius_p = subprocess.Popen(
+ " ".join(
+ [
+ str(
+ JULIUS_ROOT
+ / "bin"
+ / get_os_dependent_directory()
+ / get_os_dependent_exec()
+ ),
+ *list(chain.from_iterable([[k, v] for k, v in julius_args.items()])),
+ ]
+ + options
+ ).split(),
+ stdin=file_echo_p.stdout,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.DEVNULL,
+ )
+ file_echo_p.stdout.close()
+ return julius_p.communicate()[0].decode("utf-8").split("\n")
+
+
+def get_time_alimented_list(raw_output: List[str]) -> List[Tuple[str, str, str]]:
+ r = re.compile(
+ r"\[\s*(\d+)\s+(\d+)\s*\]"
+ r"\s*[\-]*[\d,\.]+\s*"
+ r"\{?([\w,\:]+)\-?([\w,\:]*)\+?([\w,\:]*)\}?\[?[\w,\:,\-,\+]*\]?$"
+ )
+
+ def get_phoneme(left: str, center: str, right: str):
+ if len(center) == 0 and len(right) == 0: # monophone
+ return left
+ elif len(center) > 0:
+ return center
+ elif len(center) == 0:
+ return left
+ else:
+ raise ValueError(f"{left} {center} {right}")
+
+ return [
+ (s.group(1), s.group(2), get_phoneme(s.group(3), s.group(4), s.group(5)))
+ for s in map(lambda x: r.search(x.rstrip("\r")), raw_output)
+ if s is not None
+ ]
+
+
+def frame_to_second(time_list: List[Tuple[str, str, str]]):
+ return [
+ (
+ f"{int(start) * 0.01 + (0.0125 if i > 0 else 0):.4f}",
+ f"{(int(end) + 1) * 0.01 + 0.0125:.4f}",
+ phoneme,
+ )
+ for i, (start, end, phoneme) in enumerate(time_list)
+ ]
diff --git a/voicevox_engine/experimental/julius4seg/sp_remover.py b/voicevox_engine/experimental/julius4seg/sp_remover.py
new file mode 100644
index 000000000..50838c650
--- /dev/null
+++ b/voicevox_engine/experimental/julius4seg/sp_remover.py
@@ -0,0 +1,64 @@
+import struct
+import wave
+from typing import List
+
+# 有声音素を削らないためのマージン
+MARGIN = 5
+
+
+def get_sp_segment(time_list: List[str]) -> List[List[int]]:
+ """音素セグメントリストから無音区間の部分のみを抽出
+ args:
+ time_list ([str]): 音素セグメントリスト
+ returns:
+ [[int]]: 無音区間の初めと終わりのフレームのリスト
+ """
+ sps = [
+ list(map(int, s.split()[:2]))
+ for s in time_list
+ if "silB" in s or "silE" in s or "sp" in s
+ ]
+ return sps
+
+
+def get_wav_sp_removed(
+ wav_file_name: str,
+ sp_segment: List[List[int]],
+ only_edge: bool = False,
+ start_margin: int = MARGIN,
+ end_margin: int = MARGIN,
+) -> List[int]:
+ with wave.open(wav_file_name) as f:
+ n = f.getnframes()
+ data = struct.unpack("h" * n, f.readframes(n))
+
+ removed = []
+
+ seg_start = 0
+
+ if only_edge:
+ tmp = sp_segment[0][1] * 10 - start_margin
+ seg_start = tmp if tmp > 0 else sp_segment[0][0] * 10
+
+ tmp = sp_segment[-1][0] * 10 + end_margin
+ seg_end = tmp if tmp < sp_segment[-1][1] * 10 else sp_segment[-1][1] * 10
+
+ removed.extend(
+ data[int(seg_start / 1000 * 16000) : int(seg_end / 1000 * 16000)]
+ )
+ else:
+ for i, seg in enumerate(sp_segment):
+ if i == 0:
+ seg_start = seg[1] * 10 - MARGIN # ms
+ continue
+
+ seg_end = seg[0] * 10 + MARGIN
+
+ removed.extend(
+ data[int(seg_start / 1000 * 16000) : int(seg_end / 1000 * 16000)]
+ )
+
+ if i != len(sp_segment) - 1:
+ seg_start = seg[1] * 10 - MARGIN
+
+ return removed
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
index add5cf6ef..fa54e387a 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -1,10 +1,21 @@
+from copy import deepcopy
from itertools import chain
from typing import List, Optional, Tuple
+from typing import IO
import numpy
from scipy.signal import resample
+from voicevox_engine.experimental.guided_extractor import (
+ PhraseInfo,
+ extract_guided_feature,
+ get_normalize_diff,
+ resample_ts,
+)
+from voicevox_engine.experimental.julius4seg.sp_inserter import frame_to_second
+
from ..acoustic_feature_extractor import OjtPhoneme
+from ..kana_parser import create_kana
from ..model import AccentPhrase, AudioQuery, Mora
from .synthesis_engine_base import SynthesisEngineBase
@@ -476,3 +487,121 @@ def _synthesis_impl(self, query: AudioQuery, speaker_id: int):
wave = numpy.array([wave, wave]).T
return wave
+
+ def guided_synthesis(
+ self,
+ query: AudioQuery,
+ speaker: int,
+ audio_file: IO,
+ normalize: int,
+ ):
+ f0, phonemes = extract_guided_feature(audio_file, query.kana)
+
+ phone_list = numpy.zeros((len(f0), OjtPhoneme.num_phoneme), dtype=numpy.float32)
+
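+        # fill the frame-level one-hot phoneme matrix from the forced-alignment segments;
+        # silences (silB/silE/sp) are mapped to pau and their f0 is zeroed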
+ for s, e, p in phonemes:
+ s, e = (resample_ts(v) for v in (s, e))
+ if p == "silB":
+ f0[:e] = 0.0
+ s += 1
+ p = "pau"
+ elif p == "silE":
+ f0[s:] = 0.0
+ p = "pau"
+ elif p == "sp":
+ f0[s:e] = 0.0
+ p = "pau"
+ elif p == "q":
+ p = "cl"
+ phone_list[s - 1 : e] = OjtPhoneme(start=s, end=e, phoneme=p).onehot
+
+ if normalize:
+ f0 += get_normalize_diff(
+ engine=self, kana=query.kana, f0=f0, speaker_id=speaker
+ )
+
+ f0 *= 2 ** query.pitchScale
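+        # clamp voiced log-F0 to a plausible range (roughly exp(3) ≈ 20 Hz to exp(6.5) ≈ 665 Hz)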
+ f0[f0 > 6.5] = 6.5
+ f0[(0 < f0) & (f0 < 3)] = 3.0
+
+ f0 = resample(f0, int(len(f0) / query.speedScale))
+ phone_list = resample(phone_list, int(len(phone_list) / query.speedScale))
+
+ wave = self.decode_forwarder(
+ length=phone_list.shape[0],
+ phoneme_size=phone_list.shape[1],
+ f0=f0[:, numpy.newaxis].astype(numpy.float32),
+ phoneme=phone_list,
+ speaker_id=numpy.array([speaker], dtype=numpy.int64).reshape(-1),
+ )
+
+ if query.volumeScale != 1:
+ wave *= query.volumeScale
+
+ if query.outputSamplingRate != self.default_sampling_rate:
+ wave = resample(
+ wave,
+ query.outputSamplingRate * len(wave) // self.default_sampling_rate,
+ )
+
+ if query.outputStereo:
+ wave = numpy.array([wave, wave]).T
+
+ return wave
+
+ def guided_accent_phrases(
+ self,
+ accent_phrases: List[AccentPhrase],
+ speaker: int,
+ audio_file: IO,
+ normalize: int,
+ ) -> List[AccentPhrase]:
+ kana = create_kana(accent_phrases=accent_phrases)
+ f0, phonemes = extract_guided_feature(audio_file, kana)
+ timed_phonemes = frame_to_second(deepcopy(phonemes))
+
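+        # average the voiced log-F0 over each aligned phoneme segment and take its duration in seconds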
+ phrase_info = []
+ for ((s, e, p), (ts, te, _tp)) in zip(phonemes, timed_phonemes):
+ if p not in unvoiced_mora_phoneme_list:
+ clip = f0[resample_ts(s) : resample_ts(e)]
+ clip = clip[clip != 0]
+ pitch = numpy.average(clip) if len(clip) != 0 else 0
+ else:
+ pitch = 0
+ pitch = 0 if numpy.isnan(pitch) else pitch
+ length = float(te) - float(ts)
+ phrase_info.append(PhraseInfo(pitch, length, p))
+
+ if normalize:
+ normalize_diff = get_normalize_diff(
+ engine=self, kana=kana, f0=f0, speaker_id=speaker
+ )
+ for p in phrase_info:
+ p.pitch += normalize_diff
+
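+        # phrase_info[0] is the leading silence (silB), so mora assignment starts at index 1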
+ idx = 1
+ for phrase in accent_phrases:
+ for mora in phrase.moras:
+ if mora.consonant is not None:
+ mora.pitch = (
+ phrase_info[idx].pitch + phrase_info[idx + 1].pitch
+ ) / 2
+ mora.consonant_length = phrase_info[idx].length
+ mora.vowel_length = phrase_info[idx + 1].length
+ idx += 2
+ else:
+ mora.pitch = phrase_info[idx].pitch
+ mora.vowel_length = phrase_info[idx].length
+ idx += 1
+ if phrase_info[idx].phoneme == "sp":
+ phrase.pause_mora = Mora(
+ text="、",
+ consonant=None,
+ consonant_length=None,
+ vowel="pau",
+ vowel_length=phrase_info[idx].length,
+ pitch=0,
+ )
+ idx += 1
+
+ return accent_phrases
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py
index ba567bd44..c84a213fe 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py
@@ -1,6 +1,7 @@
import copy
from abc import ABCMeta, abstractmethod
from typing import List, Optional
+from typing import IO
from .. import full_context_label
from ..full_context_label import extract_full_context_label
@@ -224,3 +225,23 @@ def _synthesis_impl(self, query: AudioQuery, speaker_id: int):
音声合成結果
"""
raise NotImplementedError()
+
+ @abstractmethod
+ def guided_synthesis(
+ self,
+ query: AudioQuery,
+ speaker: int,
+ audio_file: IO,
+ normalize: int,
+ ):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def guided_accent_phrases(
+ self,
+ accent_phrases: List[AccentPhrase],
+ speaker: int,
+ audio_file: IO,
+ normalize: int,
+ ) -> List[AccentPhrase]:
+ raise NotImplementedError()