diff --git a/.gitignore b/.gitignore
index 9e8eea619..63a996ab7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,9 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
+
+# Guided Synthesis temp files
+/voicevox_engine/experimental/dictation-kit*
+first_pass*
+second_pass*
+tmp.wav
\ No newline at end of file
diff --git a/README.md b/README.md
index f228b39cd..a397c6f5a 100644
--- a/README.md
+++ b/README.md
@@ -167,6 +167,37 @@ curl -s \
> audio.wav
```
+### Guided Synthesis
+Currently there are two APIs that accept an uploaded audio file and return the corresponding synthesis information.
+Both work best when `is_kana` is set to `true` and the `kana` field of an `AudioQuery` is used.
+The kana text can also be obtained using the AquesTalk-style notation described above.
+```bash
+# Returns an audio file synthesized with reference to the uploaded audio
+# This example needs a recording whose content is
+# "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"
+
+curl -L -X POST 'localhost:50021/guided_synthesis' \
+ -F 'kana="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
+ -F 'speaker_id="5"' \
+ -F 'audio_file=@"/full_path_to_your_recording"' \
+ -F 'normalize="true"' \
+ -F 'stereo="true"' \
+ -F 'sample_rate="24000"' \
+ -F 'volume_scale="1"' \
+ -F 'pitch_scale="0"' \
+ -F 'speed_scale="1"'
+
+# Returns a list of AccentPhrases
+
+curl -L -X POST 'localhost:50021/guided_accent_phrase' \
+ -F 'text="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
+ -F 'speaker="5"' \
+ -F 'audio_file=@"/full_path_to_your_recording"' \
+ -F 'normalize="true"' \
+ -F 'is_kana="true"' \
+ -F 'enable_interrogative="false"'
+```
+
### 話者の追加情報を取得するサンプルコード
追加情報の中の portrait.png を取得するコードです。
diff --git a/run.py b/run.py
index 3dafb4816..d166827a7 100644
--- a/run.py
+++ b/run.py
@@ -17,7 +17,7 @@ import soundfile
import uvicorn
-from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.params import Query
from pydantic import ValidationError
@@ -215,6 +215,76 @@ def accent_phrases(
else:
return engine.create_accent_phrases(text, speaker_id=speaker)
+ @app.post(
+ "/guided_accent_phrase",
+ response_model=List[AccentPhrase],
+ tags=["クエリ編集"],
+ summary="Create Accent Phrase from External Audio",
+ )
+ def guided_accent_phrase(
+ text: str = Form(...), # noqa:B008
+ speaker: int = Form(...), # noqa:B008
+ is_kana: bool = Form(...), # noqa:B008
+ audio_file: UploadFile = File(...), # noqa: B008
+ normalize: bool = Form(...), # noqa:B008
+ core_version: Optional[str] = None,
+ ):
+ """
+ Extracts f0 and aligned phonemes, then calculates the average f0 for every phoneme.
+ Returns a list of AccentPhrase.
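For reference, a minimal Python client equivalent to the curl examples above might look like the sketch below. This is illustrative only: `requests` is assumed to be installed, the recording path and output filename are placeholders, and the engine must have been started with `--enable_guided_synthesis` (both endpoints otherwise return 404).

```python
import requests

KANA = "マ'タ、ト'オジノヨオニ、ゴダイミョオオ'オト、ヨ'/バレ'ル、シュ'ヨオナ、ミョオ'オオ/ノ'、チュ'ウオオニ、ハイサレルコ'/トモ'オオイ"
BASE = "http://localhost:50021"

with open("/full_path_to_your_recording", "rb") as wav:
    # Frame-level resynthesis guided by the uploaded recording
    synth = requests.post(
        f"{BASE}/guided_synthesis",
        data={
            "kana": KANA, "speaker_id": 5, "normalize": "true",
            "stereo": "true", "sample_rate": 24000,
            "volume_scale": 1, "pitch_scale": 0, "speed_scale": 1,
        },
        files={"audio_file": wav},
    )
    synth.raise_for_status()
    with open("guided.wav", "wb") as out:
        out.write(synth.content)

    # Phoneme-level accent phrases extracted from the same recording
    wav.seek(0)
    phrases = requests.post(
        f"{BASE}/guided_accent_phrase",
        data={"text": KANA, "speaker": 5, "is_kana": "true", "normalize": "true"},
        files={"audio_file": wav},
    )
    phrases.raise_for_status()
    print(phrases.json()[0]["moras"][0])
```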
+ **This API works in the resolution of phonemes.** + """ + if not args.enable_guided_synthesis: + raise HTTPException( + status_code=404, + detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。", + ) + engine = get_engine(core_version) + if is_kana: + try: + accent_phrases = parse_kana(text) + except ParseKanaError as err: + raise HTTPException( + status_code=400, + detail=ParseKanaBadRequest(err).dict(), + ) + else: + accent_phrases = engine.create_accent_phrases( + text, + speaker_id=speaker, + ) + + try: + return engine.guided_accent_phrases( + accent_phrases=accent_phrases, + speaker=speaker, + audio_file=audio_file.file, + normalize=normalize, + ) + except ParseKanaError as err: + raise HTTPException( + status_code=422, + detail=ParseKanaBadRequest(err).dict(), + ) + except StopIteration: + print(traceback.format_exc()) + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment", + ) + except Exception as e: + print(traceback.format_exc()) + if str(e) == "Decode Failed": + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment", + ) + else: + raise HTTPException( + status_code=500, + detail="Internal Server Error", + ) + @app.post( "/mora_data", response_model=List[AccentPhrase], @@ -366,7 +436,7 @@ def multi_synthesis( format="WAV", ) wav_file.seek(0) - zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read()) + zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read()) return FileResponse(f.name, media_type="application/zip") @@ -420,6 +490,90 @@ def _synthesis_morphing( return FileResponse(f.name, media_type="audio/wav") + @app.post( + "/guided_synthesis", + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["音声合成"], + summary="Audio synthesis guided by external audio and phonemes", + ) + def guided_synthesis( + kana: str = Form(...), # noqa: B008 + speaker_id: int = Form(...), # noqa: B008 + normalize: bool = Form(...), # noqa: B008 + audio_file: UploadFile = File(...), # noqa: B008 + stereo: bool = Form(...), # noqa: B008 + sample_rate: int = Form(...), # noqa: B008 + volume_scale: float = Form(...), # noqa: B008 + pitch_scale: float = Form(...), # noqa: B008 + speed_scale: float = Form(...), # noqa: B008 + core_version: Optional[str] = None, + ): + """ + Extracts and passes the f0 and aligned phonemes to engine. + Returns the synthesized audio. 
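Both guided endpoints lean on the existing AquesTalk-style kana notation, so it may help to see what `parse_kana` and `create_kana` (the helpers already used in this diff) do with it. A rough sketch with an assumed input string:

```python
from voicevox_engine.kana_parser import create_kana, parse_kana

# "'" marks the accent nucleus, "、" inserts a pause, "/" separates phrases.
accent_phrases = parse_kana("コンニチワ'、オンセイゴ'オセイデス")
for phrase in accent_phrases:
    print([mora.text for mora in phrase.moras], "accent:", phrase.accent)

# Round-trip back to the kana text that AudioQuery.kana carries.
print(create_kana(accent_phrases))
```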
+ **This API works in the resolution of frame.** + """ + if not args.enable_guided_synthesis: + raise HTTPException( + status_code=404, + detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。", + ) + engine = get_engine(core_version) + try: + accent_phrases = parse_kana(kana) + query = AudioQuery( + accent_phrases=accent_phrases, + speedScale=speed_scale, + pitchScale=pitch_scale, + intonationScale=1, + volumeScale=volume_scale, + prePhonemeLength=0.1, + postPhonemeLength=0.1, + outputSamplingRate=sample_rate, + outputStereo=stereo, + kana=kana, + ) + wave = engine.guided_synthesis( + audio_file=audio_file.file, + query=query, + speaker=speaker_id, + normalize=normalize, + ) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV") + + return FileResponse(f.name, media_type="audio/wav") + except ParseKanaError as err: + raise HTTPException( + status_code=400, + detail=ParseKanaBadRequest(err).dict(), + ) + except StopIteration: + print(traceback.format_exc()) + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment.", + ) + except Exception as e: + print(traceback.format_exc()) + if str(e) == "Decode Failed": + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment.", + ) + else: + raise HTTPException( + status_code=500, + detail="Internal Server Error.", + ) + @app.post( "/connect_waves", response_class=FileResponse, @@ -665,6 +819,7 @@ def supported_devices( parser.add_argument("--runtime_dir", type=Path, default=None, action="append") parser.add_argument("--enable_mock", action="store_true") parser.add_argument("--enable_cancellable_synthesis", action="store_true") + parser.add_argument("--enable_guided_synthesis", action="store_true") parser.add_argument("--init_processes", type=int, default=2) # 引数へcpu_num_threadsの指定がなければ、環境変数をロールします。 diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index ed710274f..d291aeea2 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -1,5 +1,6 @@ from logging import getLogger from typing import Any, Dict, List, Optional +from typing.io import IO import numpy as np from pyopenjtalk import tts @@ -130,3 +131,50 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray: wave, sr = tts(text) wave = resample(wave, 24000 * len(wave) // 48000) return wave + + def guided_synthesis( + self, + query: AudioQuery, + speaker: int, + audio_file: IO, + normalize: int, + ) -> np.ndarray: + """ + Open jtalk doesn't have a guided function [Mock] + simply calling mock synthesis + + Parameters + ---------- + query + speaker + audio_file + normalize + + Returns + ------- + + """ + return self.synthesis(query=query, speaker_id=speaker) + + def guided_accent_phrases( + self, + accent_phrases: List[AccentPhrase], + speaker: int, + audio_file: IO, + normalize: int, + ) -> List[AccentPhrase]: + """ + guided_accent_phrases 入力accent_phrasesを変更せずにそのまま返します [Mock] + + Parameters + ---------- + query + speaker + audio_file + normalize + + Returns + ------- + + """ + return accent_phrases diff --git a/voicevox_engine/experimental/__init__.py b/voicevox_engine/experimental/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/voicevox_engine/experimental/guided_extractor.py b/voicevox_engine/experimental/guided_extractor.py new file mode 100644 index 000000000..8e4c280d2 --- /dev/null +++ b/voicevox_engine/experimental/guided_extractor.py @@ 
-0,0 +1,206 @@ +import os +import re +import tarfile +from os.path import exists +from pathlib import PurePath +from typing.io import IO +from urllib.request import urlretrieve + +import numpy as np +import pkg_resources +import pyworld as pw +from scipy.io import wavfile +from scipy.signal import resample + +from voicevox_engine.experimental.julius4seg import converter, sp_inserter +from voicevox_engine.experimental.julius4seg.sp_inserter import ModelType, space_symbols +from voicevox_engine.kana_parser import parse_kana + +JULIUS_SAMPLE_RATE = 16000 +FRAME_PERIOD = 1.0 +PUNCTUATION = ["_", "'", "/", "、"] +SIL_SYMBOL = ["silB", "silE", "sp"] +TMP_PATH = "tmp.wav" +UUT_ID = "tmp" +TEMP_FILE_LIST = [ + "first_pass.dfa", + "first_pass.dict", + "second_pass.dfa", + "second_pass.dict", + "tmp.wav", +] + +_JULIUS_DICTATION_URL = "https://github.com/julius-speech/dictation-kit/archive/refs/tags/dictation-kit-v4.3.1.tar.gz" # noqa: B950 +JULIUS_DICTATION_DIR = os.environ.get( + "JULIUS_DICTATION_DIR", + # they did put two "dictation-kit"s in extracted folder name + pkg_resources.resource_filename(__name__, "dictation-kit-dictation-kit-v4.3.1"), +) + +sp_inserter.JULIUS_ROOT = PurePath(JULIUS_DICTATION_DIR) + + +class PhraseInfo: + def __init__(self, pitch: float, length: float, phoneme: str): + self.pitch = pitch + self.length = length + self.phoneme = phoneme + + +def _lazy_init(): + if not exists(JULIUS_DICTATION_DIR): + print("Julius not found, Downloading") + _extract_julius() + + +def _extract_julius(): + global JULIUS_DICTATION_DIR + filename = pkg_resources.resource_filename(__name__, "dictation-kit.tar.gz") + print("Downloading Julius...", _JULIUS_DICTATION_URL) + urlretrieve(_JULIUS_DICTATION_URL, filename) + print("Extracting Julius...", JULIUS_DICTATION_DIR) + with tarfile.open(filename, mode="r|gz") as f: + f.extractall(path=pkg_resources.resource_filename(__name__, "")) + JULIUS_DICTATION_DIR = pkg_resources.resource_filename( + __name__, "dictation-kit-dictation-kit-v4.3.1" + ) + sp_inserter.JULIUS_ROOT = PurePath(JULIUS_DICTATION_DIR) + os.remove(filename) + + +def resample_ts(timestamp: str): + """ + 0.9375 = 24000 / 256 / 1000 * 10 + 10 is for julius4seg produces timestamp in 10 ms + """ + return int((float(timestamp) * 0.9375)) + + +def get_normalize_diff(engine, kana: str, f0: np.ndarray, speaker_id: int): + f0_avg = _no_nan(np.average(f0[f0 != 0])) + predicted_phrases = parse_kana(kana) + engine.replace_mora_data(predicted_phrases, speaker_id=speaker_id) + pitch_list = [] + for phrase in predicted_phrases: + for mora in phrase.moras: + pitch_list.append(mora.pitch) + pitch_list = np.array(pitch_list, dtype=np.float64) + predicted_avg = _no_nan(np.average(pitch_list[pitch_list != 0])) + return predicted_avg - f0_avg + + +def _no_nan(num): + return 0.0 if np.isnan(num) else num + + +def extract_guided_feature(audio_file: IO, kana: str): + _lazy_init() + sr, wave = wavfile.read(audio_file) + # stereo to mono + if len(wave.shape) == 2: + wave = wave.sum(axis=1) / 2 + + f0 = extract_f0(wave, sr, 256 / 24000 * 1000) + + julius_wave = resample(wave, JULIUS_SAMPLE_RATE * len(wave) // sr) + + # normalization for different WAV format + if julius_wave.dtype == "float32": + julius_wave *= 32767 + if julius_wave.dtype == "int32": + julius_wave = np.floor_divide(julius_wave, 2147483392 / 32767) + if julius_wave.dtype == "uint8": + # floor of 32767 / 255 + julius_wave *= 128 + + julius_wave = julius_wave.astype(np.int16) + + julius_kana = re.sub( + "|".join(PUNCTUATION), "", 
kana.replace("/", "").replace("、", " ") + ) + + phones = forced_align(julius_wave, julius_kana) + return f0, phones + + +def forced_align(julius_wave: np.ndarray, base_kata_text: str): + model_type = ModelType.gmm + hmm_model = os.path.join( + JULIUS_DICTATION_DIR, "model/phone_m/jnas-mono-16mix-gid.binhmm" + ) + options = [] + + base_kata_text = sp_inserter.kata2hira(base_kata_text) + + julius_phones = [converter.conv2openjtalk(hira) for hira in base_kata_text.split()] + + base_kan_text = ["sym_{}".format(i) for i in range(len(julius_phones))] + + assert len(base_kan_text) == len(julius_phones), f"{base_kan_text}\n{julius_phones}" + + dict_1st = sp_inserter.gen_julius_dict_1st(base_kan_text, julius_phones, model_type) + dfa_1st = sp_inserter.gen_julius_dfa(dict_1st.count("\n")) + + with open("first_pass.dict", "w", encoding="utf-8") as f: + f.write(dict_1st) + + with open("first_pass.dfa", "w", encoding="utf-8") as f: + f.write(dfa_1st) + wavfile.write(TMP_PATH, JULIUS_SAMPLE_RATE, julius_wave) + + raw_first_output = sp_inserter.julius_sp_insert( + TMP_PATH, + "first_pass", + hmm_model, + model_type, + options, + ) + + forced_phones_with_sp = [] + try: + _, sp_position = sp_inserter.get_sp_inserted_text(raw_first_output) + + for j, (_t, p) in enumerate(zip(base_kan_text, julius_phones)): + forced_phones_with_sp.append(p) + if j in sp_position: + forced_phones_with_sp.append(space_symbols[model_type]) + + forced_phones_with_sp = " ".join(forced_phones_with_sp) + except Exception: + pass + + phones_with_sp = sp_inserter.get_sp_inserterd_phone_seqence( + raw_first_output, model_type + ) + if len(phones_with_sp) < 2: + forced_phones_with_sp = phones_with_sp + + dict_2nd = sp_inserter.gen_julius_dict_2nd(forced_phones_with_sp, model_type) + dfa_2nd = sp_inserter.gen_julius_aliment_dfa(dict_2nd.count("\n")) + + with open("second_pass.dict", "w") as f: + f.write(dict_2nd) + + with open("second_pass.dfa", "w") as f: + f.write(dfa_2nd) + + raw_second_output = sp_inserter.julius_phone_alignment( + TMP_PATH, "second_pass", hmm_model, model_type, options + ) + time_alimented_list = sp_inserter.get_time_alimented_list(raw_second_output) + + assert len(time_alimented_list) > 0, raw_second_output + + for file in TEMP_FILE_LIST: + os.remove(file) + + return time_alimented_list + + +def extract_f0(wave: np.ndarray, sr: int, frame_period: float): + w = wave.astype(np.float64) + f0, t = pw.harvest(w, sr, frame_period=frame_period) + vuv = f0 != 0 + f0_log = np.zeros_like(f0) + f0_log[vuv] = np.log(f0[vuv]) + return f0_log diff --git a/voicevox_engine/experimental/julius4seg/__init__.py b/voicevox_engine/experimental/julius4seg/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/voicevox_engine/experimental/julius4seg/converter.py b/voicevox_engine/experimental/julius4seg/converter.py new file mode 100644 index 000000000..efc596cad --- /dev/null +++ b/voicevox_engine/experimental/julius4seg/converter.py @@ -0,0 +1,319 @@ +import re + + +def conv2julius(s: str) -> str: + """入力の単語の読み(ひらがな)をJuliusの音素列に変換 + args: + kana(str): カタカナ文字列 + "やきにく" + returns: + (str): ひらがな文字列 + " y a k i n i k u" + """ + s = s.replace("あぁ", " a a") + s = s.replace("いぃ", " i i") + s = s.replace("いぇ", " i e") + s = s.replace("いゃ", " y a") + s = s.replace("うぁ", " u a") + s = s.replace("うぃ", " w i") + s = s.replace("うぅ", " u:") + s = s.replace("うぇ", " w e") + s = s.replace("うぉ", " w o") + s = s.replace("えぇ", " e e") + s = s.replace("おぉ", " o:") + s = s.replace("かぁ", " k a:") + s = s.replace("がぁ", " g a:") + s = 
s.replace("きぃ", " k i:") + s = s.replace("きぇ", " ky e") + s = s.replace("きゃ", " ky a") + s = s.replace("きゅ", " ky u") + s = s.replace("きょ", " ky o") + s = s.replace("ぎぃ", " g i:") + s = s.replace("ぎぇ", " gy e") + s = s.replace("ぎゃ", " gy a") + s = s.replace("ぎゅ", " gy u") + s = s.replace("ぎょ", " gy o") + s = s.replace("くぅ", " k u:") + s = s.replace("くゃ", " ky a") + s = s.replace("くゅ", " ky u") + s = s.replace("くょ", " ky o") + s = s.replace("ぐぅ", " g u:") + s = s.replace("ぐゃ", " gy a") + s = s.replace("ぐゅ", " gy u") + s = s.replace("ぐょ", " gy o") + s = s.replace("けぇ", " k e:") + s = s.replace("げぇ", " g e:") + s = s.replace("こぉ", " k o:") + s = s.replace("ごぉ", " g o:") + s = s.replace("さぁ", " s a:") + s = s.replace("ざぁ", " z a:") + s = s.replace("しぃ", " sh i:") + s = s.replace("しぇ", " sh e") + s = s.replace("しゃ", " sh a") + s = s.replace("しゅ", " sh u") + s = s.replace("しょ", " sh o") + s = s.replace("じぃ", " j i:") + s = s.replace("じぇ", " j e") + s = s.replace("じゃ", " j a") + s = s.replace("じゅ", " j u") + s = s.replace("じょ", " j o") + s = s.replace("すぃ", " s i") + s = s.replace("すぅ", " s u:") + s = s.replace("すゃ", " sh a") + s = s.replace("すゅ", " sh u") + s = s.replace("すょ", " sh o") + s = s.replace("ずぁ", " z u a") + s = s.replace("ずぃ", " z i") + s = s.replace("ずぅ", " z u") + s = s.replace("ずぅ", " z u:") + s = s.replace("ずぇ", " z e") + s = s.replace("ずぉ", " z o") + s = s.replace("ずゃ", " zy a") + s = s.replace("ずゃ", " zy a") + s = s.replace("ずゅ", " zy u") + s = s.replace("ずゅ", " zy u") + s = s.replace("ずょ", " zy o") + s = s.replace("ずょ", " zy o") + s = s.replace("せぇ", " s e:") + s = s.replace("ぜぇ", " z e:") + s = s.replace("そぉ", " s o:") + s = s.replace("ぞぉ", " z o:") + s = s.replace("たぁ", " t a:") + s = s.replace("だぁ", " d a:") + s = s.replace("ちぃ", " ch i:") + s = s.replace("ちぇ", " ch e") + s = s.replace("ちゃ", " ch a") + s = s.replace("ちゅ", " ch u") + s = s.replace("ちょ", " ch o") + s = s.replace("ぢぃ", " j i:") + s = s.replace("ぢぇ", " j e") + s = s.replace("ぢゃ", " j a") + s = s.replace("ぢゅ", " j u") + s = s.replace("ぢょ", " j o") + s = s.replace("つぁ", " ts a") + s = s.replace("つぃ", " ts i") + s = s.replace("つぅ", " ts u:") + s = s.replace("つぇ", " ts e") + s = s.replace("つぉ", " ts o") + s = s.replace("つゃ", " ch a") + s = s.replace("つゅ", " ch u") + s = s.replace("つょ", " ch o") + s = s.replace("づぅ", " d u:") + s = s.replace("づゃ", " zy a") + s = s.replace("づゅ", " zy u") + s = s.replace("づょ", " zy o") + s = s.replace("てぃ", " t i") + s = s.replace("てぇ", " t e:") + s = s.replace("てぇ", " t e:") + s = s.replace("てゃ", " t a") + s = s.replace("てゅ", " t u") + s = s.replace("てょ", " t o") + s = s.replace("でぃ", " d i") + s = s.replace("でぇ", " d e:") + s = s.replace("でぇ", " d e:") + s = s.replace("でゃ", " d a") + s = s.replace("でゅ", " d u") + s = s.replace("でょ", " d o") + s = s.replace("とぅ", " t u") + s = s.replace("とぉ", " t o:") + s = s.replace("とゃ", " t a") + s = s.replace("とゅ", " t u") + s = s.replace("とょ", " t o") + s = s.replace("どぁ", " d o a") + s = s.replace("どぅ", " d u") + s = s.replace("どぉ", " d o:") + s = s.replace("どぉ", " d o:") + s = s.replace("どゃ", " d a") + s = s.replace("どゅ", " d u") + s = s.replace("どょ", " d o") + s = s.replace("なぁ", " n a:") + s = s.replace("にぃ", " n i:") + s = s.replace("にぇ", " ny e") + s = s.replace("にゃ", " ny a") + s = s.replace("にゅ", " ny u") + s = s.replace("にょ", " ny o") + s = s.replace("ぬぅ", " n u:") + s = s.replace("ぬゃ", " ny a") + s = s.replace("ぬゅ", " ny u") + s = s.replace("ぬょ", " ny o") + s = s.replace("ねぇ", " n e:") + s = s.replace("のぉ", " n o:") + s = 
s.replace("はぁ", " h a:") + s = s.replace("ばぁ", " b a:") + s = s.replace("ぱぁ", " p a:") + s = s.replace("ひぃ", " h i:") + s = s.replace("ひぇ", " hy e") + s = s.replace("ひゃ", " hy a") + s = s.replace("ひゅ", " hy u") + s = s.replace("ひょ", " hy o") + s = s.replace("びぃ", " b i:") + s = s.replace("びぇ", " by e") + s = s.replace("びゃ", " by a") + s = s.replace("びゅ", " by u") + s = s.replace("びょ", " by o") + s = s.replace("ぴぃ", " p i:") + s = s.replace("ぴぇ", " py e") + s = s.replace("ぴゃ", " py a") + s = s.replace("ぴゅ", " py u") + s = s.replace("ぴょ", " py o") + s = s.replace("ふぁ", " f a") + s = s.replace("ふぃ", " f i") + s = s.replace("ふぅ", " f u") + s = s.replace("ふぅ", " f u:") + s = s.replace("ふぇ", " f e") + s = s.replace("ふぉ", " f o") + s = s.replace("ふゃ", " hy a") + s = s.replace("ふゃ", " hy a") + s = s.replace("ふゃ", " hy a") + s = s.replace("ふゅ", " hy u") + s = s.replace("ふゅ", " hy u") + s = s.replace("ふょ", " hy o") + s = s.replace("ふょ", " hy o") + s = s.replace("ふょ", " hy o") + s = s.replace("ぶぅ", " b u:") + s = s.replace("ぶゅ", " by u") + s = s.replace("ぷぅ", " p u:") + s = s.replace("ぷゃ", " py a") + s = s.replace("ぷゅ", " py u") + s = s.replace("ぷょ", " py o") + s = s.replace("へぇ", " h e:") + s = s.replace("べぇ", " b e:") + s = s.replace("ぺぇ", " p e:") + s = s.replace("ほぉ", " h o:") + s = s.replace("ぼぉ", " b o:") + s = s.replace("ぽぉ", " p o:") + s = s.replace("まぁ", " m a:") + s = s.replace("みぃ", " m i:") + s = s.replace("みぇ", " my e") + s = s.replace("みゃ", " my a") + s = s.replace("みゅ", " my u") + s = s.replace("みょ", " my o") + s = s.replace("むぅ", " m u:") + s = s.replace("むゃ", " my a") + s = s.replace("むゅ", " my u") + s = s.replace("むょ", " my o") + s = s.replace("めぇ", " m e:") + s = s.replace("もぉ", " m o:") + s = s.replace("やぁ", " y a:") + s = s.replace("ゆぅ", " y u:") + s = s.replace("ゆゃ", " y a:") + s = s.replace("ゆゅ", " y u:") + s = s.replace("ゆょ", " y o:") + s = s.replace("よぉ", " y o:") + s = s.replace("らぁ", " r a:") + s = s.replace("りぃ", " r i:") + s = s.replace("りぇ", " ry e") + s = s.replace("りゃ", " ry a") + s = s.replace("りゅ", " ry u") + s = s.replace("りょ", " ry o") + s = s.replace("るぅ", " r u:") + s = s.replace("るゃ", " ry a") + s = s.replace("るゅ", " ry u") + s = s.replace("るょ", " ry o") + s = s.replace("れぇ", " r e:") + s = s.replace("ろぉ", " r o:") + s = s.replace("わぁ", " w a:") + s = s.replace("をぉ", " o:") + s = s.replace("ゔぁ", " b a") + s = s.replace("ゔぃ", " b i") + s = s.replace("ゔぇ", " b e") + s = s.replace("ゔぉ", " b o") + s = s.replace("ゔゅ", " by u") + + # 1音からなる変換規則 + s = s.replace("あ", " a") + s = s.replace("い", " i") + s = s.replace("う", " u") + s = s.replace("え", " e") + s = s.replace("お", " o") + s = s.replace("か", " k a") + s = s.replace("き", " k i") + s = s.replace("く", " k u") + s = s.replace("け", " k e") + s = s.replace("こ", " k o") + s = s.replace("さ", " s a") + s = s.replace("し", " sh i") + s = s.replace("す", " s u") + s = s.replace("せ", " s e") + s = s.replace("そ", " s o") + s = s.replace("た", " t a") + s = s.replace("ち", " ch i") + s = s.replace("つ", " ts u") + s = s.replace("て", " t e") + s = s.replace("と", " t o") + s = s.replace("な", " n a") + s = s.replace("に", " n i") + s = s.replace("ぬ", " n u") + s = s.replace("ね", " n e") + s = s.replace("の", " n o") + s = s.replace("は", " h a") + s = s.replace("ひ", " h i") + s = s.replace("ふ", " f u") + s = s.replace("へ", " h e") + s = s.replace("ほ", " h o") + s = s.replace("ま", " m a") + s = s.replace("み", " m i") + s = s.replace("む", " m u") + s = s.replace("め", " m e") + s = s.replace("も", " m o") + s = s.replace("ら", " r a") + s = 
s.replace("り", " r i") + s = s.replace("る", " r u") + s = s.replace("れ", " r e") + s = s.replace("ろ", " r o") + s = s.replace("が", " g a") + s = s.replace("ぎ", " g i") + s = s.replace("ぐ", " g u") + s = s.replace("げ", " g e") + s = s.replace("ご", " g o") + s = s.replace("ざ", " z a") + s = s.replace("じ", " j i") + s = s.replace("ず", " z u") + s = s.replace("ぜ", " z e") + s = s.replace("ぞ", " z o") + s = s.replace("だ", " d a") + s = s.replace("ぢ", " j i") + s = s.replace("づ", " z u") + s = s.replace("で", " d e") + s = s.replace("ど", " d o") + s = s.replace("ば", " b a") + s = s.replace("び", " b i") + s = s.replace("ぶ", " b u") + s = s.replace("べ", " b e") + s = s.replace("ぼ", " b o") + s = s.replace("ぱ", " p a") + s = s.replace("ぴ", " p i") + s = s.replace("ぷ", " p u") + s = s.replace("ぺ", " p e") + s = s.replace("ぽ", " p o") + s = s.replace("や", " y a") + s = s.replace("ゆ", " y u") + s = s.replace("よ", " y o") + s = s.replace("わ", " w a") + s = s.replace("を", " o") + s = s.replace("ん", " N") + s = s.replace("っ", " q") + s = s.replace("ー", ":") + s = s.replace("ゔ", " b u") + + s = s.replace("ぁ", " a") + s = s.replace("ぃ", " i") + s = s.replace("ぅ", " u") + s = s.replace("ぇ", " e") + s = s.replace("ぉ", " o") + s = s.replace("ゎ", " w a") + + s = s[1:] + + s = re.sub(r":+", ":", s) + + return s + + +def conv2openjtalk(s: str) -> str: + """入力の単語の読み(ひらがな)をOpenJTalkのような音素列に変換""" + s = conv2julius(s) + + s = re.sub(r"(.):", r"\1 \1", s) + + return s diff --git a/voicevox_engine/experimental/julius4seg/sp_inserter.py b/voicevox_engine/experimental/julius4seg/sp_inserter.py new file mode 100644 index 000000000..cbc94dd35 --- /dev/null +++ b/voicevox_engine/experimental/julius4seg/sp_inserter.py @@ -0,0 +1,353 @@ +import re +import subprocess +import sys +from enum import Enum +from itertools import chain +from typing import List, Optional, Tuple + + +class ModelType(str, Enum): + gmm = "gmm" + dnn = "dnn" + + +JULIUS_ROOT = "." 
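As a quick illustration of the converters above, the snippet below traces a katakana reading through `kata2hira` and the two conversion functions. The expected outputs are hand-derived from the replacement tables, so treat them as assumptions rather than verified results.

```python
from voicevox_engine.experimental.julius4seg import converter, sp_inserter

hira = sp_inserter.kata2hira("ラーメン")   # katakana reading -> "らーめん"
print(converter.conv2julius(hira))         # Julius phonemes, e.g. "r a: m e N"
print(converter.conv2openjtalk(hira))      # long vowel expanded: "r a a m e N"
```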
+ +begin_silent_symbols = {ModelType.gmm: "silB", ModelType.dnn: "sp_B"} +end_silent_symbols = {ModelType.gmm: "silE", ModelType.dnn: "sp_E"} +space_symbols = {ModelType.gmm: "sp", ModelType.dnn: "sp_S"} + + +def get_os_dependent_directory() -> str: + """Juluis Segmentaion-Kitのディレクトリ名をOSの種類から取得 + returns: + (str): OS依存のパスの一部 + """ + if sys.platform.startswith("win") or sys.platform.startswith("cygwin"): + return "windows" + elif sys.platform.startswith("darwin"): + return "osx" + elif sys.platform.startswith("linux"): + return "linux" + + +def get_os_dependent_exec() -> str: + """Juliusの実行ファイル名を取得 + returns: + (str): Juliusの実行ファイル名 + """ + if sys.platform.startswith("win") or sys.platform.startswith("cygwin"): + return "julius.exe" + else: + return "julius" + + +def get_os_dependent_echo(filename: str) -> list: + """Get parameters of echo referencing platforms + Returns: + list[str]: echo parameters + """ + if sys.platform.startswith("win") or sys.platform.startswith("cygwin"): + return ["cmd.exe", "/c", "echo " + filename] + else: + return ["echo", filename] + + +def kata2hira(kana: str) -> str: + """ヵ,ヶ以外のカタカナをひらがなに変換 + args: + kana(str): カタカナ文字列 + "ヤキニク" + returns: + (str): ひらがな文字列 + "やきにく" + """ + return "".join( + [ + chr(ord(c) + ord("あ") - ord("ア")) if ord("ァ") <= ord(c) <= ord("ヴ") else c + for c in kana + ] + ) + + +def gen_julius_dict_1st( + text_symbols: List[str], word_phones: List[str], model_type: ModelType +) -> str: + """テキストのシンボルと読みの音素のJulius dictファイルの中身を生成 + args: + text_symbols ([str]): 単語のシンボル + ['今回', 'は'] + word_phones ([str]): 単語の音素系列 + ['k o N k a i', 'w a'] + returns: + (str): Juliusのdictファイルの中身 + """ + tmp = [] + finit = len(text_symbols) + + for i, zipped in enumerate(zip(text_symbols, word_phones)): + tmp.append("{}\t[{}]\t{}".format(i * 2, *zipped)) + if i + 1 != finit: + tmp.append( + "{}\t[{}]\t{}".format( + i * 2 + 1, "sp_{}".format(i), space_symbols[model_type] + ) + ) + + # append sp and Start, End symbol + tmp.append( + "{}\t[{}]\t{}".format(i * 2 + 1, "", begin_silent_symbols[model_type]) + ) + tmp.append( + "{}\t[{}]\t{}".format((i + 1) * 2, "", end_silent_symbols[model_type]) + ) + + return "\n".join(tmp) + "\n" + + +def gen_julius_dfa(number_of_words: int) -> str: + """単語数から遷移のためのJuliusのdfaファイルの中身を生成 + args: + number_of_words (int): 遷移する単語の単語数 + returns: + (str): Juliusのdfaファイルの中身 + """ + i = 0 + current_word = number_of_words - 3 + isLast = False + tmp = [] + while True: + if i == 0: + tmp.append("{} {} {} {} {}".format(i, number_of_words - 1, i + 1, 0, 1)) + i += 1 + elif i > 0 and not isLast: + tmp.append("{} {} {} {} {}".format(i, current_word, i + 1, 0, 0)) + current_word -= 1 + isLast = current_word == -1 + i += 1 + elif i > 0 and isLast: + tmp.append("{} {} {} {} {}".format(i, i - 1, i + 1, 0, 0)) + tmp.append("{} {} {} {} {}".format(i + 1, -1, -1, 1, 0)) + break + + return "\n".join(tmp) + "\n" + + +def gen_julius_dict_2nd(phone_seqence: str, model_type: ModelType) -> str: + """音素系列から強制アライメントのためのdictファイルの中身を生成 + args: + phone_seqence (str): + 'k o N k a i w a ' + returns: + (str): Juliusのdictファイルの中身 + """ + phone_seqences = phone_seqence.split(f" {space_symbols[model_type]} ") + return ( + "\n".join( + [ + f"{i}\t[w_{i}]\t" + + phone_seqence + + ( + f" {space_symbols[model_type]}" + if i != len(phone_seqences) - 1 + else "" + ) + for i, phone_seqence in enumerate(phone_seqences) + ] + + [ + f"{len(phone_seqences)}\t[w_{len(phone_seqences)}]\t" + + begin_silent_symbols[model_type] + ] + + [ + f"{len(phone_seqences) + 1}\t[w_{len(phone_seqences) 
+ 1}]\t" + + end_silent_symbols[model_type] + ] + ) + + "\n" + ) + + +def gen_julius_aliment_dfa(number_of_words: int) -> str: + """強制アライメント用のdfaファイルの中身を生成 + returns: + (str): Juliusのdfaファイルの中身 + """ + return gen_julius_dfa(number_of_words) + + +def julius_sp_insert( + target_wav_file: str, + aliment_file_signiture: str, + model_path: str, + model_type: ModelType, + options: Optional[List[str]], +) -> List[str]: + if options is None: + options = [] + + julius_args = { + "-h": model_path, + "-input": "file", + "-debug": "", + "-gram": aliment_file_signiture, + "-nostrip": "", + "-spmodel": space_symbols[model_type], + } + + file_echo_p = subprocess.Popen( + get_os_dependent_echo(target_wav_file), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + julius_p = subprocess.Popen( + " ".join( + [ + str( + JULIUS_ROOT + / "bin" + / get_os_dependent_directory() + / get_os_dependent_exec() + ), + *list(chain.from_iterable([[k, v] for k, v in julius_args.items()])), + ] + + options + ).split(), + stdin=file_echo_p.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + file_echo_p.stdout.close() + return julius_p.communicate()[0].decode("utf-8").split("\n") + + +def get_sp_inserted_text(raw_output: List[str]) -> Tuple[str, List[int]] or None: + """デコード結果からsp挿入後のテキストとspのインデックスを取得する + args: + raw_output: `julius_sp_insert`の出力 + returns: + Tuple(str, [int]): デコード結果とspのindex + """ + r = re.compile(" (.*) ") + pass1_best = next(s for s in raw_output if s.startswith("pass1_best")) + matched = r.search(pass1_best) + if matched is None: + raise Exception("Decode Failed") + + return ( + re.sub(r"sp_[\d+]", "", matched.group(1)), + [int(s.split("_")[1]) for s in matched.group().split() if "sp_" in s], + ) + + +def get_sp_inserterd_phone_seqence(raw_output: List[str], model_type: ModelType) -> str: + try: + pass1_best_phonemeseq = next( + s.rstrip("\r") for s in raw_output if s.startswith("pass1_best_phonemeseq") + ) + except Exception as e: + raise (e) + + complete_re = re.compile( + begin_silent_symbols[model_type] + + r" \| (.*) \| " + + end_silent_symbols[model_type] + ) + failed_re_1 = re.compile( + end_silent_symbols[model_type] + + r" \| (.*) \| " + + begin_silent_symbols[model_type] + ) + failed_re_2 = re.compile(end_silent_symbols[model_type] + r" \| (.*)") + + if complete_re.search(pass1_best_phonemeseq) is not None: + matched = complete_re.search(pass1_best_phonemeseq) + elif failed_re_1.search(pass1_best_phonemeseq) is not None: + matched = failed_re_1.search(pass1_best_phonemeseq) + elif failed_re_2.search(pass1_best_phonemeseq) is not None: + matched = failed_re_2.search(pass1_best_phonemeseq) + else: + raise Exception("Decode Failed") + + tmp = matched.group(1) + return " ".join([s.strip() for s in tmp.split("|")]) + + +def julius_phone_alignment( + target_wav_file: str, + aliment_file_signiture: str, + model_path: str, + model_type: ModelType, + options: Optional[List[str]], +) -> List[str]: + if options is None: + options = [] + + julius_args = { + "-h": model_path, + "-palign": "", + "-input": "file", + "-gram": aliment_file_signiture, + "-nostrip": "", + "-n": "10", + "-s": "10000", + "-sb": "5000", + "-spmodel": space_symbols[model_type], + } + + file_echo_p = subprocess.Popen( + ["echo", target_wav_file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL + ) + julius_p = subprocess.Popen( + " ".join( + [ + str( + JULIUS_ROOT + / "bin" + / get_os_dependent_directory() + / get_os_dependent_exec() + ), + *list(chain.from_iterable([[k, v] for k, v in 
julius_args.items()])), + ] + + options + ).split(), + stdin=file_echo_p.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + file_echo_p.stdout.close() + return julius_p.communicate()[0].decode("utf-8").split("\n") + + +def get_time_alimented_list(raw_output: List[str]) -> List[Tuple[str, str, str]]: + r = re.compile( + r"\[\s*(\d+)\s+(\d+)\s*\]" + r"\s*[\-]*[\d,\.]+\s*" + r"\{?([\w,\:]+)\-?([\w,\:]*)\+?([\w,\:]*)\}?\[?[\w,\:,\-,\+]*\]?$" + ) + + def get_phoneme(left: str, center: str, right: str): + if len(center) == 0 and len(right) == 0: # monophone + return left + elif len(center) > 0: + return center + elif len(center) == 0: + return left + else: + raise ValueError(f"{left} {center} {right}") + + return [ + (s.group(1), s.group(2), get_phoneme(s.group(3), s.group(4), s.group(5))) + for s in map(lambda x: r.search(x.rstrip("\r")), raw_output) + if s is not None + ] + + +def frame_to_second(time_list: List[Tuple[str, str, str]]): + return [ + ( + f"{int(start) * 0.01 + (0.0125 if i > 0 else 0):.4f}", + f"{(int(end) + 1) * 0.01 + 0.0125:.4f}", + phoneme, + ) + for i, (start, end, phoneme) in enumerate(time_list) + ] diff --git a/voicevox_engine/experimental/julius4seg/sp_remover.py b/voicevox_engine/experimental/julius4seg/sp_remover.py new file mode 100644 index 000000000..50838c650 --- /dev/null +++ b/voicevox_engine/experimental/julius4seg/sp_remover.py @@ -0,0 +1,64 @@ +import struct +import wave +from typing import List + +# 有声音素を削らないためのマージン +MARGIN = 5 + + +def get_sp_segment(time_list: List[str]) -> List[List[int]]: + """音素セグメントリストから無音区間の部分のみを抽出 + args: + time_list ([str]): 音素セグメントリスト + returns: + [[int]]: 無音区間の初めと終わりのフレームのリスト + """ + sps = [ + list(map(int, s.split()[:2])) + for s in time_list + if "silB" in s or "silE" in s or "sp" in s + ] + return sps + + +def get_wav_sp_removed( + wav_file_name: str, + sp_segment: List[List[int]], + only_edge: bool = False, + start_margin: int = MARGIN, + end_margin: int = MARGIN, +) -> List[int]: + with wave.open(wav_file_name) as f: + n = f.getnframes() + data = struct.unpack("h" * n, f.readframes(n)) + + removed = [] + + seg_start = 0 + + if only_edge: + tmp = sp_segment[0][1] * 10 - start_margin + seg_start = tmp if tmp > 0 else sp_segment[0][0] * 10 + + tmp = sp_segment[-1][0] * 10 + end_margin + seg_end = tmp if tmp < sp_segment[-1][1] * 10 else sp_segment[-1][1] * 10 + + removed.extend( + data[int(seg_start / 1000 * 16000) : int(seg_end / 1000 * 16000)] + ) + else: + for i, seg in enumerate(sp_segment): + if i == 0: + seg_start = seg[1] * 10 - MARGIN # ms + continue + + seg_end = seg[0] * 10 + MARGIN + + removed.extend( + data[int(seg_start / 1000 * 16000) : int(seg_end / 1000 * 16000)] + ) + + if i != len(sp_segment) - 1: + seg_start = seg[1] * 10 - MARGIN + + return removed diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index add5cf6ef..fa54e387a 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -1,10 +1,21 @@ +from copy import deepcopy from itertools import chain from typing import List, Optional, Tuple +from typing.io import IO import numpy from scipy.signal import resample +from voicevox_engine.experimental.guided_extractor import ( + PhraseInfo, + extract_guided_feature, + get_normalize_diff, + resample_ts, +) +from voicevox_engine.experimental.julius4seg.sp_inserter import frame_to_second + from ..acoustic_feature_extractor import OjtPhoneme +from ..kana_parser 
import create_kana from ..model import AccentPhrase, AudioQuery, Mora from .synthesis_engine_base import SynthesisEngineBase @@ -476,3 +487,121 @@ def _synthesis_impl(self, query: AudioQuery, speaker_id: int): wave = numpy.array([wave, wave]).T return wave + + def guided_synthesis( + self, + query: AudioQuery, + speaker: int, + audio_file: IO, + normalize: int, + ): + f0, phonemes = extract_guided_feature(audio_file, query.kana) + + phone_list = numpy.zeros((len(f0), OjtPhoneme.num_phoneme), dtype=numpy.float32) + + for s, e, p in phonemes: + s, e = (resample_ts(v) for v in (s, e)) + if p == "silB": + f0[:e] = 0.0 + s += 1 + p = "pau" + elif p == "silE": + f0[s:] = 0.0 + p = "pau" + elif p == "sp": + f0[s:e] = 0.0 + p = "pau" + elif p == "q": + p = "cl" + phone_list[s - 1 : e] = OjtPhoneme(start=s, end=e, phoneme=p).onehot + + if normalize: + f0 += get_normalize_diff( + engine=self, kana=query.kana, f0=f0, speaker_id=speaker + ) + + f0 *= 2 ** query.pitchScale + f0[f0 > 6.5] = 6.5 + f0[(0 < f0) & (f0 < 3)] = 3.0 + + f0 = resample(f0, int(len(f0) / query.speedScale)) + phone_list = resample(phone_list, int(len(phone_list) / query.speedScale)) + + wave = self.decode_forwarder( + length=phone_list.shape[0], + phoneme_size=phone_list.shape[1], + f0=f0[:, numpy.newaxis].astype(numpy.float32), + phoneme=phone_list, + speaker_id=numpy.array([speaker], dtype=numpy.int64).reshape(-1), + ) + + if query.volumeScale != 1: + wave *= query.volumeScale + + if query.outputSamplingRate != self.default_sampling_rate: + wave = resample( + wave, + query.outputSamplingRate * len(wave) // self.default_sampling_rate, + ) + + if query.outputStereo: + wave = numpy.array([wave, wave]).T + + return wave + + def guided_accent_phrases( + self, + accent_phrases: List[AccentPhrase], + speaker: int, + audio_file: IO, + normalize: int, + ) -> List[AccentPhrase]: + kana = create_kana(accent_phrases=accent_phrases) + f0, phonemes = extract_guided_feature(audio_file, kana) + timed_phonemes = frame_to_second(deepcopy(phonemes)) + + phrase_info = [] + for ((s, e, p), (ts, te, _tp)) in zip(phonemes, timed_phonemes): + if p not in unvoiced_mora_phoneme_list: + clip = f0[resample_ts(s) : resample_ts(e)] + clip = clip[clip != 0] + pitch = numpy.average(clip) if len(clip) != 0 else 0 + else: + pitch = 0 + pitch = 0 if numpy.isnan(pitch) else pitch + length = float(te) - float(ts) + phrase_info.append(PhraseInfo(pitch, length, p)) + + if normalize: + normalize_diff = get_normalize_diff( + engine=self, kana=kana, f0=f0, speaker_id=speaker + ) + for p in phrase_info: + p.pitch += normalize_diff + + idx = 1 + for phrase in accent_phrases: + for mora in phrase.moras: + if mora.consonant is not None: + mora.pitch = ( + phrase_info[idx].pitch + phrase_info[idx + 1].pitch + ) / 2 + mora.consonant_length = phrase_info[idx].length + mora.vowel_length = phrase_info[idx + 1].length + idx += 2 + else: + mora.pitch = phrase_info[idx].pitch + mora.vowel_length = phrase_info[idx].length + idx += 1 + if phrase_info[idx].phoneme == "sp": + phrase.pause_mora = Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=phrase_info[idx].length, + pitch=0, + ) + idx += 1 + + return accent_phrases diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index ba567bd44..c84a213fe 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -1,6 +1,7 @@ import copy from abc 
import ABCMeta, abstractmethod from typing import List, Optional +from typing.io import IO from .. import full_context_label from ..full_context_label import extract_full_context_label @@ -224,3 +225,23 @@ def _synthesis_impl(self, query: AudioQuery, speaker_id: int): 音声合成結果 """ raise NotImplementedError() + + @abstractmethod + def guided_synthesis( + self, + query: AudioQuery, + speaker: int, + audio_file: IO, + normalize: int, + ): + raise NotImplementedError() + + @abstractmethod + def guided_accent_phrases( + self, + accent_phrases: List[AccentPhrase], + speaker: int, + audio_file: IO, + normalize: int, + ) -> List[AccentPhrase]: + raise NotImplementedError()
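Finally, a small arithmetic sketch of the timing conversions these modules rely on, assuming (as the comments in `guided_extractor.py` state) that julius4seg alignment indices are in 10 ms units while the decoder consumes 256-sample frames at 24 kHz:

```python
julius_unit_s = 0.01             # one julius4seg alignment index = 10 ms
decoder_frame_s = 256 / 24000    # one engine frame, roughly 10.667 ms

print(julius_unit_s / decoder_frame_s)       # 0.9375, the constant in resample_ts()

# A phoneme aligned to indices [12, 20) spans 80 ms of audio ...
print(int(12 * 0.9375), int(20 * 0.9375))    # ... i.e. decoder frames 11..18

# frame_to_second() reports it as about 0.1325 s .. 0.2225 s
# (the 12.5 ms offset is only added for non-initial phonemes)
print(12 * 0.01 + 0.0125, (20 + 1) * 0.01 + 0.0125)
```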