diff --git a/.gitignore b/.gitignore
index 9e8eea619..63a996ab7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,9 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
+
+# Guided Synthesis temp files
+/voicevox_engine/experimental/dictation-kit*
+first_pass*
+second_pass*
+tmp.wav
\ No newline at end of file
diff --git a/README.md b/README.md
index f228b39cd..a397c6f5a 100644
--- a/README.md
+++ b/README.md
@@ -167,6 +167,37 @@ curl -s \
> audio.wav
```
+### Guided Synthesis
+Currently there are two APIs that accept an uploaded audio file and return the corresponding synthesis information.
+Both work best when `is_kana` is set to `true` and the `kana` field of an `AudioQuery` is used.
+The kana text can also be obtained using the AquesTalk-style notation described above.
+```bash
+# Returns an audio file synthesized with reference to the uploaded audio
+# This example needs a recording whose content is
+# "また 東寺のように 五大明王と 呼ばれる 主要な 明王の 中央に 配されることも多い"
+
+curl -L -X POST 'localhost:50021/guided_synthesis' \
+ -F 'kana="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
+ -F 'speaker_id="5"' \
+ -F 'audio_file=@"/full_path_to_your_recording"' \
+ -F 'normalize="true"' \
+ -F 'stereo="true"' \
+ -F 'sample_rate="24000"' \
+ -F 'volume_scale="1"' \
+ -F 'pitch_scale="0"' \
+ -F 'speed_scale="1"'
+
+# Returns a list of AccentPhrases
+
+curl -L -X POST 'localhost:50021/guided_accent_phrase' \
+ -F 'text="マ'\''タ、ト'\''オジノヨオニ、ゴダイミョオオ'\''オト、ヨ'\''/バレ'\''ル、シュ'\''ヨオナ、ミョオ'\''オオ/ノ'\''、チュ'\''ウオオニ、ハイサレルコ'\''/トモ'\''オオイ"' \
+ -F 'speaker="5"' \
+ -F 'audio_file=@"/full_path_to_your_recording"' \
+ -F 'normalize="true"' \
+ -F 'is_kana="true"' \
+ -F 'enable_interrogative="false"'
+```
+
### 話者の追加情報を取得するサンプルコード
追加情報の中の portrait.png を取得するコードです。
diff --git a/run.py b/run.py
index 3dafb4816..d166827a7 100644
--- a/run.py
+++ b/run.py
@@ -17,7 +17,7 @@ import soundfile
import uvicorn
-from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.params import Query
from pydantic import ValidationError
@@ -215,6 +215,76 @@ def accent_phrases(
else:
return engine.create_accent_phrases(text, speaker_id=speaker)
+ @app.post(
+ "/guided_accent_phrase",
+ response_model=List[AccentPhrase],
+ tags=["クエリ編集"],
+ summary="Create Accent Phrase from External Audio",
+ )
+ def guided_accent_phrase(
+ text: str = Form(...), # noqa:B008
+ speaker: int = Form(...), # noqa:B008
+ is_kana: bool = Form(...), # noqa:B008
+ audio_file: UploadFile = File(...), # noqa: B008
+ normalize: bool = Form(...), # noqa:B008
+ core_version: Optional[str] = None,
+ ):
+ """
+ Extracts f0 and aligned phonemes, then calculates the average f0 for every phoneme.
+ Returns a list of AccentPhrase.
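For reference, a minimal Python client equivalent to the curl examples above might look like the sketch below. This is illustrative only: `requests` is assumed to be installed, the recording path and output filename are placeholders, and the engine must have been started with `--enable_guided_synthesis` (both endpoints otherwise return 404).

```python
import requests

KANA = "マ'タ、ト'オジノヨオニ、ゴダイミョオオ'オト、ヨ'/バレ'ル、シュ'ヨオナ、ミョオ'オオ/ノ'、チュ'ウオオニ、ハイサレルコ'/トモ'オオイ"
BASE = "http://localhost:50021"

with open("/full_path_to_your_recording", "rb") as wav:
    # Frame-level resynthesis guided by the uploaded recording
    synth = requests.post(
        f"{BASE}/guided_synthesis",
        data={
            "kana": KANA, "speaker_id": 5, "normalize": "true",
            "stereo": "true", "sample_rate": 24000,
            "volume_scale": 1, "pitch_scale": 0, "speed_scale": 1,
        },
        files={"audio_file": wav},
    )
    synth.raise_for_status()
    with open("guided.wav", "wb") as out:
        out.write(synth.content)

    # Phoneme-level accent phrases extracted from the same recording
    wav.seek(0)
    phrases = requests.post(
        f"{BASE}/guided_accent_phrase",
        data={"text": KANA, "speaker": 5, "is_kana": "true", "normalize": "true"},
        files={"audio_file": wav},
    )
    phrases.raise_for_status()
    print(phrases.json()[0]["moras"][0])
```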
+ **This API works in the resolution of phonemes.** + """ + if not args.enable_guided_synthesis: + raise HTTPException( + status_code=404, + detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。", + ) + engine = get_engine(core_version) + if is_kana: + try: + accent_phrases = parse_kana(text) + except ParseKanaError as err: + raise HTTPException( + status_code=400, + detail=ParseKanaBadRequest(err).dict(), + ) + else: + accent_phrases = engine.create_accent_phrases( + text, + speaker_id=speaker, + ) + + try: + return engine.guided_accent_phrases( + accent_phrases=accent_phrases, + speaker=speaker, + audio_file=audio_file.file, + normalize=normalize, + ) + except ParseKanaError as err: + raise HTTPException( + status_code=422, + detail=ParseKanaBadRequest(err).dict(), + ) + except StopIteration: + print(traceback.format_exc()) + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment", + ) + except Exception as e: + print(traceback.format_exc()) + if str(e) == "Decode Failed": + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment", + ) + else: + raise HTTPException( + status_code=500, + detail="Internal Server Error", + ) + @app.post( "/mora_data", response_model=List[AccentPhrase], @@ -366,7 +436,7 @@ def multi_synthesis( format="WAV", ) wav_file.seek(0) - zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read()) + zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read()) return FileResponse(f.name, media_type="application/zip") @@ -420,6 +490,90 @@ def _synthesis_morphing( return FileResponse(f.name, media_type="audio/wav") + @app.post( + "/guided_synthesis", + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["音声合成"], + summary="Audio synthesis guided by external audio and phonemes", + ) + def guided_synthesis( + kana: str = Form(...), # noqa: B008 + speaker_id: int = Form(...), # noqa: B008 + normalize: bool = Form(...), # noqa: B008 + audio_file: UploadFile = File(...), # noqa: B008 + stereo: bool = Form(...), # noqa: B008 + sample_rate: int = Form(...), # noqa: B008 + volume_scale: float = Form(...), # noqa: B008 + pitch_scale: float = Form(...), # noqa: B008 + speed_scale: float = Form(...), # noqa: B008 + core_version: Optional[str] = None, + ): + """ + Extracts and passes the f0 and aligned phonemes to engine. + Returns the synthesized audio. 
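Both guided endpoints lean on the existing AquesTalk-style kana notation, so it may help to see what `parse_kana` and `create_kana` (the helpers already used in this diff) do with it. A rough sketch with an assumed input string:

```python
from voicevox_engine.kana_parser import create_kana, parse_kana

# "'" marks the accent nucleus, "、" inserts a pause, "/" separates phrases.
accent_phrases = parse_kana("コンニチワ'、オンセイゴ'オセイデス")
for phrase in accent_phrases:
    print([mora.text for mora in phrase.moras], "accent:", phrase.accent)

# Round-trip back to the kana text that AudioQuery.kana carries.
print(create_kana(accent_phrases))
```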
+ **This API works in the resolution of frame.** + """ + if not args.enable_guided_synthesis: + raise HTTPException( + status_code=404, + detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。", + ) + engine = get_engine(core_version) + try: + accent_phrases = parse_kana(kana) + query = AudioQuery( + accent_phrases=accent_phrases, + speedScale=speed_scale, + pitchScale=pitch_scale, + intonationScale=1, + volumeScale=volume_scale, + prePhonemeLength=0.1, + postPhonemeLength=0.1, + outputSamplingRate=sample_rate, + outputStereo=stereo, + kana=kana, + ) + wave = engine.guided_synthesis( + audio_file=audio_file.file, + query=query, + speaker=speaker_id, + normalize=normalize, + ) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV") + + return FileResponse(f.name, media_type="audio/wav") + except ParseKanaError as err: + raise HTTPException( + status_code=400, + detail=ParseKanaBadRequest(err).dict(), + ) + except StopIteration: + print(traceback.format_exc()) + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment.", + ) + except Exception as e: + print(traceback.format_exc()) + if str(e) == "Decode Failed": + raise HTTPException( + status_code=500, + detail="Failed in Forced Alignment.", + ) + else: + raise HTTPException( + status_code=500, + detail="Internal Server Error.", + ) + @app.post( "/connect_waves", response_class=FileResponse, @@ -665,6 +819,7 @@ def supported_devices( parser.add_argument("--runtime_dir", type=Path, default=None, action="append") parser.add_argument("--enable_mock", action="store_true") parser.add_argument("--enable_cancellable_synthesis", action="store_true") + parser.add_argument("--enable_guided_synthesis", action="store_true") parser.add_argument("--init_processes", type=int, default=2) # 引数へcpu_num_threadsの指定がなければ、環境変数をロールします。 diff --git a/voicevox_engine/dev/synthesis_engine/mock.py b/voicevox_engine/dev/synthesis_engine/mock.py index ed710274f..d291aeea2 100644 --- a/voicevox_engine/dev/synthesis_engine/mock.py +++ b/voicevox_engine/dev/synthesis_engine/mock.py @@ -1,5 +1,6 @@ from logging import getLogger from typing import Any, Dict, List, Optional +from typing.io import IO import numpy as np from pyopenjtalk import tts @@ -130,3 +131,50 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray: wave, sr = tts(text) wave = resample(wave, 24000 * len(wave) // 48000) return wave + + def guided_synthesis( + self, + query: AudioQuery, + speaker: int, + audio_file: IO, + normalize: int, + ) -> np.ndarray: + """ + Open jtalk doesn't have a guided function [Mock] + simply calling mock synthesis + + Parameters + ---------- + query + speaker + audio_file + normalize + + Returns + ------- + + """ + return self.synthesis(query=query, speaker_id=speaker) + + def guided_accent_phrases( + self, + accent_phrases: List[AccentPhrase], + speaker: int, + audio_file: IO, + normalize: int, + ) -> List[AccentPhrase]: + """ + guided_accent_phrases 入力accent_phrasesを変更せずにそのまま返します [Mock] + + Parameters + ---------- + query + speaker + audio_file + normalize + + Returns + ------- + + """ + return accent_phrases diff --git a/voicevox_engine/experimental/__init__.py b/voicevox_engine/experimental/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/voicevox_engine/experimental/guided_extractor.py b/voicevox_engine/experimental/guided_extractor.py new file mode 100644 index 000000000..8e4c280d2 --- /dev/null +++ b/voicevox_engine/experimental/guided_extractor.py @@ 
-0,0 +1,206 @@ +import os +import re +import tarfile +from os.path import exists +from pathlib import PurePath +from typing.io import IO +from urllib.request import urlretrieve + +import numpy as np +import pkg_resources +import pyworld as pw +from scipy.io import wavfile +from scipy.signal import resample + +from voicevox_engine.experimental.julius4seg import converter, sp_inserter +from voicevox_engine.experimental.julius4seg.sp_inserter import ModelType, space_symbols +from voicevox_engine.kana_parser import parse_kana + +JULIUS_SAMPLE_RATE = 16000 +FRAME_PERIOD = 1.0 +PUNCTUATION = ["_", "'", "/", "、"] +SIL_SYMBOL = ["silB", "silE", "sp"] +TMP_PATH = "tmp.wav" +UUT_ID = "tmp" +TEMP_FILE_LIST = [ + "first_pass.dfa", + "first_pass.dict", + "second_pass.dfa", + "second_pass.dict", + "tmp.wav", +] + +_JULIUS_DICTATION_URL = "https://github.com/julius-speech/dictation-kit/archive/refs/tags/dictation-kit-v4.3.1.tar.gz" # noqa: B950 +JULIUS_DICTATION_DIR = os.environ.get( + "JULIUS_DICTATION_DIR", + # they did put two "dictation-kit"s in extracted folder name + pkg_resources.resource_filename(__name__, "dictation-kit-dictation-kit-v4.3.1"), +) + +sp_inserter.JULIUS_ROOT = PurePath(JULIUS_DICTATION_DIR) + + +class PhraseInfo: + def __init__(self, pitch: float, length: float, phoneme: str): + self.pitch = pitch + self.length = length + self.phoneme = phoneme + + +def _lazy_init(): + if not exists(JULIUS_DICTATION_DIR): + print("Julius not found, Downloading") + _extract_julius() + + +def _extract_julius(): + global JULIUS_DICTATION_DIR + filename = pkg_resources.resource_filename(__name__, "dictation-kit.tar.gz") + print("Downloading Julius...", _JULIUS_DICTATION_URL) + urlretrieve(_JULIUS_DICTATION_URL, filename) + print("Extracting Julius...", JULIUS_DICTATION_DIR) + with tarfile.open(filename, mode="r|gz") as f: + f.extractall(path=pkg_resources.resource_filename(__name__, "")) + JULIUS_DICTATION_DIR = pkg_resources.resource_filename( + __name__, "dictation-kit-dictation-kit-v4.3.1" + ) + sp_inserter.JULIUS_ROOT = PurePath(JULIUS_DICTATION_DIR) + os.remove(filename) + + +def resample_ts(timestamp: str): + """ + 0.9375 = 24000 / 256 / 1000 * 10 + 10 is for julius4seg produces timestamp in 10 ms + """ + return int((float(timestamp) * 0.9375)) + + +def get_normalize_diff(engine, kana: str, f0: np.ndarray, speaker_id: int): + f0_avg = _no_nan(np.average(f0[f0 != 0])) + predicted_phrases = parse_kana(kana) + engine.replace_mora_data(predicted_phrases, speaker_id=speaker_id) + pitch_list = [] + for phrase in predicted_phrases: + for mora in phrase.moras: + pitch_list.append(mora.pitch) + pitch_list = np.array(pitch_list, dtype=np.float64) + predicted_avg = _no_nan(np.average(pitch_list[pitch_list != 0])) + return predicted_avg - f0_avg + + +def _no_nan(num): + return 0.0 if np.isnan(num) else num + + +def extract_guided_feature(audio_file: IO, kana: str): + _lazy_init() + sr, wave = wavfile.read(audio_file) + # stereo to mono + if len(wave.shape) == 2: + wave = wave.sum(axis=1) / 2 + + f0 = extract_f0(wave, sr, 256 / 24000 * 1000) + + julius_wave = resample(wave, JULIUS_SAMPLE_RATE * len(wave) // sr) + + # normalization for different WAV format + if julius_wave.dtype == "float32": + julius_wave *= 32767 + if julius_wave.dtype == "int32": + julius_wave = np.floor_divide(julius_wave, 2147483392 / 32767) + if julius_wave.dtype == "uint8": + # floor of 32767 / 255 + julius_wave *= 128 + + julius_wave = julius_wave.astype(np.int16) + + julius_kana = re.sub( + "|".join(PUNCTUATION), "", 
kana.replace("/", "").replace("、", " ") + ) + + phones = forced_align(julius_wave, julius_kana) + return f0, phones + + +def forced_align(julius_wave: np.ndarray, base_kata_text: str): + model_type = ModelType.gmm + hmm_model = os.path.join( + JULIUS_DICTATION_DIR, "model/phone_m/jnas-mono-16mix-gid.binhmm" + ) + options = [] + + base_kata_text = sp_inserter.kata2hira(base_kata_text) + + julius_phones = [converter.conv2openjtalk(hira) for hira in base_kata_text.split()] + + base_kan_text = ["sym_{}".format(i) for i in range(len(julius_phones))] + + assert len(base_kan_text) == len(julius_phones), f"{base_kan_text}\n{julius_phones}" + + dict_1st = sp_inserter.gen_julius_dict_1st(base_kan_text, julius_phones, model_type) + dfa_1st = sp_inserter.gen_julius_dfa(dict_1st.count("\n")) + + with open("first_pass.dict", "w", encoding="utf-8") as f: + f.write(dict_1st) + + with open("first_pass.dfa", "w", encoding="utf-8") as f: + f.write(dfa_1st) + wavfile.write(TMP_PATH, JULIUS_SAMPLE_RATE, julius_wave) + + raw_first_output = sp_inserter.julius_sp_insert( + TMP_PATH, + "first_pass", + hmm_model, + model_type, + options, + ) + + forced_phones_with_sp = [] + try: + _, sp_position = sp_inserter.get_sp_inserted_text(raw_first_output) + + for j, (_t, p) in enumerate(zip(base_kan_text, julius_phones)): + forced_phones_with_sp.append(p) + if j in sp_position: + forced_phones_with_sp.append(space_symbols[model_type]) + + forced_phones_with_sp = " ".join(forced_phones_with_sp) + except Exception: + pass + + phones_with_sp = sp_inserter.get_sp_inserterd_phone_seqence( + raw_first_output, model_type + ) + if len(phones_with_sp) < 2: + forced_phones_with_sp = phones_with_sp + + dict_2nd = sp_inserter.gen_julius_dict_2nd(forced_phones_with_sp, model_type) + dfa_2nd = sp_inserter.gen_julius_aliment_dfa(dict_2nd.count("\n")) + + with open("second_pass.dict", "w") as f: + f.write(dict_2nd) + + with open("second_pass.dfa", "w") as f: + f.write(dfa_2nd) + + raw_second_output = sp_inserter.julius_phone_alignment( + TMP_PATH, "second_pass", hmm_model, model_type, options + ) + time_alimented_list = sp_inserter.get_time_alimented_list(raw_second_output) + + assert len(time_alimented_list) > 0, raw_second_output + + for file in TEMP_FILE_LIST: + os.remove(file) + + return time_alimented_list + + +def extract_f0(wave: np.ndarray, sr: int, frame_period: float): + w = wave.astype(np.float64) + f0, t = pw.harvest(w, sr, frame_period=frame_period) + vuv = f0 != 0 + f0_log = np.zeros_like(f0) + f0_log[vuv] = np.log(f0[vuv]) + return f0_log diff --git a/voicevox_engine/experimental/julius4seg/__init__.py b/voicevox_engine/experimental/julius4seg/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/voicevox_engine/experimental/julius4seg/converter.py b/voicevox_engine/experimental/julius4seg/converter.py new file mode 100644 index 000000000..efc596cad --- /dev/null +++ b/voicevox_engine/experimental/julius4seg/converter.py @@ -0,0 +1,319 @@ +import re + + +def conv2julius(s: str) -> str: + """入力の単語の読み(ひらがな)をJuliusの音素列に変換 + args: + kana(str): カタカナ文字列 + "やきにく" + returns: + (str): ひらがな文字列 + " y a k i n i k u" + """ + s = s.replace("あぁ", " a a") + s = s.replace("いぃ", " i i") + s = s.replace("いぇ", " i e") + s = s.replace("いゃ", " y a") + s = s.replace("うぁ", " u a") + s = s.replace("うぃ", " w i") + s = s.replace("うぅ", " u:") + s = s.replace("うぇ", " w e") + s = s.replace("うぉ", " w o") + s = s.replace("えぇ", " e e") + s = s.replace("おぉ", " o:") + s = s.replace("かぁ", " k a:") + s = s.replace("がぁ", " g a:") + s = 
s.replace("きぃ", " k i:") + s = s.replace("きぇ", " ky e") + s = s.replace("きゃ", " ky a") + s = s.replace("きゅ", " ky u") + s = s.replace("きょ", " ky o") + s = s.replace("ぎぃ", " g i:") + s = s.replace("ぎぇ", " gy e") + s = s.replace("ぎゃ", " gy a") + s = s.replace("ぎゅ", " gy u") + s = s.replace("ぎょ", " gy o") + s = s.replace("くぅ", " k u:") + s = s.replace("くゃ", " ky a") + s = s.replace("くゅ", " ky u") + s = s.replace("くょ", " ky o") + s = s.replace("ぐぅ", " g u:") + s = s.replace("ぐゃ", " gy a") + s = s.replace("ぐゅ", " gy u") + s = s.replace("ぐょ", " gy o") + s = s.replace("けぇ", " k e:") + s = s.replace("げぇ", " g e:") + s = s.replace("こぉ", " k o:") + s = s.replace("ごぉ", " g o:") + s = s.replace("さぁ", " s a:") + s = s.replace("ざぁ", " z a:") + s = s.replace("しぃ", " sh i:") + s = s.replace("しぇ", " sh e") + s = s.replace("しゃ", " sh a") + s = s.replace("しゅ", " sh u") + s = s.replace("しょ", " sh o") + s = s.replace("じぃ", " j i:") + s = s.replace("じぇ", " j e") + s = s.replace("じゃ", " j a") + s = s.replace("じゅ", " j u") + s = s.replace("じょ", " j o") + s = s.replace("すぃ", " s i") + s = s.replace("すぅ", " s u:") + s = s.replace("すゃ", " sh a") + s = s.replace("すゅ", " sh u") + s = s.replace("すょ", " sh o") + s = s.replace("ずぁ", " z u a") + s = s.replace("ずぃ", " z i") + s = s.replace("ずぅ", " z u") + s = s.replace("ずぅ", " z u:") + s = s.replace("ずぇ", " z e") + s = s.replace("ずぉ", " z o") + s = s.replace("ずゃ", " zy a") + s = s.replace("ずゃ", " zy a") + s = s.replace("ずゅ", " zy u") + s = s.replace("ずゅ", " zy u") + s = s.replace("ずょ", " zy o") + s = s.replace("ずょ", " zy o") + s = s.replace("せぇ", " s e:") + s = s.replace("ぜぇ", " z e:") + s = s.replace("そぉ", " s o:") + s = s.replace("ぞぉ", " z o:") + s = s.replace("たぁ", " t a:") + s = s.replace("だぁ", " d a:") + s = s.replace("ちぃ", " ch i:") + s = s.replace("ちぇ", " ch e") + s = s.replace("ちゃ", " ch a") + s = s.replace("ちゅ", " ch u") + s = s.replace("ちょ", " ch o") + s = s.replace("ぢぃ", " j i:") + s = s.replace("ぢぇ", " j e") + s = s.replace("ぢゃ", " j a") + s = s.replace("ぢゅ", " j u") + s = s.replace("ぢょ", " j o") + s = s.replace("つぁ", " ts a") + s = s.replace("つぃ", " ts i") + s = s.replace("つぅ", " ts u:") + s = s.replace("つぇ", " ts e") + s = s.replace("つぉ", " ts o") + s = s.replace("つゃ", " ch a") + s = s.replace("つゅ", " ch u") + s = s.replace("つょ", " ch o") + s = s.replace("づぅ", " d u:") + s = s.replace("づゃ", " zy a") + s = s.replace("づゅ", " zy u") + s = s.replace("づょ", " zy o") + s = s.replace("てぃ", " t i") + s = s.replace("てぇ", " t e:") + s = s.replace("てぇ", " t e:") + s = s.replace("てゃ", " t a") + s = s.replace("てゅ", " t u") + s = s.replace("てょ", " t o") + s = s.replace("でぃ", " d i") + s = s.replace("でぇ", " d e:") + s = s.replace("でぇ", " d e:") + s = s.replace("でゃ", " d a") + s = s.replace("でゅ", " d u") + s = s.replace("でょ", " d o") + s = s.replace("とぅ", " t u") + s = s.replace("とぉ", " t o:") + s = s.replace("とゃ", " t a") + s = s.replace("とゅ", " t u") + s = s.replace("とょ", " t o") + s = s.replace("どぁ", " d o a") + s = s.replace("どぅ", " d u") + s = s.replace("どぉ", " d o:") + s = s.replace("どぉ", " d o:") + s = s.replace("どゃ", " d a") + s = s.replace("どゅ", " d u") + s = s.replace("どょ", " d o") + s = s.replace("なぁ", " n a:") + s = s.replace("にぃ", " n i:") + s = s.replace("にぇ", " ny e") + s = s.replace("にゃ", " ny a") + s = s.replace("にゅ", " ny u") + s = s.replace("にょ", " ny o") + s = s.replace("ぬぅ", " n u:") + s = s.replace("ぬゃ", " ny a") + s = s.replace("ぬゅ", " ny u") + s = s.replace("ぬょ", " ny o") + s = s.replace("ねぇ", " n e:") + s = s.replace("のぉ", " n o:") + s = 
s.replace("はぁ", " h a:") + s = s.replace("ばぁ", " b a:") + s = s.replace("ぱぁ", " p a:") + s = s.replace("ひぃ", " h i:") + s = s.replace("ひぇ", " hy e") + s = s.replace("ひゃ", " hy a") + s = s.replace("ひゅ", " hy u") + s = s.replace("ひょ", " hy o") + s = s.replace("びぃ", " b i:") + s = s.replace("びぇ", " by e") + s = s.replace("びゃ", " by a") + s = s.replace("びゅ", " by u") + s = s.replace("びょ", " by o") + s = s.replace("ぴぃ", " p i:") + s = s.replace("ぴぇ", " py e") + s = s.replace("ぴゃ", " py a") + s = s.replace("ぴゅ", " py u") + s = s.replace("ぴょ", " py o") + s = s.replace("ふぁ", " f a") + s = s.replace("ふぃ", " f i") + s = s.replace("ふぅ", " f u") + s = s.replace("ふぅ", " f u:") + s = s.replace("ふぇ", " f e") + s = s.replace("ふぉ", " f o") + s = s.replace("ふゃ", " hy a") + s = s.replace("ふゃ", " hy a") + s = s.replace("ふゃ", " hy a") + s = s.replace("ふゅ", " hy u") + s = s.replace("ふゅ", " hy u") + s = s.replace("ふょ", " hy o") + s = s.replace("ふょ", " hy o") + s = s.replace("ふょ", " hy o") + s = s.replace("ぶぅ", " b u:") + s = s.replace("ぶゅ", " by u") + s = s.replace("ぷぅ", " p u:") + s = s.replace("ぷゃ", " py a") + s = s.replace("ぷゅ", " py u") + s = s.replace("ぷょ", " py o") + s = s.replace("へぇ", " h e:") + s = s.replace("べぇ", " b e:") + s = s.replace("ぺぇ", " p e:") + s = s.replace("ほぉ", " h o:") + s = s.replace("ぼぉ", " b o:") + s = s.replace("ぽぉ", " p o:") + s = s.replace("まぁ", " m a:") + s = s.replace("みぃ", " m i:") + s = s.replace("みぇ", " my e") + s = s.replace("みゃ", " my a") + s = s.replace("みゅ", " my u") + s = s.replace("みょ", " my o") + s = s.replace("むぅ", " m u:") + s = s.replace("むゃ", " my a") + s = s.replace("むゅ", " my u") + s = s.replace("むょ", " my o") + s = s.replace("めぇ", " m e:") + s = s.replace("もぉ", " m o:") + s = s.replace("やぁ", " y a:") + s = s.replace("ゆぅ", " y u:") + s = s.replace("ゆゃ", " y a:") + s = s.replace("ゆゅ", " y u:") + s = s.replace("ゆょ", " y o:") + s = s.replace("よぉ", " y o:") + s = s.replace("らぁ", " r a:") + s = s.replace("りぃ", " r i:") + s = s.replace("りぇ", " ry e") + s = s.replace("りゃ", " ry a") + s = s.replace("りゅ", " ry u") + s = s.replace("りょ", " ry o") + s = s.replace("るぅ", " r u:") + s = s.replace("るゃ", " ry a") + s = s.replace("るゅ", " ry u") + s = s.replace("るょ", " ry o") + s = s.replace("れぇ", " r e:") + s = s.replace("ろぉ", " r o:") + s = s.replace("わぁ", " w a:") + s = s.replace("をぉ", " o:") + s = s.replace("ゔぁ", " b a") + s = s.replace("ゔぃ", " b i") + s = s.replace("ゔぇ", " b e") + s = s.replace("ゔぉ", " b o") + s = s.replace("ゔゅ", " by u") + + # 1音からなる変換規則 + s = s.replace("あ", " a") + s = s.replace("い", " i") + s = s.replace("う", " u") + s = s.replace("え", " e") + s = s.replace("お", " o") + s = s.replace("か", " k a") + s = s.replace("き", " k i") + s = s.replace("く", " k u") + s = s.replace("け", " k e") + s = s.replace("こ", " k o") + s = s.replace("さ", " s a") + s = s.replace("し", " sh i") + s = s.replace("す", " s u") + s = s.replace("せ", " s e") + s = s.replace("そ", " s o") + s = s.replace("た", " t a") + s = s.replace("ち", " ch i") + s = s.replace("つ", " ts u") + s = s.replace("て", " t e") + s = s.replace("と", " t o") + s = s.replace("な", " n a") + s = s.replace("に", " n i") + s = s.replace("ぬ", " n u") + s = s.replace("ね", " n e") + s = s.replace("の", " n o") + s = s.replace("は", " h a") + s = s.replace("ひ", " h i") + s = s.replace("ふ", " f u") + s = s.replace("へ", " h e") + s = s.replace("ほ", " h o") + s = s.replace("ま", " m a") + s = s.replace("み", " m i") + s = s.replace("む", " m u") + s = s.replace("め", " m e") + s = s.replace("も", " m o") + s = s.replace("ら", " r a") + s = 
s.replace("り", " r i") + s = s.replace("る", " r u") + s = s.replace("れ", " r e") + s = s.replace("ろ", " r o") + s = s.replace("が", " g a") + s = s.replace("ぎ", " g i") + s = s.replace("ぐ", " g u") + s = s.replace("げ", " g e") + s = s.replace("ご", " g o") + s = s.replace("ざ", " z a") + s = s.replace("じ", " j i") + s = s.replace("ず", " z u") + s = s.replace("ぜ", " z e") + s = s.replace("ぞ", " z o") + s = s.replace("だ", " d a") + s = s.replace("ぢ", " j i") + s = s.replace("づ", " z u") + s = s.replace("で", " d e") + s = s.replace("ど", " d o") + s = s.replace("ば", " b a") + s = s.replace("び", " b i") + s = s.replace("ぶ", " b u") + s = s.replace("べ", " b e") + s = s.replace("ぼ", " b o") + s = s.replace("ぱ", " p a") + s = s.replace("ぴ", " p i") + s = s.replace("ぷ", " p u") + s = s.replace("ぺ", " p e") + s = s.replace("ぽ", " p o") + s = s.replace("や", " y a") + s = s.replace("ゆ", " y u") + s = s.replace("よ", " y o") + s = s.replace("わ", " w a") + s = s.replace("を", " o") + s = s.replace("ん", " N") + s = s.replace("っ", " q") + s = s.replace("ー", ":") + s = s.replace("ゔ", " b u") + + s = s.replace("ぁ", " a") + s = s.replace("ぃ", " i") + s = s.replace("ぅ", " u") + s = s.replace("ぇ", " e") + s = s.replace("ぉ", " o") + s = s.replace("ゎ", " w a") + + s = s[1:] + + s = re.sub(r":+", ":", s) + + return s + + +def conv2openjtalk(s: str) -> str: + """入力の単語の読み(ひらがな)をOpenJTalkのような音素列に変換""" + s = conv2julius(s) + + s = re.sub(r"(.):", r"\1 \1", s) + + return s diff --git a/voicevox_engine/experimental/julius4seg/sp_inserter.py b/voicevox_engine/experimental/julius4seg/sp_inserter.py new file mode 100644 index 000000000..cbc94dd35 --- /dev/null +++ b/voicevox_engine/experimental/julius4seg/sp_inserter.py @@ -0,0 +1,353 @@ +import re +import subprocess +import sys +from enum import Enum +from itertools import chain +from typing import List, Optional, Tuple + + +class ModelType(str, Enum): + gmm = "gmm" + dnn = "dnn" + + +JULIUS_ROOT = "." 
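As a quick illustration of the converters above, the snippet below traces a katakana reading through `kata2hira` and the two conversion functions. The expected outputs are hand-derived from the replacement tables, so treat them as assumptions rather than verified results.

```python
from voicevox_engine.experimental.julius4seg import converter, sp_inserter

hira = sp_inserter.kata2hira("ラーメン")   # katakana reading -> "らーめん"
print(converter.conv2julius(hira))         # Julius phonemes, e.g. "r a: m e N"
print(converter.conv2openjtalk(hira))      # long vowel expanded: "r a a m e N"
```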
+ +begin_silent_symbols = {ModelType.gmm: "silB", ModelType.dnn: "sp_B"} +end_silent_symbols = {ModelType.gmm: "silE", ModelType.dnn: "sp_E"} +space_symbols = {ModelType.gmm: "sp", ModelType.dnn: "sp_S"} + + +def get_os_dependent_directory() -> str: + """Juluis Segmentaion-Kitのディレクトリ名をOSの種類から取得 + returns: + (str): OS依存のパスの一部 + """ + if sys.platform.startswith("win") or sys.platform.startswith("cygwin"): + return "windows" + elif sys.platform.startswith("darwin"): + return "osx" + elif sys.platform.startswith("linux"): + return "linux" + + +def get_os_dependent_exec() -> str: + """Juliusの実行ファイル名を取得 + returns: + (str): Juliusの実行ファイル名 + """ + if sys.platform.startswith("win") or sys.platform.startswith("cygwin"): + return "julius.exe" + else: + return "julius" + + +def get_os_dependent_echo(filename: str) -> list: + """Get parameters of echo referencing platforms + Returns: + list[str]: echo parameters + """ + if sys.platform.startswith("win") or sys.platform.startswith("cygwin"): + return ["cmd.exe", "/c", "echo " + filename] + else: + return ["echo", filename] + + +def kata2hira(kana: str) -> str: + """ヵ,ヶ以外のカタカナをひらがなに変換 + args: + kana(str): カタカナ文字列 + "ヤキニク" + returns: + (str): ひらがな文字列 + "やきにく" + """ + return "".join( + [ + chr(ord(c) + ord("あ") - ord("ア")) if ord("ァ") <= ord(c) <= ord("ヴ") else c + for c in kana + ] + ) + + +def gen_julius_dict_1st( + text_symbols: List[str], word_phones: List[str], model_type: ModelType +) -> str: + """テキストのシンボルと読みの音素のJulius dictファイルの中身を生成 + args: + text_symbols ([str]): 単語のシンボル + ['今回', 'は'] + word_phones ([str]): 単語の音素系列 + ['k o N k a i', 'w a'] + returns: + (str): Juliusのdictファイルの中身 + """ + tmp = [] + finit = len(text_symbols) + + for i, zipped in enumerate(zip(text_symbols, word_phones)): + tmp.append("{}\t[{}]\t{}".format(i * 2, *zipped)) + if i + 1 != finit: + tmp.append( + "{}\t[{}]\t{}".format( + i * 2 + 1, "sp_{}".format(i), space_symbols[model_type] + ) + ) + + # append sp and Start, End symbol + tmp.append( + "{}\t[{}]\t{}".format(i * 2 + 1, "", begin_silent_symbols[model_type]) + ) + tmp.append( + "{}\t[{}]\t{}".format((i + 1) * 2, "", end_silent_symbols[model_type]) + ) + + return "\n".join(tmp) + "\n" + + +def gen_julius_dfa(number_of_words: int) -> str: + """単語数から遷移のためのJuliusのdfaファイルの中身を生成 + args: + number_of_words (int): 遷移する単語の単語数 + returns: + (str): Juliusのdfaファイルの中身 + """ + i = 0 + current_word = number_of_words - 3 + isLast = False + tmp = [] + while True: + if i == 0: + tmp.append("{} {} {} {} {}".format(i, number_of_words - 1, i + 1, 0, 1)) + i += 1 + elif i > 0 and not isLast: + tmp.append("{} {} {} {} {}".format(i, current_word, i + 1, 0, 0)) + current_word -= 1 + isLast = current_word == -1 + i += 1 + elif i > 0 and isLast: + tmp.append("{} {} {} {} {}".format(i, i - 1, i + 1, 0, 0)) + tmp.append("{} {} {} {} {}".format(i + 1, -1, -1, 1, 0)) + break + + return "\n".join(tmp) + "\n" + + +def gen_julius_dict_2nd(phone_seqence: str, model_type: ModelType) -> str: + """音素系列から強制アライメントのためのdictファイルの中身を生成 + args: + phone_seqence (str): + 'k o N k a i w a ' + returns: + (str): Juliusのdictファイルの中身 + """ + phone_seqences = phone_seqence.split(f" {space_symbols[model_type]} ") + return ( + "\n".join( + [ + f"{i}\t[w_{i}]\t" + + phone_seqence + + ( + f" {space_symbols[model_type]}" + if i != len(phone_seqences) - 1 + else "" + ) + for i, phone_seqence in enumerate(phone_seqences) + ] + + [ + f"{len(phone_seqences)}\t[w_{len(phone_seqences)}]\t" + + begin_silent_symbols[model_type] + ] + + [ + f"{len(phone_seqences) + 1}\t[w_{len(phone_seqences) 
+ 1}]\t" + + end_silent_symbols[model_type] + ] + ) + + "\n" + ) + + +def gen_julius_aliment_dfa(number_of_words: int) -> str: + """強制アライメント用のdfaファイルの中身を生成 + returns: + (str): Juliusのdfaファイルの中身 + """ + return gen_julius_dfa(number_of_words) + + +def julius_sp_insert( + target_wav_file: str, + aliment_file_signiture: str, + model_path: str, + model_type: ModelType, + options: Optional[List[str]], +) -> List[str]: + if options is None: + options = [] + + julius_args = { + "-h": model_path, + "-input": "file", + "-debug": "", + "-gram": aliment_file_signiture, + "-nostrip": "", + "-spmodel": space_symbols[model_type], + } + + file_echo_p = subprocess.Popen( + get_os_dependent_echo(target_wav_file), + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + julius_p = subprocess.Popen( + " ".join( + [ + str( + JULIUS_ROOT + / "bin" + / get_os_dependent_directory() + / get_os_dependent_exec() + ), + *list(chain.from_iterable([[k, v] for k, v in julius_args.items()])), + ] + + options + ).split(), + stdin=file_echo_p.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + file_echo_p.stdout.close() + return julius_p.communicate()[0].decode("utf-8").split("\n") + + +def get_sp_inserted_text(raw_output: List[str]) -> Tuple[str, List[int]] or None: + """デコード結果からsp挿入後のテキストとspのインデックスを取得する + args: + raw_output: `julius_sp_insert`の出力 + returns: + Tuple(str, [int]): デコード結果とspのindex + """ + r = re.compile(" (.*) ") + pass1_best = next(s for s in raw_output if s.startswith("pass1_best")) + matched = r.search(pass1_best) + if matched is None: + raise Exception("Decode Failed") + + return ( + re.sub(r"sp_[\d+]", "", matched.group(1)), + [int(s.split("_")[1]) for s in matched.group().split() if "sp_" in s], + ) + + +def get_sp_inserterd_phone_seqence(raw_output: List[str], model_type: ModelType) -> str: + try: + pass1_best_phonemeseq = next( + s.rstrip("\r") for s in raw_output if s.startswith("pass1_best_phonemeseq") + ) + except Exception as e: + raise (e) + + complete_re = re.compile( + begin_silent_symbols[model_type] + + r" \| (.*) \| " + + end_silent_symbols[model_type] + ) + failed_re_1 = re.compile( + end_silent_symbols[model_type] + + r" \| (.*) \| " + + begin_silent_symbols[model_type] + ) + failed_re_2 = re.compile(end_silent_symbols[model_type] + r" \| (.*)") + + if complete_re.search(pass1_best_phonemeseq) is not None: + matched = complete_re.search(pass1_best_phonemeseq) + elif failed_re_1.search(pass1_best_phonemeseq) is not None: + matched = failed_re_1.search(pass1_best_phonemeseq) + elif failed_re_2.search(pass1_best_phonemeseq) is not None: + matched = failed_re_2.search(pass1_best_phonemeseq) + else: + raise Exception("Decode Failed") + + tmp = matched.group(1) + return " ".join([s.strip() for s in tmp.split("|")]) + + +def julius_phone_alignment( + target_wav_file: str, + aliment_file_signiture: str, + model_path: str, + model_type: ModelType, + options: Optional[List[str]], +) -> List[str]: + if options is None: + options = [] + + julius_args = { + "-h": model_path, + "-palign": "", + "-input": "file", + "-gram": aliment_file_signiture, + "-nostrip": "", + "-n": "10", + "-s": "10000", + "-sb": "5000", + "-spmodel": space_symbols[model_type], + } + + file_echo_p = subprocess.Popen( + ["echo", target_wav_file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL + ) + julius_p = subprocess.Popen( + " ".join( + [ + str( + JULIUS_ROOT + / "bin" + / get_os_dependent_directory() + / get_os_dependent_exec() + ), + *list(chain.from_iterable([[k, v] for k, v in 
julius_args.items()])), + ] + + options + ).split(), + stdin=file_echo_p.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ) + file_echo_p.stdout.close() + return julius_p.communicate()[0].decode("utf-8").split("\n") + + +def get_time_alimented_list(raw_output: List[str]) -> List[Tuple[str, str, str]]: + r = re.compile( + r"\[\s*(\d+)\s+(\d+)\s*\]" + r"\s*[\-]*[\d,\.]+\s*" + r"\{?([\w,\:]+)\-?([\w,\:]*)\+?([\w,\:]*)\}?\[?[\w,\:,\-,\+]*\]?$" + ) + + def get_phoneme(left: str, center: str, right: str): + if len(center) == 0 and len(right) == 0: # monophone + return left + elif len(center) > 0: + return center + elif len(center) == 0: + return left + else: + raise ValueError(f"{left} {center} {right}") + + return [ + (s.group(1), s.group(2), get_phoneme(s.group(3), s.group(4), s.group(5))) + for s in map(lambda x: r.search(x.rstrip("\r")), raw_output) + if s is not None + ] + + +def frame_to_second(time_list: List[Tuple[str, str, str]]): + return [ + ( + f"{int(start) * 0.01 + (0.0125 if i > 0 else 0):.4f}", + f"{(int(end) + 1) * 0.01 + 0.0125:.4f}", + phoneme, + ) + for i, (start, end, phoneme) in enumerate(time_list) + ] diff --git a/voicevox_engine/experimental/julius4seg/sp_remover.py b/voicevox_engine/experimental/julius4seg/sp_remover.py new file mode 100644 index 000000000..50838c650 --- /dev/null +++ b/voicevox_engine/experimental/julius4seg/sp_remover.py @@ -0,0 +1,64 @@ +import struct +import wave +from typing import List + +# 有声音素を削らないためのマージン +MARGIN = 5 + + +def get_sp_segment(time_list: List[str]) -> List[List[int]]: + """音素セグメントリストから無音区間の部分のみを抽出 + args: + time_list ([str]): 音素セグメントリスト + returns: + [[int]]: 無音区間の初めと終わりのフレームのリスト + """ + sps = [ + list(map(int, s.split()[:2])) + for s in time_list + if "silB" in s or "silE" in s or "sp" in s + ] + return sps + + +def get_wav_sp_removed( + wav_file_name: str, + sp_segment: List[List[int]], + only_edge: bool = False, + start_margin: int = MARGIN, + end_margin: int = MARGIN, +) -> List[int]: + with wave.open(wav_file_name) as f: + n = f.getnframes() + data = struct.unpack("h" * n, f.readframes(n)) + + removed = [] + + seg_start = 0 + + if only_edge: + tmp = sp_segment[0][1] * 10 - start_margin + seg_start = tmp if tmp > 0 else sp_segment[0][0] * 10 + + tmp = sp_segment[-1][0] * 10 + end_margin + seg_end = tmp if tmp < sp_segment[-1][1] * 10 else sp_segment[-1][1] * 10 + + removed.extend( + data[int(seg_start / 1000 * 16000) : int(seg_end / 1000 * 16000)] + ) + else: + for i, seg in enumerate(sp_segment): + if i == 0: + seg_start = seg[1] * 10 - MARGIN # ms + continue + + seg_end = seg[0] * 10 + MARGIN + + removed.extend( + data[int(seg_start / 1000 * 16000) : int(seg_end / 1000 * 16000)] + ) + + if i != len(sp_segment) - 1: + seg_start = seg[1] * 10 - MARGIN + + return removed diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index add5cf6ef..fa54e387a 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -1,10 +1,21 @@ +from copy import deepcopy from itertools import chain from typing import List, Optional, Tuple +from typing.io import IO import numpy from scipy.signal import resample +from voicevox_engine.experimental.guided_extractor import ( + PhraseInfo, + extract_guided_feature, + get_normalize_diff, + resample_ts, +) +from voicevox_engine.experimental.julius4seg.sp_inserter import frame_to_second + from ..acoustic_feature_extractor import OjtPhoneme +from ..kana_parser 
import create_kana from ..model import AccentPhrase, AudioQuery, Mora from .synthesis_engine_base import SynthesisEngineBase @@ -476,3 +487,121 @@ def _synthesis_impl(self, query: AudioQuery, speaker_id: int): wave = numpy.array([wave, wave]).T return wave + + def guided_synthesis( + self, + query: AudioQuery, + speaker: int, + audio_file: IO, + normalize: int, + ): + f0, phonemes = extract_guided_feature(audio_file, query.kana) + + phone_list = numpy.zeros((len(f0), OjtPhoneme.num_phoneme), dtype=numpy.float32) + + for s, e, p in phonemes: + s, e = (resample_ts(v) for v in (s, e)) + if p == "silB": + f0[:e] = 0.0 + s += 1 + p = "pau" + elif p == "silE": + f0[s:] = 0.0 + p = "pau" + elif p == "sp": + f0[s:e] = 0.0 + p = "pau" + elif p == "q": + p = "cl" + phone_list[s - 1 : e] = OjtPhoneme(start=s, end=e, phoneme=p).onehot + + if normalize: + f0 += get_normalize_diff( + engine=self, kana=query.kana, f0=f0, speaker_id=speaker + ) + + f0 *= 2 ** query.pitchScale + f0[f0 > 6.5] = 6.5 + f0[(0 < f0) & (f0 < 3)] = 3.0 + + f0 = resample(f0, int(len(f0) / query.speedScale)) + phone_list = resample(phone_list, int(len(phone_list) / query.speedScale)) + + wave = self.decode_forwarder( + length=phone_list.shape[0], + phoneme_size=phone_list.shape[1], + f0=f0[:, numpy.newaxis].astype(numpy.float32), + phoneme=phone_list, + speaker_id=numpy.array([speaker], dtype=numpy.int64).reshape(-1), + ) + + if query.volumeScale != 1: + wave *= query.volumeScale + + if query.outputSamplingRate != self.default_sampling_rate: + wave = resample( + wave, + query.outputSamplingRate * len(wave) // self.default_sampling_rate, + ) + + if query.outputStereo: + wave = numpy.array([wave, wave]).T + + return wave + + def guided_accent_phrases( + self, + accent_phrases: List[AccentPhrase], + speaker: int, + audio_file: IO, + normalize: int, + ) -> List[AccentPhrase]: + kana = create_kana(accent_phrases=accent_phrases) + f0, phonemes = extract_guided_feature(audio_file, kana) + timed_phonemes = frame_to_second(deepcopy(phonemes)) + + phrase_info = [] + for ((s, e, p), (ts, te, _tp)) in zip(phonemes, timed_phonemes): + if p not in unvoiced_mora_phoneme_list: + clip = f0[resample_ts(s) : resample_ts(e)] + clip = clip[clip != 0] + pitch = numpy.average(clip) if len(clip) != 0 else 0 + else: + pitch = 0 + pitch = 0 if numpy.isnan(pitch) else pitch + length = float(te) - float(ts) + phrase_info.append(PhraseInfo(pitch, length, p)) + + if normalize: + normalize_diff = get_normalize_diff( + engine=self, kana=kana, f0=f0, speaker_id=speaker + ) + for p in phrase_info: + p.pitch += normalize_diff + + idx = 1 + for phrase in accent_phrases: + for mora in phrase.moras: + if mora.consonant is not None: + mora.pitch = ( + phrase_info[idx].pitch + phrase_info[idx + 1].pitch + ) / 2 + mora.consonant_length = phrase_info[idx].length + mora.vowel_length = phrase_info[idx + 1].length + idx += 2 + else: + mora.pitch = phrase_info[idx].pitch + mora.vowel_length = phrase_info[idx].length + idx += 1 + if phrase_info[idx].phoneme == "sp": + phrase.pause_mora = Mora( + text="、", + consonant=None, + consonant_length=None, + vowel="pau", + vowel_length=phrase_info[idx].length, + pitch=0, + ) + idx += 1 + + return accent_phrases diff --git a/voicevox_engine/synthesis_engine/synthesis_engine_base.py b/voicevox_engine/synthesis_engine/synthesis_engine_base.py index ba567bd44..c84a213fe 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine_base.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine_base.py @@ -1,6 +1,7 @@ import copy from abc 
import ABCMeta, abstractmethod from typing import List, Optional +from typing.io import IO from .. import full_context_label from ..full_context_label import extract_full_context_label @@ -224,3 +225,23 @@ def _synthesis_impl(self, query: AudioQuery, speaker_id: int): 音声合成結果 """ raise NotImplementedError() + + @abstractmethod + def guided_synthesis( + self, + query: AudioQuery, + speaker: int, + audio_file: IO, + normalize: int, + ): + raise NotImplementedError() + + @abstractmethod + def guided_accent_phrases( + self, + accent_phrases: List[AccentPhrase], + speaker: int, + audio_file: IO, + normalize: int, + ) -> List[AccentPhrase]: + raise NotImplementedError()
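Finally, a small arithmetic sketch of the timing conversions these modules rely on, assuming (as the comments in `guided_extractor.py` state) that julius4seg alignment indices are in 10 ms units while the decoder consumes 256-sample frames at 24 kHz:

```python
julius_unit_s = 0.01             # one julius4seg alignment index = 10 ms
decoder_frame_s = 256 / 24000    # one engine frame, roughly 10.667 ms

print(julius_unit_s / decoder_frame_s)       # 0.9375, the constant in resample_ts()

# A phoneme aligned to indices [12, 20) spans 80 ms of audio ...
print(int(12 * 0.9375), int(20 * 0.9375))    # ... i.e. decoder frames 11..18

# frame_to_second() reports it as about 0.1325 s .. 0.2225 s
# (the 12.5 ms offset is only added for non-initial phonemes)
print(12 * 0.01 + 0.0125, (20 + 1) * 0.01 + 0.0125)
```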