VOICEVOX · Patchethium · Aug 13, 2023 · Aug 14, 2023 · Aug 15, 2023 · Aug 15, 2023
@@ -96,6 +96,10 @@ jobs:
           username: ${{ vars.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
 
+      # Download snfa forced aligner model; -f makes curl fail on HTTP errors instead of saving the error page
+      - name: Download model for guided synthesis
+        run: curl -fL https://github.com/Patchethium/snfa/releases/download/v0.0.1/cv_jp.bin -o ./cv_jp.bin
+
       # Download VOICEVOX RESOURCE
       - name: Prepare VOICEVOX RESOURCE cache
         uses: actions/cache@v3

@@ -383,6 +383,10 @@ jobs:
           key: ${{ steps.onnxruntime-cache-restore.outputs.cache-primary-key }}
           path: download/onnxruntime
 
+      # Download snfa forced aligner model; -f makes curl fail on HTTP errors instead of saving the error page
+      - name: Download model for guided synthesis
+        run: curl -fL https://github.com/Patchethium/snfa/releases/download/v0.0.1/cv_jp.bin -o ./cv_jp.bin
+
       # Download VOICEVOX RESOURCE
       - name: Prepare VOICEVOX RESOURCE cache
         uses: actions/cache@v3

@@ -22,6 +22,10 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
+      # Download snfa forced aligner model; -f makes curl fail on HTTP errors instead of saving the error page
+      - name: Download model for guided synthesis
+        run: curl -fL https://github.com/Patchethium/snfa/releases/download/v0.0.1/cv_jp.bin -o ./cv_jp.bin
+
       - name: Set up Python ${{ matrix.python }}
         uses: actions/setup-python@v4
         with:

@@ -155,3 +155,7 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# snfa forced aligner model file
+# for `/guide` API
+cv_jp.bin
@@ -272,6 +272,11 @@ RUN <<EOF
     fi
 EOF
 
+# Download snfa's forced aligner model; -f aborts the build on HTTP errors instead of baking an error page into the image
+RUN <<EOF
+    curl -fL https://github.com/Patchethium/snfa/releases/download/v0.0.1/cv_jp.bin -o ./cv_jp.bin
+EOF
+
 # Download Resource
 ARG VOICEVOX_RESOURCE_VERSION=0.14.3
 RUN <<EOF

@@ -359,6 +359,8 @@ Issue 側で取り組み始めたことを伝えるか、最初に Draft プル
 ```bash
 # 開発に必要なライブラリのインストール
 python -m pip install -r requirements-dev.txt -r requirements-test.txt
+# `guide` API 用のモデルをダウンロードする（`-f` で HTTP エラー時に失敗させる）
+curl -fL https://github.com/Patchethium/snfa/releases/download/v0.0.1/cv_jp.bin -o ./cv_jp.bin
 
 # とりあえず実行したいだけなら代わりにこちら
 python -m pip install -r requirements.txt
@@ -492,6 +494,10 @@ python -m pip install -r requirements-dev.txt
 OUTPUT_LICENSE_JSON_PATH=licenses.json \
 bash build_util/create_venv_and_generate_licenses.bash
 
+# `guide`を有効化するモデルをダウンロードします（`-f`でHTTPエラー時に失敗させ、壊れたファイルの保存を防ぎます）
+curl -fL https://github.com/Patchethium/snfa/releases/download/v0.0.1/cv_jp.bin -o ./cv_jp.bin
+
+# ビルド自体はLIBCORE_PATH及びLIBONNXRUNTIME_PATHの指定がなくても可能です
 # モックでビルドする場合
 pyinstaller --noconfirm run.spec
 

@@ -54,6 +54,7 @@ requests = "^2.28.1"
 jinja2 = "^3.1.2"
 pyopenjtalk = {git = "https://github.com/VOICEVOX/pyopenjtalk", rev = "acd4f02d2af3129382c151590238b9370465e360"}
 semver = "^3.0.0"
+snfa = "^0.0.1"
 platformdirs = "^3.10.0"
 
 [tool.poetry.group.dev.dependencies]
@@ -63,6 +64,7 @@ pre-commit = "^2.16.0"
 atomicwrites = "^1.4.0"
 colorama = "^0.4.4"
 poetry = "^1.3.1"
+snfa = "^0.0.1"
 
 [tool.poetry.group.test.dependencies]
 pysen = "~0.10.5"
@@ -74,6 +76,7 @@ mypy = "~0.991"
 pytest = "^6.2.5"
 coveralls = "^3.2.0"
 poetry = "^1.3.1"
+snfa = "^0.0.1"
 
 [tool.poetry.group.license.dependencies]
 pip-licenses = "^4.2.0"

@@ -66,6 +66,7 @@ semver==3.0.1 ; python_version >= "3.11" and python_version < "3.12"
 setuptools==68.1.2 ; python_version >= "3.11" and python_version < "3.12"
 shellingham==1.5.3 ; python_version >= "3.11" and python_version < "3.12"
 six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
+snfa==0.0.1 ; python_version >= "3.11" and python_version < "3.12"
 sniffio==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
 soundfile==0.10.3.post1 ; python_version >= "3.11" and python_version < "3.12"
 starlette==0.16.0 ; python_version >= "3.11" and python_version < "3.12"

@@ -26,6 +26,7 @@ requests==2.31.0 ; python_version >= "3.11" and python_version < "3.12"
 scipy==1.11.2 ; python_version >= "3.11" and python_version < "3.12"
 semver==3.0.1 ; python_version >= "3.11" and python_version < "3.12"
 six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
+snfa==0.0.1 ; python_version >= "3.11" and python_version < "3.12"
 sniffio==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
 soundfile==0.10.3.post1 ; python_version >= "3.11" and python_version < "3.12"
 starlette==0.16.0 ; python_version >= "3.11" and python_version < "3.12"

@@ -79,6 +79,7 @@ semver==3.0.1 ; python_version >= "3.11" and python_version < "3.12"
 shellingham==1.5.3 ; python_version >= "3.11" and python_version < "3.12"
 six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
 smmap==5.0.0 ; python_version >= "3.11" and python_version < "3.12"
+snfa==0.0.1 ; python_version >= "3.11" and python_version < "3.12"
 sniffio==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
 soundfile==0.10.3.post1 ; python_version >= "3.11" and python_version < "3.12"
 starlette==0.16.0 ; python_version >= "3.11" and python_version < "3.12"

@@ -24,6 +24,7 @@ requests==2.31.0 ; python_version >= "3.11" and python_version < "3.12"
 scipy==1.11.2 ; python_version >= "3.11" and python_version < "3.12"
 semver==3.0.1 ; python_version >= "3.11" and python_version < "3.12"
 six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
+snfa==0.0.1 ; python_version >= "3.11" and python_version < "3.12"
 sniffio==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
 soundfile==0.10.3.post1 ; python_version >= "3.11" and python_version < "3.12"
 starlette==0.16.0 ; python_version >= "3.11" and python_version < "3.12"

@@ -326,6 +326,44 @@ def accent_phrases(
         else:
             return engine.create_accent_phrases(text, speaker_id=speaker)
 
+    # Experimental (needs --enable_guided): analyzes the audio file at ref_path and
+    # returns the result as an AudioQuery; responds 404 while disabled, 422 if unreadable.
+    # NOTE(review): ref_path is opened on the server exactly as sent by the client — confirm callers are trusted.
+    @app.post(
+        "/guide",
+        response_model=AudioQuery,
+        tags=["クエリ編集"],
+        summary="Create Audio Query guided by External Audio",
+    )
+    def guide(
+        query: AudioQuery,
+        speaker: int,
+        ref_path: str,
+        normalize: bool,
+        core_version: Optional[str] = None,
+    ):
+        if not args.enable_guided:
+            raise HTTPException(
+                status_code=404,
+                detail="実験的機能はデフォルトで無効になっています。使用するには引数を指定してください。",
+            )
+        try:
+            with open(ref_path, "rb") as file:
+                # dtype="float32" makes soundfile return samples scaled into [-1.0, 1.0]
+                wav, sr = soundfile.read(file, dtype="float32")
+        except Exception as err:
+            raise HTTPException(status_code=422, detail="Invalid wav file") from err
+
+        engine = get_engine(core_version)
+        return engine.guide(
+            query=query,
+            speaker_id=speaker,
+            ref_wav=wav,
+            sr=sr,
+            normalize=normalize,
+            model_path=args.guide_model,
+        )
+
     @app.post(
         "/mora_data",
         response_model=List[AccentPhrase],
@@ -475,18 +513,14 @@ def multi_synthesis(
         sampling_rate = queries[0].outputSamplingRate
 
         with NamedTemporaryFile(delete=False) as f:
-
             with zipfile.ZipFile(f, mode="a") as zip_file:
-
                 for i in range(len(queries)):
-
                     if queries[i].outputSamplingRate != sampling_rate:
                         raise HTTPException(
                             status_code=422, detail="サンプリングレートが異なるクエリがあります"
                         )
 
                     with TemporaryFile() as wav_file:
-
                         wave = engine.synthesis(query=queries[i], speaker_id=speaker)
                         soundfile.write(
                             file=wav_file,
@@ -1221,6 +1255,15 @@ def custom_openapi():
         action="store_true",
         help="指定すると音声合成を途中でキャンセルできるようになります。",
     )
+    parser.add_argument(
+        "--enable_guided", action="store_true", help="入力音声を解析して音声合成クエリで返す機能を有効化します。"
+    )
+    parser.add_argument(
+        "--guide_model",
+        type=Path,
+        default="cv_jp.bin",
+        help="guided機能に入力音声の発音の長さを解析するため必要なモデルファイルです。",
+    )
     parser.add_argument(
         "--init_processes",
         type=int,

@@ -12,6 +12,8 @@ datas = [
     ('presets.yaml', '.'),
     ('default_setting.yml', '.'),
     ('ui_template', 'ui_template'),
+    ('model', 'model'),
+    ('cv_jp.bin', '.')
 ]
 datas += collect_data_files('pyopenjtalk')