VOICEVOX · Hiroshiba · Mar 10, 2022 · Dec 28, 2021 · Dec 28, 2021 · Dec 28, 2021
@@ -23,3 +23,9 @@ venv/
 /cache
 
 /licenses.json
+
+# Guided Synthesis temp files
+/voicevox_engine/dictation-kit*
+first_pass*
+second_pass*
+tmp.wav
@@ -3,6 +3,7 @@
 import base64
 import json
 import multiprocessing
+import traceback
 import zipfile
 from functools import lru_cache
 from pathlib import Path
@@ -11,11 +12,12 @@
 
 import soundfile
 import uvicorn
-from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.params import Query
 from starlette.responses import FileResponse
 
+import voicevox_engine.guided as guided
 from voicevox_engine.cancellable_engine import CancellableEngine
 from voicevox_engine.kana_parser import create_kana, parse_kana
 from voicevox_engine.model import (
@@ -206,6 +208,63 @@ def accent_phrases(
                 enable_interrogative=enable_interrogative,
             )
 
+    @app.post(
+        "/guided/accent_phrase",
+        response_model=AudioQuery,
+        tags=["クエリ作成"],
+        summary="Create Audio Query Guided by External Audio",
+    )
+    def guided_accent_phrase(
+        kana: str = Form(...),
+        speaker_id: int = Form(...),
+        normalize: int = Form(...),
+        audio_file: UploadFile = File(...),
+    ):
+        """
+        Build an AudioQuery whose accent phrases come from an uploaded audio file.
+
+        All inputs arrive as fields of a single multipart form.
+
+        Parameters
+        ----------
+        kana : str
+            Kana text, forwarded unchanged to ``guided.accent_phrase``.
+        speaker_id : int
+            Speaker identifier, forwarded unchanged to ``guided.accent_phrase``.
+        normalize : int
+            Forwarded unchanged to ``guided.accent_phrase``.
+            NOTE(review): presumably a 0/1 flag -- confirm in voicevox_engine.guided.
+        audio_file : UploadFile
+            Uploaded guide audio; its underlying file object is what gets
+            passed on.
+
+        Returns
+        -------
+        AudioQuery
+            The guided accent phrases combined with fixed default synthesis
+            parameters and a kana string regenerated from those phrases.
+
+        Raises
+        ------
+        HTTPException
+            Always status 500; ``detail`` distinguishes a kana parse failure,
+            a forced-alignment failure, and any other error.
+        """
+        try:
+            accent_phrases = guided.accent_phrase(
+                engine=engine,
+                audio_file=audio_file.file,
+                kana=kana,
+                speaker_id=speaker_id,
+                normalize=normalize,
+            )
+            # Only the accent phrases are derived from the audio; every other
+            # AudioQuery field is a fixed default.
+            return AudioQuery(
+                accent_phrases=accent_phrases,
+                speedScale=1,
+                pitchScale=0,
+                intonationScale=1,
+                volumeScale=1,
+                prePhonemeLength=0.1,
+                postPhonemeLength=0.1,
+                outputSamplingRate=default_sampling_rate,
+                outputStereo=False,
+                kana=create_kana(accent_phrases),
+            )
+        except ParseKanaError:
+            # NOTE(review): malformed kana is caller input, so a 4xx status may
+            # fit better than 500 -- confirm against the other endpoints.
+            print(traceback.format_exc())
+            raise HTTPException(
+                status_code=500,
+                detail="Failed to Parse Kana",
+            )
+        except StopIteration:
+            print(traceback.format_exc())
+            raise HTTPException(
+                status_code=500,
+                detail="Failed in Forced Alignment. Please try again with another Audio Resource",
+            )
+        except Exception as e:
+            print(traceback.format_exc())
+            # The failure is recognised only by its message text.
+            # NOTE(review): presumably raised by voicevox_engine.guided when
+            # alignment decoding fails -- verify there.
+            if str(e) == "Decode Failed":
+                raise HTTPException(
+                    status_code=500,
+                    detail="Failed in Forced Alignment. Please try again with another Audio Resource",
+                )
+            else:
+                raise HTTPException(
+                    status_code=500,
+                    detail="Internal Server Error.",
+                )
+
     @app.post(
         "/mora_data",
         response_model=List[AccentPhrase],
@@ -324,7 +383,7 @@ def multi_synthesis(queries: List[AudioQuery], speaker: int):
                             format="WAV",
                         )
                         wav_file.seek(0)
-                        zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read())
+                        zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())
 
         return FileResponse(f.name, media_type="application/zip")
 
@@ -376,6 +435,72 @@ def _synthesis_morphing(
 
         return FileResponse(f.name, media_type="audio/wav")
 
+    @app.post(
+        "/guided/synthesis",
+        responses={
+            200: {
+                "content": {
+                    "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+                },
+            }
+        },
+        tags=["音声合成"],
+        summary="Audio synthesis guided by external audio and phonemes in kana, both uploaded in one form",
+    )
+    def guided_synthesis(
+        kana: str = Form(...),
+        speaker_id: int = Form(...),
+        normalize: int = Form(...),
+        audio_file: UploadFile = File(...),
+        stereo: int = Form(...),
+        sample_rate: int = Form(...),
+        volumeScale: float = Form(...),
+        pitchScale: float = Form(...),
+        speedScale: float = Form(...),
+    ):
+        """
+        Synthesize a WAV guided by an uploaded audio file and return it as a file.
+
+        Every form field is forwarded as-is to ``guided.synthesis`` (the upload
+        is passed as its underlying file object). ``sample_rate`` is also used
+        as the sample rate of the WAV that is written out.
+
+        Returns
+        -------
+        FileResponse
+            Serves the temporary WAV file with media type ``audio/wav``.
+
+        Raises
+        ------
+        HTTPException
+            Always status 500; ``detail`` distinguishes a kana parse failure,
+            a forced-alignment failure, and any other error.
+        """
+        alignment_failed = (
+            "Failed in Forced Alignment. Please try again with another Audio Resource"
+        )
+        try:
+            waveform = guided.synthesis(
+                engine=engine,
+                audio_file=audio_file.file,
+                kana=kana,
+                speaker_id=speaker_id,
+                normalize=normalize,
+                stereo=stereo,
+                sample_rate=sample_rate,
+                volumeScale=volumeScale,
+                pitchScale=pitchScale,
+                speedScale=speedScale,
+            )
+
+            # delete=False keeps the file on disk after the handle is closed,
+            # so FileResponse can still read it by name.
+            with NamedTemporaryFile(delete=False) as wav_tmp:
+                soundfile.write(
+                    file=wav_tmp,
+                    data=waveform,
+                    samplerate=sample_rate,
+                    format="WAV",
+                )
+
+            return FileResponse(wav_tmp.name, media_type="audio/wav")
+        except ParseKanaError:
+            print(traceback.format_exc())
+            raise HTTPException(status_code=500, detail="Failed to Parse Kana")
+        except StopIteration:
+            print(traceback.format_exc())
+            raise HTTPException(status_code=500, detail=alignment_failed)
+        except Exception as err:
+            print(traceback.format_exc())
+            # A "Decode Failed" message is reported as an alignment failure;
+            # anything else is reported as a generic server error.
+            detail = (
+                alignment_failed
+                if str(err) == "Decode Failed"
+                else "Internal Server Error."
+            )
+            raise HTTPException(status_code=500, detail=detail)
+
     @app.post(
         "/connect_waves",
         response_class=FileResponse,