VOICEVOX · Hiroshiba · Mar 10, 2022 · Dec 28, 2021 · Dec 28, 2021 · Dec 28, 2021
@@ -160,3 +160,9 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# Guided Synthesis temp files
+/voicevox_engine/experimental/dictation-kit*
+first_pass*
+second_pass*
+tmp.wav
@@ -17,7 +17,7 @@
 
 import soundfile
 import uvicorn
-from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.params import Query
 from pydantic import ValidationError
@@ -213,6 +213,66 @@ def accent_phrases(
         else:
             return engine.create_accent_phrases(text, speaker_id=speaker)
 
+    @app.post(
+        "/guided_accent_phrase",
+        response_model=List[AccentPhrase],
+        tags=["クエリ編集"],
+        summary="Create Accent Phrase from External Audio",
+    )
+    def guided_accent_phrase(
+        text: str = Form(...),  # noqa:B008
+        speaker: int = Form(...),  # noqa:B008
+        is_kana: bool = Form(...),  # noqa:B008
+        audio_file: UploadFile = File(...),  # noqa: B008
+        normalize: bool = Form(...),  # noqa:B008
+        core_version: Optional[str] = None,
+    ):
+        # Builds accent phrases from `text` (parsed with parse_kana when `is_kana`
+        # is true, otherwise created by the engine) and hands them, together with
+        # the uploaded audio, to `engine.guided_accent_phrases`.
+        #
+        # Error responses raised here:
+        #   400 - parse_kana(text) raised ParseKanaError
+        #   422 - engine.guided_accent_phrases raised ParseKanaError
+        #   500 - StopIteration / "Decode Failed" (forced alignment), or any
+        #         other exception from the engine call
+        engine = get_engine(core_version)
+        if is_kana:
+            try:
+                accent_phrases = parse_kana(text)
+            except ParseKanaError as err:
+                raise HTTPException(
+                    status_code=400,
+                    detail=ParseKanaBadRequest(err).dict(),
+                ) from err
+        else:
+            accent_phrases = engine.create_accent_phrases(
+                text,
+                speaker_id=speaker,
+            )
+
+        try:
+            return engine.guided_accent_phrases(
+                accent_phrases=accent_phrases,
+                speaker=speaker,
+                audio_file=audio_file.file,
+                normalize=normalize,
+            )
+        except ParseKanaError as err:
+            raise HTTPException(
+                status_code=422,
+                detail=ParseKanaBadRequest(err).dict(),
+            ) from err
+        except Exception as err:
+            print(traceback.format_exc())
+            # StopIteration and the "Decode Failed" message both map to the
+            # forced-alignment failure response; anything else is a generic 500.
+            if isinstance(err, StopIteration) or str(err) == "Decode Failed":
+                detail = "Failed in Forced Alignment"
+            else:
+                detail = "Internal Server Error"
+            # Chain the cause so the original exception stays attached.
+            raise HTTPException(status_code=500, detail=detail) from err
+
     @app.post(
         "/mora_data",
         response_model=List[AccentPhrase],
@@ -364,7 +424,7 @@ def multi_synthesis(
                             format="WAV",
                         )
                         wav_file.seek(0)
-                        zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read())
+                        zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())
 
         return FileResponse(f.name, media_type="application/zip")
 
@@ -418,6 +478,80 @@ def _synthesis_morphing(
 
         return FileResponse(f.name, media_type="audio/wav")
 
+    @app.post(
+        "/guided_synthesis",
+        responses={
+            200: {
+                "content": {
+                    "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+                },
+            }
+        },
+        tags=["音声合成"],
+        summary="Audio synthesis guided by external audio and phonemes",
+    )
+    def guided_synthesis(
+        kana: str = Form(...),  # noqa: B008
+        speaker_id: int = Form(...),  # noqa: B008
+        normalize: int = Form(...),  # noqa: B008
+        audio_file: UploadFile = File(...),  # noqa: B008
+        stereo: int = Form(...),  # noqa: B008
+        sample_rate: int = Form(...),  # noqa: B008
+        volume_scale: float = Form(...),  # noqa: B008
+        pitch_scale: float = Form(...),  # noqa: B008
+        speed_scale: float = Form(...),  # noqa: B008
+        core_version: Optional[str] = None,
+    ):
+        # Parses `kana` into accent phrases, wraps them in an AudioQuery built
+        # from the form fields, asks the engine for a waveform guided by the
+        # uploaded audio, and returns that waveform as a WAV file response.
+        #
+        # Error responses raised here:
+        #   400 - ParseKanaError raised anywhere inside the try body
+        #   500 - StopIteration / "Decode Failed" (forced alignment), or any
+        #         other exception
+        engine = get_engine(core_version)
+        try:
+            accent_phrases = parse_kana(kana)
+            # intonationScale and the pre/post phoneme lengths are fixed here;
+            # they are not exposed as form fields.
+            query = AudioQuery(
+                accent_phrases=accent_phrases,
+                speedScale=speed_scale,
+                pitchScale=pitch_scale,
+                intonationScale=1,
+                volumeScale=volume_scale,
+                prePhonemeLength=0.1,
+                postPhonemeLength=0.1,
+                outputSamplingRate=sample_rate,
+                outputStereo=stereo,
+                kana=kana,
+            )
+            wave = engine.guided_synthesis(
+                audio_file=audio_file.file,
+                query=query,
+                speaker=speaker_id,
+                normalize=normalize,
+            )
+
+            # delete=False: the file must outlive the `with` block because
+            # FileResponse is given only its name.
+            with NamedTemporaryFile(delete=False) as f:
+                soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV")
+
+            return FileResponse(f.name, media_type="audio/wav")
+        except ParseKanaError as err:
+            raise HTTPException(
+                status_code=400,
+                detail=ParseKanaBadRequest(err).dict(),
+            ) from err
+        except Exception as err:
+            print(traceback.format_exc())
+            # StopIteration and the "Decode Failed" message both map to the
+            # forced-alignment failure response; anything else is a generic 500.
+            if isinstance(err, StopIteration) or str(err) == "Decode Failed":
+                detail = "Failed in Forced Alignment."
+            else:
+                detail = "Internal Server Error."
+            # Chain the cause so the original exception stays attached.
+            raise HTTPException(status_code=500, detail=detail) from err
+
     @app.post(
         "/connect_waves",
         response_class=FileResponse,

@@ -1,5 +1,6 @@
 from logging import getLogger
 from typing import Any, Dict, List, Optional
+from typing.io import IO
 
 import numpy as np
 from pyopenjtalk import tts
@@ -130,3 +131,50 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
         wave, sr = tts(text)
         wave = resample(wave, 24000 * len(wave) // 48000)
         return wave
+
+    def guided_synthesis(
+        self,
+        query: AudioQuery,
+        speaker: int,
+        audio_file: IO,
+        normalize: int,
+    ) -> np.ndarray:
+        """
+        Guided synthesis stand-in that falls back to plain synthesis [Mock]
+
+        Open JTalk has no guided function, so this simply calls the mock
+        synthesis with the given query and speaker.
+
+        Parameters
+        ----------
+        query : AudioQuery
+            Query forwarded to ``self.synthesis``
+        speaker : int
+            Forwarded to ``self.synthesis`` as ``speaker_id``
+        audio_file : IO
+            Not used by this mock
+        normalize : int
+            Not used by this mock
+
+        Returns
+        -------
+        wave : np.ndarray
+            Whatever ``self.synthesis`` returns for the query and speaker
+        """
+        wave = self.synthesis(speaker_id=speaker, query=query)
+        return wave
+
+    def guided_accent_phrases(
+        self,
+        accent_phrases: List[AccentPhrase],
+        speaker: int,
+        audio_file: IO,
+        normalize: int,
+    ) -> List[AccentPhrase]:
+        """
+        guided_accent_phrases: returns the input accent_phrases as-is,
+        without modifying them [Mock]
+
+        Parameters
+        ----------
+        accent_phrases : List[AccentPhrase]
+            Accent phrases handed back to the caller unchanged
+        speaker : int
+            Not used by this mock
+        audio_file : IO
+            Not used by this mock
+        normalize : int
+            Not used by this mock
+
+        Returns
+        -------
+        accent_phrases : List[AccentPhrase]
+            The same object that was passed in
+        """
+        return accent_phrases