Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Guided Synthesis #252

Merged
merged 32 commits into from
Mar 10, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a73892b
forced alignment, f0 extraction and entry point
Patchethium Dec 28, 2021
28cf7c2
Merge branch 'master' into guided_synthesis
Patchethium Dec 28, 2021
a060398
kind of finished
Patchethium Dec 28, 2021
f7a3713
change julius4seg, doesn't seem to help
Patchethium Dec 29, 2021
6b0651f
run pysen format
Patchethium Dec 29, 2021
f1a663a
add speaker id to api
Patchethium Dec 29, 2021
668df80
run pysen format
Patchethium Dec 29, 2021
ad4bdbd
add accent_phrase api, finish
Patchethium Dec 30, 2021
ea95405
add request parameter
Patchethium Dec 30, 2021
6dff2ec
improve error handling
Patchethium Dec 30, 2021
34eec39
run pysen format
Patchethium Dec 30, 2021
a0cba4d
add parameters
Patchethium Dec 30, 2021
90e41e2
run pysen format
Patchethium Dec 30, 2021
e889207
a little boundary check
Patchethium Dec 30, 2021
c98c8be
add normalization for different WAV format
Patchethium Dec 31, 2021
1c6d96e
run format
Patchethium Dec 31, 2021
2d74993
run format
Patchethium Dec 31, 2021
ca356df
Merge branch 'master' into guided_synthesis
Patchethium Dec 31, 2021
f088176
move synthesis and accent phrase to synthesis engine
Patchethium Dec 31, 2021
cf18c3c
add test for mock
Patchethium Dec 31, 2021
98d387c
change url for apis
Patchethium Dec 31, 2021
48b629f
simplify
Patchethium Dec 31, 2021
061483c
error type
Patchethium Jan 11, 2022
fc45886
Merge branch 'master' into guided_synthesis
Patchethium Jan 24, 2022
0e26bbb
do something
Patchethium Feb 21, 2022
365ed92
do something
Patchethium Feb 21, 2022
29427d9
run format
Patchethium Feb 21, 2022
ddc6537
Merge branch 'master' into guided_synthesis
Patchethium Feb 21, 2022
ca6df3b
resolve conflict
Patchethium Feb 21, 2022
730917f
add usage to README
Patchethium Feb 22, 2022
3522370
Merge branch 'master' into guided_synthesis
Patchethium Feb 27, 2022
9b75c6c
add comments and experimental flag for guided api
Patchethium Mar 9, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,9 @@ venv/
/cache

/licenses.json

# Guided Synthesis temp files
/voicevox_engine/dictation-kit*
first_pass*
second_pass*
tmp.wav
129 changes: 127 additions & 2 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import base64
import json
import multiprocessing
import traceback
import zipfile
from functools import lru_cache
from pathlib import Path
Expand All @@ -11,11 +12,12 @@

import soundfile
import uvicorn
from fastapi import FastAPI, HTTPException, Request, Response
from fastapi import FastAPI, File, Form, HTTPException, Request, Response, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.params import Query
from starlette.responses import FileResponse

import voicevox_engine.guided as guided
from voicevox_engine.cancellable_engine import CancellableEngine
from voicevox_engine.kana_parser import create_kana, parse_kana
from voicevox_engine.model import (
Expand Down Expand Up @@ -206,6 +208,63 @@ def accent_phrases(
enable_interrogative=enable_interrogative,
)

@app.post(
    "/guided/accent_phrase",
    # NOTE(review): a reviewer suggested response_model=List[AccentPhrase]
    # here. This handler still returns an AudioQuery, so the response model
    # is kept as AudioQuery until the return value itself is changed.
    response_model=AudioQuery,
    tags=["クエリ作成"],
    summary="Create Audio Query Guided by External Audio",
)
def guided_accent_phrase(
    kana: str = Form(...),
    speaker_id: int = Form(...),
    normalize: int = Form(...),
    audio_file: UploadFile = File(...),
):
    # Build an AudioQuery whose accent phrases come from guided.accent_phrase,
    # which receives the uploaded audio file object together with the kana
    # text, speaker id and normalize flag taken from the multipart form.
    #
    # Error mapping:
    #   ParseKanaError            -> 422 (the kana sent by the client is invalid)
    #   StopIteration             -> 500, reported as a forced-alignment failure
    #   Exception("Decode Failed") -> 500, reported as a forced-alignment failure
    #   any other Exception       -> 500 "Internal Server Error."
    #
    # Comments are used instead of a docstring on purpose: FastAPI publishes
    # an endpoint's docstring as its OpenAPI description.
    try:
        accent_phrases = guided.accent_phrase(
            engine=engine,
            audio_file=audio_file.file,
            kana=kana,
            speaker_id=speaker_id,
            normalize=normalize,
        )
        # Every field other than the accent phrases and the kana derived from
        # them is filled with a fixed default.
        return AudioQuery(
            accent_phrases=accent_phrases,
            speedScale=1,
            pitchScale=0,
            intonationScale=1,
            volumeScale=1,
            prePhonemeLength=0.1,
            postPhonemeLength=0.1,
            outputSamplingRate=default_sampling_rate,
            outputStereo=False,
            kana=create_kana(accent_phrases),
        )
    except ParseKanaError as err:
        print(traceback.format_exc())
        # Review feedback (ref #91): unparsable kana is a client-side input
        # problem, so answer 422 rather than 500.
        raise HTTPException(
            status_code=422,
            detail="Failed to Parse Kana",
        ) from err
    except StopIteration as err:
        print(traceback.format_exc())
        raise HTTPException(
            status_code=500,
            detail="Failed in Forced Alignment. Please try again with another Audio Resource",
        ) from err
    except Exception as err:
        print(traceback.format_exc())
        # The decode failure is recognised only by its message text, so it is
        # matched on str(err).
        if str(err) == "Decode Failed":
            detail = "Failed in Forced Alignment. Please try again with another Audio Resource"
        else:
            detail = "Internal Server Error."
        raise HTTPException(status_code=500, detail=detail) from err

@app.post(
"/mora_data",
response_model=List[AccentPhrase],
Expand Down Expand Up @@ -324,7 +383,7 @@ def multi_synthesis(queries: List[AudioQuery], speaker: int):
format="WAV",
)
wav_file.seek(0)
zip_file.writestr(f"{str(i+1).zfill(3)}.wav", wav_file.read())
zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())

return FileResponse(f.name, media_type="application/zip")

Expand Down Expand Up @@ -376,6 +435,72 @@ def _synthesis_morphing(

return FileResponse(f.name, media_type="audio/wav")

@app.post(
    "/guided/synthesis",
    responses={
        200: {
            "content": {
                "audio/wav": {"schema": {"type": "string", "format": "binary"}}
            },
        }
    },
    tags=["音声合成"],
    summary="Audio synthesis guided by external audio and phonemes in kana, both uploaded in one form",
)
def guided_synthesis(
    kana: str = Form(...),
    speaker_id: int = Form(...),
    normalize: int = Form(...),
    audio_file: UploadFile = File(...),
    stereo: int = Form(...),
    sample_rate: int = Form(...),
    volumeScale: float = Form(...),
    pitchScale: float = Form(...),
    speedScale: float = Form(...),
):
    # Synthesize audio via guided.synthesis from the uploaded audio file and
    # the form parameters, write the result to a temporary WAV file and
    # return that file as an "audio/wav" response.
    #
    # Error mapping:
    #   ParseKanaError            -> 422 (the kana sent by the client is invalid)
    #   StopIteration             -> 500, reported as a forced-alignment failure
    #   Exception("Decode Failed") -> 500, reported as a forced-alignment failure
    #   any other Exception       -> 500 "Internal Server Error."
    #
    # Comments are used instead of a docstring on purpose: FastAPI publishes
    # an endpoint's docstring as its OpenAPI description.
    try:
        wave = guided.synthesis(
            engine=engine,
            audio_file=audio_file.file,
            kana=kana,
            speaker_id=speaker_id,
            normalize=normalize,
            stereo=stereo,
            sample_rate=sample_rate,
            volumeScale=volumeScale,
            pitchScale=pitchScale,
            speedScale=speedScale,
        )

        # delete=False keeps the file on disk after the with-block closes it,
        # so FileResponse can still read it by name. Nothing in this handler
        # removes the file afterwards.
        with NamedTemporaryFile(delete=False) as f:
            soundfile.write(file=f, data=wave, samplerate=sample_rate, format="WAV")

        return FileResponse(f.name, media_type="audio/wav")
    except ParseKanaError as err:
        print(traceback.format_exc())
        # Unparsable kana is a client-side input problem, so answer 422
        # rather than 500.
        raise HTTPException(
            status_code=422,
            detail="Failed to Parse Kana",
        ) from err
    except StopIteration as err:
        print(traceback.format_exc())
        raise HTTPException(
            status_code=500,
            detail="Failed in Forced Alignment. Please try again with another Audio Resource",
        ) from err
    except Exception as err:
        print(traceback.format_exc())
        # The decode failure is recognised only by its message text, so it is
        # matched on str(err).
        if str(err) == "Decode Failed":
            detail = "Failed in Forced Alignment. Please try again with another Audio Resource"
        else:
            detail = "Internal Server Error."
        raise HTTPException(status_code=500, detail=detail) from err

@app.post(
"/connect_waves",
response_class=FileResponse,
Expand Down
Loading