ENH: Support fish speech reference audio (#2542)
codingl2k1 authored Nov 19, 2024
1 parent 4c96475 commit 0cdfb43
Showing 4 changed files with 43 additions and 7 deletions.
33 changes: 32 additions & 1 deletion doc/source/models/model_abilities/audio.rst
@@ -331,7 +331,7 @@ Clone voice, launch model ``CosyVoice-300M``.
zero_shot_prompt_text = ""
# The zero shot prompt file is the voice file
# the words said in the file shoule be identical to zero_shot_prompt_text
# the words said in the file should be identical to zero_shot_prompt_text
with open(zero_shot_prompt_file, "rb") as f:
    zero_shot_prompt = f.read()
@@ -379,3 +379,34 @@ Instruction based, launch model ``CosyVoice-300M-Instruct``.
)
More instructions and examples can be found at https://fun-audio-llm.github.io/ .


FishSpeech Usage
~~~~~~~~~~~~~~~~

For basic usage, refer to :ref:`audio speech usage <audio_speech>`.

Clone voice, launch model ``FishSpeech-1.4``. Please use ``prompt_speech`` instead of ``reference_audio``
to provide the reference audio to the FishSpeech model.

.. code-block::
from xinference.client import Client
client = Client("http://<XINFERENCE_HOST>:<XINFERENCE_PORT>")
model = client.get_model("<MODEL_UID>")
reference_text = ""
# The reference audio file is the voice file
# the words said in the file should be identical to reference_text
reference_audio_file = "<THE_REFERENCE_AUDIO_FILE>"
with open(reference_audio_file, "rb") as f:
    reference_audio = f.read()
speech_bytes = model.speech(
    "<The text to generate audio for>",
    reference_text=reference_text,
    prompt_speech=reference_audio,
    enable_reference_audio=True,
)
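The returned ``speech_bytes`` can then be written straight to disk for playback. A minimal sketch, assuming the response bytes are a complete encoded audio file (the output file name and suffix are illustrative):

.. code-block::

    # Persist the generated speech; the container format depends on the
    # model's response format, so the ".wav" suffix is only an assumption.
    with open("fish_speech_output.wav", "wb") as f:
        f.write(speech_bytes)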
2 changes: 2 additions & 0 deletions xinference/client/restful/restful_client.py
@@ -704,6 +704,8 @@ def speech(
    The speed of the generated audio.
stream: bool
    Use stream or not.
prompt_speech: bytes
    The reference audio bytes to be provided to the model.
Returns
-------
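For context, a minimal sketch of how the newly documented prompt_speech parameter could be combined with the existing speed and stream arguments of the RESTful client; the endpoint, model UID, and file name are placeholders, not values from this commit:

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model = client.get_model("<MODEL_UID>")

# prompt_speech carries the raw bytes of the reference recording.
with open("reference.wav", "rb") as f:
    prompt_speech = f.read()

audio_bytes = model.speech(
    "Hello, this voice is cloned from the reference recording.",
    prompt_speech=prompt_speech,
    speed=1.0,
    stream=False,
)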
12 changes: 8 additions & 4 deletions xinference/model/audio/fish_speech.py
@@ -81,12 +81,13 @@ def load(self):
if not is_device_available(self._device):
    raise ValueError(f"Device {self._device} is not available!")

logger.info("Loading Llama model...")
enable_compile = self._kwargs.get("compile", False)
logger.info("Loading Llama model, compile=%s...", enable_compile)
self._llama_queue = launch_thread_safe_queue(
    checkpoint_path=self._model_path,
    device=self._device,
    precision=torch.bfloat16,
    compile=False,
    compile=enable_compile,
)
logger.info("Llama model loaded, loading VQ-GAN model...")

@@ -208,11 +209,14 @@ def speech(
logger.warning("stream mode is not implemented.")
import torchaudio

prompt_speech = kwargs.get("prompt_speech")
result = list(
    self._inference(
        text=input,
        enable_reference_audio=False,
        reference_audio=None,
        enable_reference_audio=kwargs.get(
            "enable_reference_audio", prompt_speech is not None
        ),
        reference_audio=prompt_speech,
        reference_text=kwargs.get("reference_text", ""),
        max_new_tokens=kwargs.get("max_new_tokens", 1024),
        chunk_length=kwargs.get("chunk_length", 200),
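With the defaulting above, enable_reference_audio falls back to prompt_speech is not None, so passing reference audio alone is enough to switch voice cloning on; the flag only needs to be set explicitly to override that default. A minimal client-side sketch (endpoint, model UID, and file name are placeholders):

from xinference.client import Client

model = Client("http://127.0.0.1:9997").get_model("<MODEL_UID>")

with open("reference.wav", "rb") as f:
    reference_audio = f.read()

# enable_reference_audio is omitted: it defaults to True because prompt_speech is set.
cloned = model.speech("Spoken in the cloned voice.", prompt_speech=reference_audio)

# The default can still be overridden explicitly, falling back to the plain voice.
plain = model.speech(
    "Spoken without using the reference audio.",
    prompt_speech=reference_audio,
    enable_reference_audio=False,
)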
3 changes: 1 addition & 2 deletions xinference/model/audio/tests/test_fish_speech.py
@@ -22,8 +22,7 @@ def test_fish_speech(setup):
client = Client(endpoint)

model_uid = client.launch_model(
model_name="FishSpeech-1.4",
model_type="audio",
model_name="FishSpeech-1.4", model_type="audio", compile=False
)
model = client.get_model(model_uid)
input_string = "你好,你是谁?"
