diff --git a/.gitignore b/.gitignore index fc8dbfaec..d06a0063e 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,4 @@ cython_debug/ # inferred result *.wav +*.mp3 diff --git a/README.md b/README.md index 56b7d414a..8b261f398 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ A generative speech model for daily dialogue. [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb) [![Discord](https://img.shields.io/badge/ChatTTS-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/Ud5Jxgx5yD) -**English** | [**简体中文**](docs/cn/README.md) | [**日本語**](docs/jp/README.md) | [**Русский**](docs/ru/README.md) +**English** | [**简体中文**](docs/cn/README.md) | [**日本語**](docs/jp/README.md) | [**Русский**](docs/ru/README.md) | [**Español**](docs/es/README.md) @@ -93,29 +93,31 @@ pip install -r requirements.txt ``` ### Quick Start +> Make sure you are under the project root directory when you execute these commands below. + #### 1. Launch WebUI ```bash python examples/web/webui.py ``` #### 2. Infer by Command Line -> It will save audio to `./output_audio_xxx.wav` +> It will save audio to `./output_audio_n.mp3` ```bash -python examples/cmd/run.py "Please input your text." +python examples/cmd/run.py "Your text 1." "Your text 2." ``` ### Basic ```python import ChatTTS -from IPython.display import Audio +import torch import torchaudio chat = ChatTTS.Chat() chat.load(compile=False) # Set to True for better performance -texts = ["PUT YOUR TEXT HERE",] +texts = ["PUT YOUR 1st TEXT HERE", "PUT YOUR 2nd TEXT HERE"] wavs = chat.infer(texts) @@ -154,6 +156,7 @@ wavs = chat.infer( ################################### # For word level manual control. 
+ text = 'What is [uv_break]your favorite english food?[laugh][lbreak]' wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code) torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000) diff --git a/docs/cn/README.md b/docs/cn/README.md index 7c1598ee2..6fbbbdcaa 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -10,7 +10,7 @@ [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS) [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb) -[**English**](../../README.md) | **简体中文** | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md) +[**English**](../../README.md) | **简体中文** | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md) | [**Español**](../es/README.md) diff --git a/docs/jp/README.md b/docs/jp/README.md index 2f7adda16..92e5e7813 100644 --- a/docs/jp/README.md +++ b/docs/jp/README.md @@ -4,7 +4,7 @@ [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS) -[**English**](../../README.md) | [**简体中文**](../cn/README.md) | **日本語** | [**Русский**](../ru/README.md) +[**English**](../../README.md) | [**简体中文**](../cn/README.md) | **日本語** | [**Русский**](../ru/README.md) | [**Español**](../es/README.md) ChatTTSは、LLMアシスタントなどの対話シナリオ用に特別に設計されたテキストから音声へのモデルです。英語と中国語の両方をサポートしています。私たちのモデルは、中国語と英語で構成される100,000時間以上でトレーニングされています。**[HuggingFace](https://huggingface.co/2Noise/ChatTTS)**でオープンソース化されているバージョンは、40,000時間の事前トレーニングモデルで、SFTは行われていません。 diff --git a/docs/ru/README.md b/docs/ru/README.md index f93a8caef..8bb2d2377 100644 --- a/docs/ru/README.md +++ b/docs/ru/README.md @@ -4,7 +4,7 @@ 
def save_mp3_file(wav, index):
    """
    Write one generated waveform to ``output_audio_{index}.mp3``.

    The samples are first packed into an in-memory mono, 16-bit, 24 kHz WAV
    container, and that buffer is then handed to ``wav2`` to be transcoded
    to MP3 straight into the destination file.

    Args:
        wav: Waveform produced by inference; it is passed to
            ``unsafe_float_to_int16`` before being written as PCM frames
            (presumably a float array -- confirm against that helper).
        index: Position of this waveform in the batch, used in the file name.
    """
    target_path = f"output_audio_{index}.mp3"

    # Stage the PCM data as a WAV file in memory so nothing intermediate
    # touches the disk.
    wav_stream = BytesIO()
    with wave.open(wav_stream, "wb") as writer:
        writer.setframerate(24000)  # Sample rate in Hz
        writer.setsampwidth(2)  # Sample width in bytes
        writer.setnchannels(1)  # Mono channel
        writer.writeframes(unsafe_float_to_int16(wav))

    # Rewind so the transcoder reads the WAV from its first byte.
    wav_stream.seek(0)

    with open(target_path, "wb") as mp3_file:
        wav2(wav_stream, mp3_file, "mp3")
    logger.info(f"Audio saved to {target_path}")
def wav2(i: BytesIO, o: BufferedWriter, format: str):
    """
    Transcode the audio stream read from ``i`` into ``format`` and write it to ``o``.

    Adapted from:
    https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/412a9950a1e371a018c381d1bfb8579c4b0de329/infer/lib/audio.py#L20

    Args:
        i: Readable binary stream holding the source audio.
        o: Writable binary stream that receives the encoded result.
        format: Requested output format name (e.g. ``"mp3"``). It is mapped
            through ``video_format_dict`` to obtain the container format, and
            that result is mapped through ``audio_format_dict`` to obtain the
            codec name; a name missing from either table is used unchanged.

    Both containers are closed when the function exits, including when
    opening the output, decoding, encoding or muxing raises.
    """
    # Resolve names up front instead of rebinding the ``format`` parameter.
    container_format = video_format_dict.get(format, format)
    codec_name = audio_format_dict.get(container_format, container_format)

    # Context managers guarantee cleanup on error; on exit the output
    # container is closed first, then the input, as in the original order.
    with av.open(i, "r") as inp, av.open(o, "w", format=container_format) as out:
        ostream = out.add_stream(codec_name)

        for frame in inp.decode(audio=0):
            for p in ostream.encode(frame):
                out.mux(p)

        # Encoding ``None`` drains whatever the encoder still has buffered.
        for p in ostream.encode(None):
            out.mux(p)