diff --git a/.gitignore b/.gitignore
index fc8dbfaec..d06a0063e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,3 +169,4 @@ cython_debug/
 
 # inferred result
 *.wav
+*.mp3
diff --git a/README.md b/README.md
index 56b7d414a..8b261f398 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ A generative speech model for daily dialogue.
 [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb)
 [![Discord](https://img.shields.io/badge/ChatTTS-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/Ud5Jxgx5yD)
 
-**English** | [**简体中文**](docs/cn/README.md) | [**日本語**](docs/jp/README.md) | [**Русский**](docs/ru/README.md)
+**English** | [**简体中文**](docs/cn/README.md) | [**日本語**](docs/jp/README.md) | [**Русский**](docs/ru/README.md) | [**Español**](docs/es/README.md)
 
 </div>
 
@@ -93,29 +93,31 @@ pip install -r requirements.txt
 ```
 
 ### Quick Start
+> Make sure you are in the project root directory when you run the commands below.
+
 #### 1. Launch WebUI
 ```bash
 python examples/web/webui.py
 ```
 
 #### 2. Infer by Command Line
-> It will save audio to `./output_audio_xxx.wav`
+> It will save the audio for each input text to `./output_audio_n.mp3`, where `n` is the zero-based index of the text.
 
 ```bash
-python examples/cmd/run.py "Please input your text."
+python examples/cmd/run.py "Your text 1." "Your text 2."
 ```
 
 ### Basic
 
 ```python
 import ChatTTS
-from IPython.display import Audio
+import torch
 import torchaudio
 
 chat = ChatTTS.Chat()
 chat.load(compile=False) # Set to True for better performance
 
-texts = ["PUT YOUR TEXT HERE",]
+texts = ["PUT YOUR 1st TEXT HERE", "PUT YOUR 2nd TEXT HERE"]
 
 wavs = chat.infer(texts)
 
@@ -154,6 +156,7 @@ wavs = chat.infer(
 
 ###################################
 # For word level manual control.
+
 text = 'What is [uv_break]your favorite english food?[laugh][lbreak]'
 wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text,  params_infer_code=params_infer_code)
 torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000)
diff --git a/docs/cn/README.md b/docs/cn/README.md
index 7c1598ee2..6fbbbdcaa 100644
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@@ -10,7 +10,7 @@
 [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
 [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb)
 
-[**English**](../../README.md) | **简体中文** | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md)
+[**English**](../../README.md) | **简体中文** | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md) | [**Español**](../es/README.md)
 
 </div>
 
diff --git a/docs/jp/README.md b/docs/jp/README.md
index 2f7adda16..92e5e7813 100644
--- a/docs/jp/README.md
+++ b/docs/jp/README.md
@@ -4,7 +4,7 @@
 
 [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
 
-[**English**](../../README.md) | [**简体中文**](../cn/README.md) | **日本語** | [**Русский**](../ru/README.md)
+[**English**](../../README.md) | [**简体中文**](../cn/README.md) | **日本語** | [**Русский**](../ru/README.md) | [**Español**](../es/README.md)
 
 ChatTTSは、LLMアシスタントなどの対話シナリオ用に特別に設計されたテキストから音声へのモデルです。英語と中国語の両方をサポートしています。私たちのモデルは、中国語と英語で構成される100,000時間以上でトレーニングされています。**[HuggingFace](https://huggingface.co/2Noise/ChatTTS)**でオープンソース化されているバージョンは、40,000時間の事前トレーニングモデルで、SFTは行われていません。
 
diff --git a/docs/ru/README.md b/docs/ru/README.md
index f93a8caef..8bb2d2377 100644
--- a/docs/ru/README.md
+++ b/docs/ru/README.md
@@ -4,7 +4,7 @@
 
 [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
 
-[**English**](../../README.md) | [**简体中文**](../cn/README.md) | [**日本語**](../jp/README.md) | **Русский**
+[**English**](../../README.md) | [**简体中文**](../cn/README.md) | [**日本語**](../jp/README.md) | **Русский** | [**Español**](../es/README.md)
 
 ChatTTS - это модель преобразования текста в речь, специально разработанная для диалоговых сценариев, таких как помощник LLM. Она поддерживает как английский, так и китайский языки. Наша модель обучена на более чем 100 000 часах английского и китайского языков. Открытая версия на **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** - это предварительно обученная модель с 40 000 часами без SFT.
 
diff --git a/examples/cmd/run.py b/examples/cmd/run.py
index 12fb8f615..9579ac507 100644
--- a/examples/cmd/run.py
+++ b/examples/cmd/run.py
@@ -8,23 +8,28 @@
 
 import wave
 import argparse
+from io import BytesIO
 
 import ChatTTS
 
-from tools.audio import unsafe_float_to_int16
+from tools.audio import unsafe_float_to_int16, wav2
 from tools.logger import get_logger
 
 logger = get_logger("Command")
 
 
-def save_wav_file(wav, index):
-    wav_filename = f"output_audio_{index}.wav"
-    with wave.open(wav_filename, "wb") as wf:
+def save_mp3_file(wav, index):
+    buf = BytesIO()  # in-memory WAV; only the final MP3 is written to disk
+    with wave.open(buf, "wb") as wf:
         wf.setnchannels(1)  # Mono channel
         wf.setsampwidth(2)  # Sample width in bytes
         wf.setframerate(24000)  # Sample rate in Hz
         wf.writeframes(unsafe_float_to_int16(wav))
-    logger.info(f"Audio saved to {wav_filename}")
+    buf.seek(0, 0)  # rewind so wav2 reads the WAV data from the beginning
+    mp3_filename = f"output_audio_{index}.mp3"
+    with open(mp3_filename, "wb") as f:
+        wav2(buf, f, "mp3")  # transcode the buffered WAV into the MP3 file
+    logger.info(f"Audio saved to {mp3_filename}")
 
 
 def main(texts: list[str]):
@@ -42,7 +47,7 @@ def main(texts: list[str]):
     logger.info("Inference completed. Audio generation successful.")
     # Save each generated wav file to a local file
     for index, wav in enumerate(wavs):
-        save_wav_file(wav, index)
+        save_mp3_file(wav, index)
 
 
 if __name__ == "__main__":
diff --git a/examples/web/webui.py b/examples/web/webui.py
index a200ff6c4..a9dd2ce02 100644
--- a/examples/web/webui.py
+++ b/examples/web/webui.py
@@ -78,7 +78,7 @@ def main():
                 "Interrupt", scale=2, variant="stop", visible=False, interactive=False
             )
 
-        text_output = gr.Textbox(label="Output Text", interactive=False)
+        text_output = gr.Textbox(label="Output Text", interactive=False, show_copy_button=True)
 
         # 使用Gradio的回调功能来更新数值输入框
         voice_selection.change(
@@ -117,6 +117,7 @@ def make_audio(autoplay, stream):
                 streaming=stream,
                 interactive=False,
                 show_label=True,
+                format="mp3",
             )
             text_output.change(
                 text_output_listener,
diff --git a/requirements.txt b/requirements.txt
index 939480904..81163b20b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ pybase16384
 pynini==2.1.5; sys_platform == 'linux'
 WeTextProcessing; sys_platform == 'linux'
 nemo_text_processing; sys_platform == 'linux'
+av
diff --git a/tools/audio/__init__.py b/tools/audio/__init__.py
index fc55f41fe..14566107f 100644
--- a/tools/audio/__init__.py
+++ b/tools/audio/__init__.py
@@ -1 +1,2 @@
 from .np import unsafe_float_to_int16
+from .av import wav2
diff --git a/tools/audio/av.py b/tools/audio/av.py
new file mode 100644
index 000000000..747dc0e34
--- /dev/null
+++ b/tools/audio/av.py
@@ -0,0 +1,36 @@
+from io import BufferedWriter, BytesIO
+from typing import Dict
+
+import av
+
+
+video_format_dict: Dict[str, str] = {  # requested format -> av.open container format
+    "m4a": "mp4",
+}
+
+audio_format_dict: Dict[str, str] = {  # container format -> codec name for add_stream
+    "ogg": "libvorbis",
+    "mp4": "aac",
+}
+
+
+def wav2(i: BytesIO, o: BufferedWriter, format: str):
+    """Transcode the audio read from `i` into `format` and write it to `o`.
+
+    `format` is first mapped through `video_format_dict` to get the container
+    format given to `av.open`, then through `audio_format_dict` to get the
+    codec name given to `add_stream`; unmapped names are used unchanged.
+
+    Adapted from https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/412a9950a1e371a018c381d1bfb8579c4b0de329/infer/lib/audio.py#L20
+    """
+    container_format = video_format_dict.get(format, format)
+    codec_name = audio_format_dict.get(container_format, container_format)
+    # Context managers close both containers even if decoding/encoding raises.
+    with av.open(i, "r") as inp, av.open(o, "w", format=container_format) as out:
+        ostream = out.add_stream(codec_name)
+        for frame in inp.decode(audio=0):
+            for p in ostream.encode(frame):
+                out.mux(p)
+        # Encoding None flushes the packets still buffered in the encoder.
+        for p in ostream.encode(None):
+            out.mux(p)