v0.2.0. hydra config for training
SWivid committed Nov 27, 2024
1 parent a72a097 commit 771007b
Showing 7 changed files with 69 additions and 66 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "0.1.2"
+version = "0.2.0"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
@@ -21,6 +21,7 @@ dependencies = [
     "datasets",
     "ema_pytorch>=0.5.2",
     "gradio>=3.45.2",
+    "hydra-core>=1.3.0",
     "jieba",
     "librosa",
     "matplotlib",
@@ -39,7 +40,6 @@ dependencies = [
     "vocos",
     "wandb",
     "x_transformers>=1.31.14",
-    "hydra-core>=1.3.0",
 ]
 
 [project.optional-dependencies]
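Since the dependency list now includes `hydra-core`, an existing editable install has to be refreshed to pull it in. A minimal sketch, assuming the repository root as the working directory:

```bash
# Re-resolve dependencies of the editable install so hydra-core gets installed.
pip install -e .
```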
46 changes: 23 additions & 23 deletions src/f5_tts/configs/E2TTS_Base_train.yaml
@@ -3,41 +3,41 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN # dataset name
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
-  epochs: 15 # max epochs
-  learning_rate: 7.5e-5 # learning rate
+  epochs: 15
+  learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0 # gradient clipping
-  bnb_optimizer: False # use bnb optimizer or not
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
-  name: E2TTS_Base # model name
-  tokenizer: pinyin # tokenizer type
+  name: E2TTS_Base
+  tokenizer: pinyin
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024 # model dimension
-    depth: 24 # number of transformer layers
-    heads: 16 # number of transformer heads
-    ff_mult: 4 # ff layer expansion
+    dim: 1024
+    depth: 24
+    heads: 16
+    ff_mult: 4
   mel_spec:
-    target_sample_rate: 24000 # target sample rate
-    n_mel_channels: 100 # mel channel
-    hop_length: 256 # hop length
-    win_length: 1024 # window length
-    n_fft: 1024 # fft length
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False # use local vocoder or not
-  local_vocoder_path: None # path to local vocoder
+  is_local_vocoder: False # use local offline vocoder ckpt or not
+  local_vocoder_path: None # path to local vocoder
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
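With the Hydra-driven entrypoint (see `train.py` below), any key in this config can be overridden at launch time using Hydra's `key=value` syntax. A sketch with illustrative values only, not recommended settings:

```bash
# Launch E2TTS_Base training, overriding a few config entries from the CLI.
# The override values below are placeholders for illustration.
accelerate launch src/f5_tts/train/train.py \
    --config-name E2TTS_Base_train.yaml \
    optim.epochs=1 \
    datasets.batch_size_per_gpu=19200 \
    ckpts.logger=tensorboard
```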
10 changes: 5 additions & 5 deletions src/f5_tts/configs/E2TTS_Small_train.yaml
@@ -5,9 +5,9 @@ hydra:
 datasets:
   name: Emilia_ZH_EN
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
   epochs: 15
@@ -37,7 +37,7 @@ model:
   local_vocoder_path: None
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
50 changes: 25 additions & 25 deletions src/f5_tts/configs/F5TTS_Base_train.yaml
@@ -3,43 +3,43 @@ hydra:
     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
 
 datasets:
-  name: Emilia_ZH_EN # dataset name
+  name: Emilia_ZH_EN # dataset name
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
-  epochs: 15 # max epochs
-  learning_rate: 7.5e-5 # learning rate
+  epochs: 15
+  learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0 # gradient clipping
-  bnb_optimizer: False # use bnb optimizer or not
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
-  name: F5TTS_Base # model name
-  tokenizer: pinyin # tokenizer type
+  name: F5TTS_Base # model name
+  tokenizer: pinyin # tokenizer type
   tokenizer_path: None # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
   arch:
-    dim: 1024 # model dim
-    depth: 22 # model depth
-    heads: 16 # model heads
-    ff_mult: 2 # feedforward expansion
-    text_dim: 512 # text encoder dim
-    conv_layers: 4 # convolution layers
+    dim: 1024
+    depth: 22
+    heads: 16
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
   mel_spec:
-    target_sample_rate: 24000 # target sample rate
-    n_mel_channels: 100 # mel channel
-    hop_length: 256 # hop length
-    win_length: 1024 # window length
-    n_fft: 1024 # fft length
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
     mel_spec_type: vocos # 'vocos' or 'bigvgan'
-  is_local_vocoder: False # use local vocoder or not
-  local_vocoder_path: None # local vocoder path
+  is_local_vocoder: False # use local offline vocoder ckpt or not
+  local_vocoder_path: None # local vocoder path
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
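Hydra's multirun mode can also sweep over values of these keys; the learning rates below are purely illustrative, and since this config only sets `hydra.run.dir`, sweep outputs fall back to Hydra's default multirun directory. Whether such a sweep is practical for full training runs depends on compute budget.

```bash
# Illustrative Hydra multirun sweep over two learning rates (plain python, single process).
# Values are placeholders; outputs land in Hydra's default multirun directory.
python src/f5_tts/train/train.py --multirun \
    --config-name F5TTS_Base_train.yaml \
    optim.learning_rate=5e-5,7.5e-5
```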
14 changes: 7 additions & 7 deletions src/f5_tts/configs/F5TTS_Small_train.yaml
@@ -5,17 +5,17 @@ hydra:
 datasets:
   name: Emilia_ZH_EN
   batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
-  batch_size_type: frame # "frame" or "sample"
+  batch_size_type: frame # "frame" or "sample"
   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-  num_workers: 16 # number of workers
+  num_workers: 16
 
 optim:
   epochs: 15
   learning_rate: 7.5e-5
   num_warmup_updates: 20000 # warmup steps
   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
-  max_grad_norm: 1.0
-  bnb_optimizer: False
+  max_grad_norm: 1.0 # gradient clipping
+  bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
 
 model:
   name: F5TTS_Small
@@ -39,7 +39,7 @@ model:
   local_vocoder_path: None
 
 ckpts:
-  logger: wandb # wandb | tensorboard | None
-  save_per_updates: 50000 # save checkpoint per steps
-  last_per_steps: 5000 # save last checkpoint per steps
+  logger: wandb # wandb | tensorboard | None
+  save_per_updates: 50000 # save checkpoint per steps
+  last_per_steps: 5000 # save last checkpoint per steps
   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
8 changes: 5 additions & 3 deletions src/f5_tts/train/README.md
@@ -2,9 +2,9 @@
 
 ## Prepare Dataset
 
-Example data processing scripts for Emilia and Wenetspeech4TTS, and you may tailor your own one along with a Dataset class in `src/f5_tts/model/dataset.py`.
+Example data processing scripts, and you may tailor your own one along with a Dataset class in `src/f5_tts/model/dataset.py`.
 
-### 1. Datasets used for pretrained models
+### 1. Some specific Datasets preparing scripts
 Download corresponding dataset first, and fill in the path in scripts.
 
 ```bash
@@ -38,7 +38,9 @@ Once your datasets are prepared, you can start the training process.
 # setup accelerate config, e.g. use multi-gpu ddp, fp16
 # will be to: ~/.cache/huggingface/accelerate/default_config.yaml
 accelerate config
-accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml # F5TTS_Base_train.yaml | E2TTS_Base_train.yaml
+
+# .yaml files are under src/f5_tts/configs directory
+accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml
 ```
 
 ### 2. Finetuning practice
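One way to sanity-check the composed configuration before starting a run is Hydra's `--cfg job` flag, which prints the composed job config and exits without training. A sketch, assuming the `@hydra.main` entrypoint shown in `train.py` below:

```bash
# Print the composed training config and exit; no training run is started.
python src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml --cfg job
```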
3 changes: 2 additions & 1 deletion src/f5_tts/train/train.py
@@ -1,4 +1,5 @@
 # training script.
+
 import os
 from importlib.resources import files
 
@@ -8,7 +9,7 @@
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
-os.chdir(str(files("f5_tts").joinpath("../..")))
+os.chdir(str(files("f5_tts").joinpath("../.."))) # change working directory to root of project (local editable)
 
 
 @hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
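Because the decorator leaves `config_name=None`, a config must be chosen on the command line with `--config-name`; a locally modified copy can also be picked up by adding its directory to Hydra's config search path. A sketch in which `/path/to/my_configs` and `my_F5TTS_train.yaml` are hypothetical:

```bash
# Select a config at launch time; --config-dir adds an extra directory to the
# config search path (directory and file name below are hypothetical).
python src/f5_tts/train/train.py \
    --config-dir /path/to/my_configs \
    --config-name my_F5TTS_train.yaml
```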
