-
Notifications
You must be signed in to change notification settings - Fork 252
/
Copy pathmcloud_train.yaml
27 lines (27 loc) · 1.03 KB
/
mcloud_train.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Ultravox training configuration
# Run name for this training job.
name: ultravox-train
# Container image for the job. NOTE(review): `latest` is a floating tag, so
# the image contents can change between runs — consider pinning a version.
image: mosaicml/composer:latest
# Compute resources requested for the run.
compute:
  # Number of GPUs; the launch command starts 8 processes per node to match.
  gpus: 8
  cluster: r15z1p1
# Integrations applied before the command runs: a single git checkout.
integrations:
  - integration_type: git_repo
    git_repo: fixie-ai/ultravox
    # Branch is taken from the UV_BRANCH environment variable.
    # NOTE(review): assumes the launcher expands `$UV_BRANCH` here — confirm.
    git_branch: $UV_BRANCH
    pip_install: poetry==1.7.1
# Scheduling limits for the run.
scheduling:
  max_duration: 6  # 6 hours max for jobs to avoid hanging jobs
# Shell command for the job. The folded scalar (`>-`) joins the lines below
# with single spaces into one `&&` chain, so each step runs only if the
# previous one succeeded:
#   1. enter the checked-out repository,
#   2. install dependencies with poetry (without dev dependencies),
#   3. run the prefetch_weights helper with $TRAIN_ARGS,
#   4. launch training via torchrun with 8 processes per node.
command: >-
  cd ultravox &&
  poetry install --no-dev &&
  poetry run python -m ultravox.training.helpers.prefetch_weights $TRAIN_ARGS &&
  poetry run torchrun --nproc_per_node=8 -m ultravox.training.train $TRAIN_ARGS
# Environment variables exported to the job. Values that look like numbers
# are quoted so they reach the environment as strings.
env_variables:
  MLFLOW_TRACKING_URI: databricks
  # Branch of the repository to check out.
  UV_BRANCH: main
  # Arguments passed to both the weight-prefetch step and the training step.
  TRAIN_ARGS: --config_path ultravox/training/configs/release_config.yaml
  HF_HUB_DOWNLOAD_TIMEOUT: "300"  # Set timeout to 300 seconds (5 minutes)
  HF_HUB_ENABLE_HF_TRANSFER: "1"
  # RAM Efficient Loading: only the first process loads the pretrained model
  # checkpoint while all other processes have empty weights. Only applicable
  # for Transformers. Forcibly sets `sync_module_states` to `True`.
  FSDP_CPU_RAM_EFFICIENT_LOADING: "1"