#!/bin/bash
# finetune.sh: launch fine-tuning of RDT-1B
# (forked from thu-ml/RoboticsDiffusionTransformer)
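# NCCL settings for multi-node training over InfiniBand: pin the HCA
# ports to use, keep IB enabled, send socket traffic over bond0, log at
# INFO verbosity, and disable NVLink SHARP (NVLS) collectives.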
export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=bond0
export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=0
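# Encoders: T5-XXL by hub ID, SigLIP from a local Hugging Face cache snapshot.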
export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
export VISION_ENCODER_NAME="/home/1ms.ai/hf/hf_cache/models--google--siglip-so400m-patch14-384/snapshots/9fdffc58afc957d1a03a25b10dba0329ab15c2a3/"
export OUTPUT_DIR="./checkpoints/rdt-finetune-1b_10-episode_xavier"
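# Build flags for compiled extensions; CUTLASS_PATH is a placeholder and
# must point at a local CUTLASS checkout if your build requires it.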
export CFLAGS="-I/usr/include"
export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
export CUTLASS_PATH="/path/to/cutlass"
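# Weights & Biases project for logging; train on the first four GPUs only.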
export WANDB_PROJECT="robotics_diffusion_transformer"
export CUDA_VISIBLE_DEVICES=0,1,2,3
if [ ! -d "$OUTPUT_DIR" ]; then
    mkdir -p "$OUTPUT_DIR"   # -p also creates the parent ./checkpoints dir if missing
    echo "Folder '$OUTPUT_DIR' created"
else
    echo "Folder '$OUTPUT_DIR' already exists"
fi
# To run on a single node/machine, launch with accelerate instead:
# accelerate launch main.py \
#     --deepspeed="./configs/zero2.json" \
#     ...
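# hostfile.txt lists the participating nodes in DeepSpeed's standard
# hostfile format, one "<hostname> slots=<num_gpus>" entry per line, e.g.:
#   node1 slots=4
#   node2 slots=4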
deepspeed --hostfile=hostfile.txt main.py \
--deepspeed="./configs/zero2.json" \
--pretrained_model_name_or_path="/home/1ms.ai/hf/hf_cache/models--robotics-diffusion-transformer--rdt-1b/snapshots/eb09036cc64ca4945051acbd1bd581d30a1d7711/" \
--pretrained_text_encoder_name_or_path=$TEXT_ENCODER_NAME \
--pretrained_vision_encoder_name_or_path=$VISION_ENCODER_NAME \
--output_dir=$OUTPUT_DIR \
--train_batch_size=8 \
--sample_batch_size=8 \
--max_train_steps=50 \
--checkpointing_period=25 \
--sample_period=500 \
--checkpoints_total_limit=40 \
--lr_scheduler="constant" \
--learning_rate=1e-4 \
--mixed_precision="bf16" \
--dataloader_num_workers=8 \
--image_aug \
--dataset_type="finetune" \
--state_noise_snr=40 \
--load_from_hdf5 \
--report_to=wandb \
--precomp_lang_embed
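# Note: max_train_steps=50 with checkpointing_period=25 looks like a quick
# smoke-test configuration; increase both for a full fine-tuning run.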
# Use this to resume training from a previous checkpoint:
# --resume_from_checkpoint="checkpoint-36000" \
# Use this to load precomputed language instruction embeddings
# instead of computing them during training (already enabled above):
# --precomp_lang_embed \