-
Notifications
You must be signed in to change notification settings - Fork 133
/
runLauncher.sh
executable file
·86 lines (73 loc) · 2.79 KB
/
runLauncher.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/bin/bash
# Copyright xmuspeech (Author:Snowdar 2020-03-08)
#
# Run a python launcher script, optionally on several GPUs.
# usage: runLauncher.sh <launcher> [launcher-options]
#
# Defaults. They are declared before parse_options.sh is sourced.
# NOTE(review): parse_options.sh is presumably a Kaldi-style parser that
# overrides these variables from leading "--name value" arguments - confirm.
#
# First and last stage to run. Stages up to 3 are executed through the
# (possibly multi-GPU) training command; stage 4 runs in a separate python3
# process afterwards.
stage=3
endstage=4
# Multi-GPU backend, consulted only when more than one GPU id is given.
# "ddp" and "horovod" are the values this script accepts.
multi_gpu_solution="ddp"
# Exported as OMP_NUM_THREADS when the ddp backend is used.
omp_num_threads=1
# Master port for ddp; 0 means a free port is looked up at run time.
port=0
. subtools/parse_options.sh
# NOTE(review): path.sh presumably sets up the environment (PATH etc.) - confirm.
. subtools/path.sh
# The first positional argument is the launcher script; the remaining
# arguments stay in "$@" for the option handling that follows.
# Use a numeric comparison for the argument count (a bare `<` inside [[ ]]
# compares strings).
if [[ $# -lt 1 ]]; then
  echo "[exit] Num of parameters is zero, expected a launcher."
  echo "usage: $0 <launcher> [launcher-options]"
  echo "e.g. $0 subtools/pytorch/launcher/runSnowdarXvector-voxceleb1.py --gpu-id=0,1,2"
  exit 1
fi

launcher=$1
shift

# The path is quoted on purpose: unquoted, an empty value or one containing
# spaces makes the test misparse its arguments and the missing-file check
# would be skipped silently.
if [[ ! -f "$launcher" ]]; then
  echo "Expected $launcher (*.py) to exist."
  exit 1
fi
# Split the command-line words that follow the launcher into options that
# are consumed by this script and options that are forwarded to the launcher.
#
# NOTE: forwarded options are joined into one space-separated string that is
# word-split again when the launcher is invoked, so a value containing spaces
# or quote characters will not reach python as a single argument.
#
# Arguments: the words following the launcher path.
# Globals written:
#   launcher_options    string of options forwarded to the launcher
#   num_gpu             number of non-empty ids in --gpu-id (1 if not given)
#   multi_gpu_solution  value of --multi-gpu-solution= (also forwarded)
#   port                value of --port= (also forwarded)
#   stage, endstage     values of --stage= / --endstage= (not forwarded)
parse_launcher_args() {
  local arg id
  local -a gpu_ids
  launcher_options=""
  num_gpu=1
  for arg in "$@"; do
    case "$arg" in
      --gpu-id=*)
        # "0 1 2" is accepted as well as "0,1,2": spaces become commas so the
        # option survives as one word.
        arg=${arg// /,}
        launcher_options="$launcher_options $arg"
        # Ids may be separated by ',' or '-'; empty fields are not counted.
        # Counting is done in the shell, so it does not depend on GNU sed's
        # "\n" replacement or on the output format of wc.
        num_gpu=0
        IFS=',-' read -r -a gpu_ids <<< "${arg#--gpu-id=}"
        for id in "${gpu_ids[@]}"; do
          if [[ -n "$id" ]]; then
            num_gpu=$((num_gpu + 1))
          fi
        done
        ;;
      --multi-gpu-solution=*)
        multi_gpu_solution=${arg#--multi-gpu-solution=}
        launcher_options="$launcher_options $arg"
        ;;
      --port=*)
        port=${arg#--port=}
        launcher_options="$launcher_options $arg"
        ;;
      --stage=*)
        stage=${arg#--stage=}
        ;;
      --endstage=*)
        endstage=${arg#--endstage=}
        ;;
      *)
        launcher_options="$launcher_options $arg"
        ;;
    esac
  done
}

parse_launcher_args "$@"
# All positional parameters have been handled; drop them, as a shift-based
# loop would have done.
set --
# Add multi-gpu case.
#
# Choose the command prefix used to start the launcher for training.
#
# Globals read:    num_gpu, multi_gpu_solution, omp_num_threads, port
# Globals written: train_cmd; port (only when it was "0" and ddp is used)
# Exports:         HOROVOD_CACHE_CAPACITY (horovod), OMP_NUM_THREADS (ddp)
# Exits with status 1 when the horovod check fails, when no free port can be
# obtained, or when multi_gpu_solution is not supported.
select_train_cmd() {
  # Zero or one GPU: run the launcher with plain python3.
  if [[ "$num_gpu" -le 1 ]]; then
    train_cmd="python3"
    return 0
  fi

  case "$multi_gpu_solution" in
    horovod)
      sh subtools/pytorch/launcher/multi_gpu/check_horovod.sh || exit 1
      # Set cache for synchronize batchnorm to avoid WARNING.
      export HOROVOD_CACHE_CAPACITY=0
      train_cmd="horovodrun -np $num_gpu python3"
      ;;
    ddp)
      export OMP_NUM_THREADS=$omp_num_threads
      if [[ "$port" == "0" ]]; then
        # Port 0 means "pick one for me". Stop here if the helper fails or
        # prints nothing, rather than launching with an empty --master_port.
        port=$(python3 subtools/pytorch/launcher/multi_gpu/get_free_port.py) || port=""
        if [[ -z "$port" ]]; then
          echo "[exit] Failed to get a free port for ddp training."
          exit 1
        fi
        # launcher_options="$launcher_options --port $port"
      fi
      train_cmd="python3 -m torch.distributed.launch --nproc_per_node=$num_gpu --master_port=$port"
      # train_cmd="torchrun --nproc_per_node=$num_gpu"
      ;;
    *)
      echo "[exit] Do not support $multi_gpu_solution solution for multi-GPU training."
      exit 1
      ;;
  esac
}

select_train_cmd
# The work is split into two python runs so that the GPU memory held by the
# model is released when the first python process exits; that memory is then
# available for extracting x-vectors in stage 4.
if [[ "$stage" -le 3 && "$endstage" -ge "$stage" ]]; then
  # Cap the first run at stage 3 using a separate variable. Overwriting
  # endstage itself would make the stage-4 check below always fail whenever
  # this branch has run (e.g. with the defaults stage=3, endstage=4).
  train_endstage=$endstage
  if [[ "$train_endstage" -gt 3 ]]; then
    train_endstage=3
  fi
  # $train_cmd and $launcher_options are left unquoted on purpose: both are
  # space-separated word lists that must be split into separate arguments.
  $train_cmd "$launcher" $launcher_options --stage="$stage" --endstage="$train_endstage" || exit 1
fi

# Stage 4 always runs with plain python3 and without the forwarded options.
if [[ "$stage" -le 4 && "$endstage" -ge 4 ]]; then
  python3 "$launcher" --stage=4 || exit 1
fi

exit 0