-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsky_train.sh
executable file
·128 lines (114 loc) · 3.25 KB
/
sky_train.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env bash
# Option defaults; the command-line flags parsed further down may override them.
skip_setup='false'
script_name='train.py'
cluster_name='zh-tw-model-trainer'
cfg='default'
# Usage function
#######################################
# Print the command-line help text.
# Globals:
#   cfg, script_name, cluster_name, skip_setup (read) - their current values
#   are shown as the defaults in the help text.
# Arguments:
#   None ($0 is used as the program name).
# Outputs:
#   Help text on stdout.
#######################################
usage() {
  echo "Usage: $0 <train_name> [-c|--cfg CONFIG_NAME] [-r|--script_name SCRIPT_NAME] [-n|--cluster_name CLUSTER_NAME] [-s|--skip_setup]"
  echo
  echo "Arguments:"
  echo " <train_name> The train to run, required."
  echo " -c, --cfg Config name to use, defaults to '$cfg'."
  # -r/--script_name is accepted by the argument parser, so it is listed here.
  echo " -r, --script_name Script name to use, defaults to '$script_name'."
  echo " -n, --cluster_name Cluster name to use, defaults to '$cluster_name'."
  echo " -s, --skip_setup Skips setup, defaults to $skip_setup."
  echo
  echo " --help Display this help message and exit."
}
# A bare invocation cannot succeed: report it, show the help text, and stop.
[ "$#" -ne 0 ] || {
  echo "Error: No arguments provided."
  usage
  exit 1
}
# Parse arguments
#######################################
# Parse command-line arguments into the script's option variables.
# Globals:
#   cfg, script_name, cluster_name, skip_setup, train_name (written)
# Arguments:
#   The script's command-line arguments ("$@").
# Outputs:
#   On invalid input, an error message followed by the usage text on stdout.
# Returns:
#   0 when every argument was consumed. Exits 0 after --help; exits 1 on a
#   missing option value, an unknown option, or an extra positional argument.
#######################################
parse_args() {
  local key
  while [[ $# -gt 0 ]]; do
    key="$1"
    case "$key" in
      -c|--cfg|-r|--script_name|-n|--cluster_name)
        # These options consume the next word; without this check a trailing
        # option would silently store an empty value.
        if [[ $# -lt 2 ]]; then
          echo "Error: Option '$key' requires a value."
          usage
          exit 1
        fi
        case "$key" in
          -c|--cfg) cfg="$2" ;;
          -r|--script_name) script_name="$2" ;;
          -n|--cluster_name) cluster_name="$2" ;;
        esac
        shift 2 # past argument and value
        ;;
      -c=*|--cfg=*)
        cfg="${key#*=}"
        shift # past argument
        ;;
      -r=*|--script_name=*)
        script_name="${key#*=}"
        shift # past argument
        ;;
      -n=*|--cluster_name=*)
        cluster_name="${key#*=}"
        shift # past argument
        ;;
      -s|--skip_setup)
        skip_setup="true"
        shift # past argument
        ;;
      --help)
        usage
        exit 0
        ;;
      -*)
        # Reject unrecognized flags instead of mistaking them for <train_name>.
        echo "Error: Unknown option '$key'"
        usage
        exit 1
        ;;
      *)
        # The first bare word is <train_name>; any further one is an error.
        if [[ -z $train_name ]]; then
          train_name="$key"
          shift # past argument
        else
          echo "Error: Unknown option '$key'"
          usage
          exit 1
        fi
        ;;
    esac
  done
}

parse_args "$@"
# The positional <train_name> is mandatory; stop with the help text if it is empty.
[[ -n $train_name ]] || {
  echo "Error: train_name not provided."
  usage
  exit 1
}
# Run the script with the provided arguments
# Work from the directory containing this script so the relative paths used
# below (train_check_config.py, copy_files_to_sky_workdir.sh,
# sky_training.yaml) do not depend on the caller's working directory.
BASEDIR=$(cd "$(dirname "$0")" && pwd) || {
  echo "Error: cannot resolve the script directory." >&2
  exit 1
}
cd "$BASEDIR" || {
  echo "Error: cannot cd to '$BASEDIR'." >&2
  exit 1
}

# Stop before touching the cluster when the config check reports failure.
# NOTE(review): assumes train_check_config.py exits non-zero only on a real
# problem - confirm against that script.
if ! python train_check_config.py "$train_name" --cfg="$cfg"; then
  echo "Error: train_check_config.py failed for '$train_name' (cfg: '$cfg')." >&2
  exit 1
fi

echo "Using cluster name: $cluster_name"
echo "Skip setup: $skip_setup"
echo ""

# Optional preparation hook: only run when the file exists next to this script.
if [ -f "$BASEDIR/sky_prepare.sh" ]; then
  echo "Running sky_prepare.sh"
  bash "$BASEDIR/sky_prepare.sh"
  echo ""
fi

./copy_files_to_sky_workdir.sh
echo ""

# Read the credentials once; both the exec and the launch path forward the
# same values. A missing file yields an empty value, as before.
#
# The awk program finds the "machine api.wandb.ai" line in ~/.netrc and prints
# the password from the lines that follow it, stopping at a blank line, the
# next "machine" line, or end of file (the getline result is checked so a
# truncated entry cannot loop forever).
wandb_api_key="$(awk -v machine="api.wandb.ai" '
  BEGIN {RS="\n"; FS="\n"}
  $1 == "machine " machine {
    while ((getline) > 0 && $0 != "" && $0 !~ /^machine/) {
      if ($0 ~ /^ *password/) {sub(/^ *password */, "", $0); print $0; exit}
    }
  }' ~/.netrc)"
hf_token="$(tr -d '\n' < ~/.cache/huggingface/token)"

# Environment passed to the task; kept in an array so values with spaces
# survive as single arguments.
sky_env_args=(
  --env SCRIPT_NAME="$script_name"
  --env WANDB_API_KEY="$wandb_api_key"
  --env HUGGING_FACE_HUB_TOKEN="$hf_token"
  --env CFG="$cfg"
  --env TRAIN_NAME="$train_name"
)

if [ "$skip_setup" == "true" ]; then
  # Run on the already-provisioned cluster.
  sky exec "$cluster_name" sky_training.yaml "${sky_env_args[@]}"
else
  # NOTE(review): the pause presumably leaves time to abort with Ctrl-C
  # before provisioning starts - confirm the intent.
  sleep 8
  sky launch sky_training.yaml "${sky_env_args[@]}" \
    -c "$cluster_name" \
    --retry-until-up \
    -y
fi