#!/bin/bash
# '''
# =================================
# Slurm only handles allocating the initial resources; all management is handled by Ray.
# Launch one "task" on each node with all the GPUs you want.
# This starts a Ray instance on each node with all of the allocated GPUs.
# Ray handles the GPU work allocation internally.
# =================================
# '''
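# Usage (illustrative): update the partition, paths, and conda environment below,
# then submit this template with `sbatch ray_slurm_template.sh`.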
#SBATCH -p gpu_test
#SBATCH -t 00:05:00
#SBATCH --job-name=ray
### Number of total nodes requested
#SBATCH --nodes=1
### Number of total tasks requested
#SBATCH --ntasks=1
### Leave this at 1 (one Ray instance per node)
#SBATCH --tasks-per-node=1
### GPU per node requested
#SBATCH --gres=gpu:2
### CPUs per node requested. Leave at least 1 free for the Ray worker process itself
#SBATCH --cpus-per-task=10
### Memory requested
#SBATCH --mem=200G
#SBATCH --chdir=/n/home11/nswood/HPC_Parallel_Computing/
#SBATCH --output=slurm_monitoring/%x-%j.out
### GPUs per task (must match --gres above)
export SLURM_GPUS_PER_TASK=2
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
### UPDATE WITH YOUR CONFIGURATIONS
source ~/.bashrc
source /n/holystore01/LABS/iaifi_lab/Users/nswood/mambaforge/etc/profile.d/conda.sh
conda activate tune_env
# conda activate top_env
if [[ "$head_node_ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
head_node_ip=${ADDR[2]}
else
head_node_ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi
port=6322
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"
redis_password=$(uuidgen)
echo "redis_password: "$redis_password
nodeManagerPort=6600
objectManagerPort=6601
rayClientServerPort=11001
redisShardPorts=6602
minWorkerPort=11002
maxWorkerPort=19999
# ray disable-usage-stats
echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
ray start --head --node-ip-address="$head_node_ip" \
--port=$port \
--node-manager-port=$nodeManagerPort \
--object-manager-port=$objectManagerPort \
--ray-client-server-port=$rayClientServerPort \
--redis-shard-ports=$redisShardPorts \
--min-worker-port=$minWorkerPort \
--max-worker-port=$maxWorkerPort \
--redis-password=$redis_password \
--num-cpus "${SLURM_CPUS_PER_TASK}" \
--num-gpus "${SLURM_GPUS_PER_TASK}" \
--temp-dir="/n/home11/nswood/HPC_Parallel_Computing/log" \
--block &
sleep 10
echo "IP Head: $ip_head"
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --redis-password=$redis_password \
        --num-cpus "${SLURM_CPUS_PER_TASK}" \
        --num-gpus "${SLURM_GPUS_PER_TASK}" \
        --block &
    sleep 5
done
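# Optional sanity check (illustrative): before launching the driver, confirm that
# every node has joined the cluster.
# ray status --address="$ip_head"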
# python -u Ray/simpler-trainer.py "$SLURM_CPUS_PER_TASK"
# python -u Ray/ray_hyp_tune.py --gpu_per_trial 1 --cpu_per_trial 2
python -u Ray/pytorch_lightning_ray.py --gpu_per_trial 1 --cpu_per_trial 2
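# Note (assumption about the driver scripts above): the Python entry point is expected
# to attach to the cluster started here, typically via ray.init(address="auto") when
# run on a node of this allocation; adapt this to your own script.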