-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstart_dask_rank_sunspot.sh
74 lines (63 loc) · 2.07 KB
/
start_dask_rank_sunspot.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/bin/bash
# Print the command-line synopsis on stderr and terminate with exit status 1.
usage() {
  printf '%s\n' "Usage: $0 [-g] <SCHEDULER|>" >&2
  exit 1
}
# Command-line flags:
#   -g   enable GPU mode (GPU=1; the default is GPU=0)
# Any other option prints the usage text and exits. Once parsing is done the
# consumed options are shifted away so that $1 is the first positional
# argument.
GPU=0
while getopts ":g" opt; do
  if [[ "${opt}" == "g" ]]; then
    GPU=1
  else
    usage
  fi
done
shift "$(( OPTIND - 1 ))"
# First positional argument. The value "SCHEDULER" selects the rank that
# purges the directories below (and, further down in this script, launches the
# Dask scheduler); any other value, or none, skips those steps.
ROLE=$1
# Bash normally provides HOSTNAME itself; fall back to hostname(1) if it is
# empty so the per-host log file names built from it are never blank.
HOSTNAME=${HOSTNAME:-$(hostname)}
#MAX_SYSTEM_MEMORY=$(free -m | awk '/^Mem:/{print $2}')M
#DEVICE_MEMORY_LIMIT="29GB"
#POOL_SIZE="31GB"
## A100 big mem
## DEVICE_MEMORY_LIMIT="70GB"
## POOL_SIZE="78GB"
# Used for writing scheduler file to shared storage
LOCAL_DIRECTORY=~/dask-local-directory
SCHEDULER_FILE="$LOCAL_DIRECTORY/scheduler.json"
LOGDIR="$LOCAL_DIRECTORY/logs"
# Directory handed to `dask worker --local-directory`.
WORKER_DIR="/tmp/dask-workers/"
# Value handed to `dask scheduler --dashboard-address`.
DASHBOARD_PORT=8787
# Purge Dask worker and log directories
if [[ "$ROLE" == "SCHEDULER" ]]; then
  # ${VAR:?} aborts instead of expanding to "/*" should a path ever be empty;
  # quoting keeps paths with whitespace intact while the glob still expands.
  rm -rf -- "${LOCAL_DIRECTORY:?}"/*
  mkdir -p -- "$LOGDIR"
  rm -rf -- "${WORKER_DIR:?}"/*
  mkdir -p -- "$WORKER_DIR"
fi
# Purge Dask config directories
# (presumably so stale user-level YAML cannot interfere with the settings
# exported below - confirm)
rm -rf -- ~/.config/dask
# Dask/distributed configuration
# Dask reads DASK_-prefixed environment variables as configuration: the name
# is lower-cased and each double underscore becomes one nesting level, e.g.
# DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT -> distributed.comm.timeouts.connect.
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="100s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MIN="1s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MAX="60s"
# Spelled in upper case like its siblings; because Dask lower-cases the name,
# this maps to the same key (distributed.worker.memory.terminate) as the
# previous mixed-case spelling.
export DASK_DISTRIBUTED__WORKER__MEMORY__TERMINATE="False"
# Fixed 2 s pause before anything is launched (presumably lets the directory
# purge earlier in this script settle on shared storage - confirm).
sleep 2
if [[ "$ROLE" == "SCHEDULER" ]]; then
  # Setup scheduler
  # The scheduler is started in the background under nohup; its stdout and
  # stderr go to a per-host log file, overwriting any previous content.
  scheduler_log="$LOGDIR/${HOSTNAME}-scheduler.log"
  # Built once as an array so every argument stays a single word even if a
  # path contains whitespace.
  scheduler_cmd=(
    dask scheduler
    --dashboard-address "$DASHBOARD_PORT"
    --scheduler-file "$SCHEDULER_FILE"
  )
  if (( GPU == 1 )); then
    # GPU mode: same command, with ZE_AFFINITY_MASK set to the PALS local rank
    # id and the GPU resource variable set in the process environment.
    ZE_AFFINITY_MASK=$PALS_LOCAL_RANKID DASK_DISTRIBUTED__WORKER__RESOURCES__GPU=1 \
      nohup "${scheduler_cmd[@]}" > "$scheduler_log" 2>&1 &
  else
    nohup "${scheduler_cmd[@]}" > "$scheduler_log" 2>&1 &
  fi
fi
# Fixed 5 s pause before the worker is started (presumably gives the scheduler
# time to come up and write $SCHEDULER_FILE - confirm).
sleep 5
# NOTE: the ROLE guard below is commented out, so every rank - including the
# SCHEDULER rank - goes on to start a worker.
#if [ "$ROLE" != "SCHEDULER" ]
#then
# Setup workers # --no-nanny
# The log directory is otherwise only created by the SCHEDULER rank's purge
# step; creating it here as well (idempotent) keeps the redirection below from
# failing on a rank that does not see it yet.
mkdir -p -- "$LOGDIR"
worker_log="$LOGDIR/${HOSTNAME}-worker.log"
# Built once as an array so every argument stays a single word even if a path
# contains whitespace.
worker_cmd=(
  dask worker
  --local-directory "${WORKER_DIR}"
  --scheduler-file "$SCHEDULER_FILE"
)
# The worker runs in the foreground, so the script blocks here until it exits;
# output is appended to the per-host worker log.
if (( GPU == 1 )); then
  # GPU mode: same command, with ZE_AFFINITY_MASK set to the PALS local rank
  # id and the GPU resource variable set in the process environment.
  ZE_AFFINITY_MASK=$PALS_LOCAL_RANKID DASK_DISTRIBUTED__WORKER__RESOURCES__GPU=1 \
    "${worker_cmd[@]}" >> "$worker_log" 2>&1
else
  "${worker_cmd[@]}" >> "$worker_log" 2>&1
fi
#fi