-
Notifications
You must be signed in to change notification settings - Fork 79
/
node-mark-offline
150 lines (137 loc) · 5.45 KB
/
node-mark-offline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/bin/bash
#
# LBNL Node Health Check -- Node Offlining Helper
#
# Michael Jennings <[email protected]>
# 16 September 2011
#
# This script is a simple resource manager wrapper (pbsnodes, scontrol,
# badmin) that the node health check can run in the background to mark
# nodes offline.  It will first obtain the current node state
# information to avoid overwriting notes which were not placed by NHC.
# If these checks pass, the node is marked offline with the note
# supplied.
#
# Usage: node-mark-offline <hostname> [note words...]
#
# Environment:
#   NHC_RM             Resource manager to drive: pbs, slurm, lsf or sge.
#   IGNORE_EMPTY_NOTE  Set to 1 to re-mark a node that is already offline
#                      but carries no note (default: 0, leave it alone).
#   PBSNODES, PBSNODES_LIST_ARGS, PBSNODES_OFFLINE_ARGS
#   SLURM_SINFO, SLURM_SCONTROL, SLURM_SC_OFFLINE_ARGS
#   LSF_BHOSTS, LSF_BADMIN, LSF_OFFLINE_ARGS
#                      Override the commands/arguments used per RM.
#
# Exit status: 0 when nothing needs to be (or can be) done; 1 when no
# hostname was given for pbs/slurm/lsf; 255 for an unsupported NHC_RM.
# When a node is actually marked, this process is replaced (exec) by the
# resource manager command, so that command's status is what is returned.

IGNORE_EMPTY_NOTE="${IGNORE_EMPTY_NOTE:-0}"
LEADER="NHC:"

# Log the invocation (timestamp, script path, raw arguments).
echo "$(date '+%Y%m%d %H:%M:%S') $0 $*"

HOSTNAME="$1"
shift
NOTE="$*"

# split_words TEXT
#   Split TEXT on whitespace into the global array LINE.  Pathname
#   expansion is switched off for the duration of the split so that a
#   note containing glob characters (e.g. "*" or "sd?") is kept verbatim
#   instead of being replaced by matching file names from the current
#   directory.
split_words() {
    set -f
    LINE=( $1 )
    set +f
}

# The three RMs below act on a specific node; without a hostname the
# query and offline commands would run with no target at all.
case "$NHC_RM" in
    pbs|slurm|lsf)
        if [[ -z "$HOSTNAME" ]]; then
            echo "$0: No hostname supplied.  Usage: $0 <hostname> [note...]" >&2
            exit 1
        fi
        ;;
esac

### PBS (TORQUE)
if [[ "$NHC_RM" == "pbs" ]]; then
    PBSNODES="${PBSNODES:-pbsnodes}"
    PBSNODES_LIST_ARGS="${PBSNODES_LIST_ARGS:--n -l all}"
    PBSNODES_OFFLINE_ARGS="${PBSNODES_OFFLINE_ARGS:--o -N}"

    # $PBSNODES and the *_ARGS variables stay unquoted on purpose: they
    # hold a command plus options and must word-split.
    split_words "$($PBSNODES $PBSNODES_LIST_ARGS "$HOSTNAME")"
    # Parsed as: word 1 = state, word 2 = first word of the note (the
    # "leader"), words 3+ = remainder of the note.
    STATUS="${LINE[1]}"
    OLD_NOTE_LEADER="${LINE[2]}"
    OLD_NOTE="${LINE[*]:3}"

    case "$STATUS" in
        *down*|*offline*|*unknown*)
            # If the node is already offline, and there is no old note, and
            # we've not been told to ignore that, do not touch the node.
            if [[ "$STATUS" == *offline* && -z "$OLD_NOTE_LEADER" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
                echo "$0: Not offlining $HOSTNAME: Already offline with no note set."
                exit 0
            fi
            # If there's an old note that wasn't set by NHC, preserve it.
            if [[ -n "$OLD_NOTE_LEADER" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi
            ;;
    esac

    echo "$0: Marking $STATUS $HOSTNAME offline: $LEADER $NOTE"
    exec $PBSNODES $PBSNODES_OFFLINE_ARGS "$LEADER $NOTE" "$HOSTNAME"

### Slurm
elif [[ "$NHC_RM" == "slurm" ]]; then
    SLURM_SINFO="${SLURM_SINFO:-sinfo}"
    SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
    SLURM_SC_OFFLINE_ARGS="${SLURM_SC_OFFLINE_ARGS:-update State=DRAIN}"

    SINFO_OUTPUT="$($SLURM_SINFO -o '%t %E' -hn "$HOSTNAME")"
    # Only the first output line is parsed.  NOTE(review): sinfo appears
    # to print one line per partition the node belongs to -- confirm; any
    # additional lines would otherwise be folded into OLD_NOTE and written
    # back as part of the reason.
    IFS= read -r SINFO_LINE <<< "$SINFO_OUTPUT"
    split_words "$SINFO_LINE"
    # Parsed as: word 0 = state, word 1 = first word of the reason (the
    # "leader"; "none" is treated below as no reason), words 2+ = the rest.
    STATUS="${LINE[0]}"
    OLD_NOTE_LEADER="${LINE[1]}"
    OLD_NOTE="${LINE[*]:2}"

    case "$STATUS" in
        *'@'*|*'#'*|boot*|*-*|plnd*)
            # These states aren't handled yet.
            echo "$0: State \"$STATUS\" not yet handled; ignoring."
            exit 0
            ;;
        alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*)
            case "$STATUS" in
                drain*|drng*|fail*|maint*)
                    # If the node is already offline, and there is no old
                    # note, and we've not been told to ignore that, do not
                    # touch the node.
                    if [[ "$OLD_NOTE_LEADER" == "none" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
                        echo "$0: Not offlining $HOSTNAME: Already offline with no note set."
                        exit 0
                    fi
                    ;;
            esac
            # If there's an old note that wasn't set by NHC, preserve it.
            if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi
            echo "$0: Marking $STATUS $HOSTNAME offline: $LEADER $NOTE"
            exec $SLURM_SCONTROL $SLURM_SC_OFFLINE_ARGS "NodeName=$HOSTNAME" "Reason=$LEADER $NOTE"
            ;;
        down*)
            echo "$0: Not changing state of down node $HOSTNAME."
            ;;
        *)
            echo "$0: Not sure how to handle node state \"$STATUS\" on $HOSTNAME"
            ;;
    esac

### IBM Platform LSF
elif [[ "$NHC_RM" == "lsf" ]]; then
    LSF_BHOSTS="${LSF_BHOSTS:-bhosts}"
    LSF_BADMIN="${LSF_BADMIN:-badmin}"
    LSF_OFFLINE_ARGS="${LSF_OFFLINE_ARGS:-hclose -C}"

    STATUS=""
    OLD_NOTE_LEADER=""
    OLD_NOTE=""

    # Break the bhosts report into lines (newline-only IFS), again with
    # pathname expansion disabled.  The array is deliberately not called
    # LINES, which bash itself maintains as the terminal height.
    BHOSTS_OUTPUT="$($LSF_BHOSTS -l "$HOSTNAME")"
    set -f
    IFS=$'\n'
    BHOSTS_LINES=( $BHOSTS_OUTPUT )
    IFS=$' \t\n'
    set +f

    for ((i = 0; i < ${#BHOSTS_LINES[@]}; i++)); do
        split_words "${BHOSTS_LINES[i]}"
        if [[ "${LINE[0]}" == "STATUS" ]]; then
            # The line after the "STATUS ..." header row carries the
            # values; its first word is the host status.
            ((i++))
            split_words "${BHOSTS_LINES[i]}"
            STATUS="${LINE[0]}"
        elif [[ "${LINE[0]}" == "ADMIN" && "${LINE[2]}" == "COMMENT:" ]]; then
            # Everything after "COMMENT:" is the comment.  Strip the
            # enclosing double quotes from the comment as a whole before
            # splitting it, so one-word ("foo") and empty ("") comments
            # do not leave a stray quote in the leader.
            COMMENT="${LINE[*]:3}"
            COMMENT=${COMMENT#\"}
            COMMENT=${COMMENT%\"}
            OLD_NOTE_LEADER="${COMMENT%% *}"
            if [[ "$COMMENT" == *" "* ]]; then
                OLD_NOTE="${COMMENT#* }"
            fi
            break
        fi
    done

    case "$STATUS" in
        ok|closed*)
            # If the node is already offline, and there is no old note, and
            # we've not been told to ignore that, do not touch the node.
            if [[ "$STATUS" == "closed_Adm" && -z "$OLD_NOTE_LEADER" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
                echo "$0: Not offlining $HOSTNAME: Already offline with no note set."
                exit 0
            fi
            # If there's an old note that wasn't set by NHC, preserve it.
            if [[ -n "$OLD_NOTE_LEADER" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi
            echo "$0: Marking $STATUS $HOSTNAME offline: $LEADER $NOTE"
            exec $LSF_BADMIN $LSF_OFFLINE_ARGS "$LEADER $NOTE" "$HOSTNAME"
            ;;
    esac

### Sun Grid Engine (and variants)
elif [[ "$NHC_RM" == "sge" ]]; then
    echo "$0: No additional node marking necessary for SGE and variants."

### Everything else is unsupported.
else
    echo "$0: Unsupported RM detected in $0: \"$NHC_RM\""
    # 255 is the status bash reports for "exit -1"; spelled out explicitly.
    exit 255
fi

exit 0