Skip to content

Commit

Permalink
cluster: better tools
Browse files Browse the repository at this point in the history
  • Loading branch information
JanSkalny committed Oct 31, 2024
1 parent 45d34ad commit 4610420
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 173 deletions.
250 changes: 123 additions & 127 deletions cluster/files/cluster-balance.sh
Original file line number Diff line number Diff line change
@@ -1,151 +1,147 @@
#!/bin/bash

# tunables
# MIN_OBSERVE_TIME: starting value of the health count-down; the cluster must
# pass this many consecutive one-second checks before a migration proceeds.
# NOTE(review): it is assigned twice in a row; the second value (5) wins.
MIN_OBSERVE_TIME=3 #XXX: 10
MIN_OBSERVE_TIME=5
# MAX_OBSERVE_TIME: maximum number of one-second health checks before giving up.
MAX_OBSERVE_TIME=300
# MAX_MIGRATE_TIME: maximum number of one-second polls while watching a migration.
MAX_MIGRATE_TIME=300
# AUTO_MIGRATE_TIME: length (seconds) of the count-down printed before an
# unattended migration starts.
AUTO_MIGRATE_TIME=5
# WAIT_AFTER_MIGRATION: seconds to sleep after a migration before continuing.
WAIT_AFTER_MIGRATION=10

# Print all arguments, joined by single spaces, as one line on stderr.
# printf is used instead of echo so that a message that is exactly an echo
# option (e.g. "-n" or "-e") is printed verbatim instead of being swallowed.
# Arguments: $@ - message words
# Outputs:   the message on stderr; nothing on stdout
warn() {
  printf '%s\n' "$*" >&2
}

# Print all arguments, joined by single spaces, as one line on stderr and
# terminate the script with exit status 1.
# printf is used instead of echo so that a message that is exactly an echo
# option (e.g. "-n" or "-e") is printed verbatim instead of being swallowed.
# Arguments: $@ - message words
# Outputs:   the message on stderr; nothing on stdout
fail() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Inspect the output of `crm status` for signs that the cluster is not settled.
# Outputs: a single marker character (no trailing newline) on stdout when a
#          problem is seen:
#            f - a line matches "failed" (case-insensitive)
#            m - a line contains "Migrating"
#            o - a line contains "Monitoring"
#          The checks are made in that order; only the first match is reported.
# Returns: 1 when a marker was printed, 0 otherwise.
# NOTE: as before, empty output from `crm status` is reported as healthy (0).
check_status() {
  local status
  # One snapshot for all three checks, so they judge the same cluster state
  # and `crm status` is only run once per call.
  status=$(crm status)

  if grep -qi "failed" <<<"$status"; then
    echo -n "f"
    return 1
  fi

  if grep -q "Migrating" <<<"$status"; then
    echo -n "m"
    return 1
  fi

  if grep -q "Monitoring" <<<"$status"; then
    echo -n "o"
    return 1
  fi

  return 0
}
# get common functions from cluster-tools.sh
# (wait_for_healthy_cluster and the cluster_vm_* helpers used below are not
# defined in this file; presumably they come from here - TODO confirm)
. /usr/local/bin/cluster-tools.sh

# defaults: prompt before every migration, and really perform migrations
ASK_FOR_CONFIRMATION=true
SIMULATE=false

# parse arguments
#   -a  do not ask for confirmation (count-down instead)
#   -s  simulation mode: print the move command instead of running it
# NOTE(review): the case statement appears twice back-to-back with identical
# arms, so every option is processed twice with the same result.
while getopts ":as" opt; do
case ${opt} in
a) ASK_FOR_CONFIRMATION=false ;;
s) SIMULATE=true ;;
\?) fail "Invalid option: -$OPTARG" ;;
esac
case ${opt} in
a) ASK_FOR_CONFIRMATION=false ;;
s) SIMULATE=true ;;
\?) fail "Invalid option: -$OPTARG" ;;
esac
done
shift $((OPTIND -1))

# adjust timers when running simulation
if [[ "$SIMULATE" == true ]]; then
echo "Running in simulation mode"
MIN_OBSERVE_TIME=1
WAIT_AFTER_MIGRATION=3
fi

# get list of all cluster nodes except arbiter:
# take the text inside "Online: [ ... ]" from crm status, split it on commas
# and spaces, and drop empty entries and any entry containing "arbiter".
# NOTE(review): NODES and ALL_NODES are filled by the same pipeline.
NODES=($( crm status | grep -oP 'Online: \[\K[^\]]+' | tr ', ' '\n' | grep -v '^$' | grep -v arbiter ))
ALL_NODES=( $( crm status | grep -oP 'Online: \[\K[^\]]+' | tr ', ' '\n' | grep -v '^$' | grep -v arbiter ) )
# running index used for round-robin selection of migration targets
NODE_INDEX=0

# make sure we have some target nodes
[ ${#NODES[@]} -eq 0 ] && fail "No valid migration targets!"
# make sure cluster is healthy
wait_for_healthy_cluster

# balance VMs for each service group
# Group names are the distinct values found between '>' and '<' on lines
# containing "service_group" in /var/lib/virtual/conf/*.xml.
# NOTE(review): the loop body below holds two passes over the same group
# members: first a FIRST_NODE/NODES round-robin pass with an inline health
# count-down, then a usage-ratio pass built on the cluster_* helpers. This
# looks like pre- and post-refactor code side by side - confirm which one is
# intended to run.
for GROUP in $( grep service_group /var/lib/virtual/conf/*.xml | cut -d '>' -f 2 | cut -d '<' -f 1 | sort | uniq ); do

# node hosting the first running member seen in this group (empty = none yet)
FIRST_NODE=""
echo "Solving service group $GROUP..."

# visit every XML config file that belongs to this service group
for XML_FILE in $( grep "<service_group>$GROUP</service_group>" /var/lib/virtual/conf/*.xml -l ); do

# make sure cluster is healthy
echo -n "Checking cluster status..."
FAILS=$MIN_OBSERVE_TIME
ATTEMPTS=0
while [[ $FAILS -gt 0 ]]; do
((ATTEMPTS++))
((FAILS--))

sleep 1
check_status
if [[ $? -eq 1 ]]; then
# failure resets count-down to MIN_OBSERVE_TIME
FAILS=$MIN_OBSERVE_TIME
else
echo -n "."
fi

# terminate after MAX_OBSERVE_TIME checks (one per second)
[[ $ATTEMPTS -ge $MAX_OBSERVE_TIME ]] && fail " UNCLEAN!"
done
echo " OK"


# figure out vm name and uuid
# NAME is the text of /domain/metadata/fqdn in the XML file
NAME=$(cat "$XML_FILE" | xmllint --xpath '/domain/metadata/fqdn/text()' - 2>/dev/null)
# NOTE(review): $VM is only assigned a few lines further down, so this
# message shows the previous iteration's VM (or nothing on the first one).
[ -z "$NAME" ] && fail "$VM does not have a fqdn defined in its metadata!"

#XXX: fixme for short uuids
# resource id is "vm-" plus the part of /domain/uuid before the first '-'
UUID=$( cat "$XML_FILE" | xmllint --xpath 'string(/domain/uuid/text())' - | cut -d '-' -f 1)
VM="vm-${UUID}"

# last word of the crm status line that mentions this VM and "Started"
ACTIVE_NODE=$( crm status | grep "$VM" | grep Started | rev | awk '{print $1}' | rev )

if [ "$ACTIVE_NODE" == "" ]; then
warn "VM not running! ($VM)"
continue
fi

echo "Group member $NAME ($VM) running on $ACTIVE_NODE"

# the first running member stays where it is; every online non-arbiter
# node other than its host becomes a migration target
if [ "$FIRST_NODE" == "" ]; then
FIRST_NODE="$ACTIVE_NODE"
NODES=($( crm status | grep -oP 'Online: \[\K[^\]]+' | tr ', ' '\n' | grep -v '^$' | grep -v arbiter | grep -v "$ACTIVE_NODE" ))
echo "Leave member running on $FIRST_NODE"
echo "Migration targets are ${NODES[@]}"
continue
fi

# a later member sharing the first member's node gets the next target,
# chosen round-robin from NODES
# NOTE(review): when the nodes differ, NEXT_NODE is not updated and there is
# no `continue`, so the steps below still run with its previous value.
if [ "$FIRST_NODE" == "$ACTIVE_NODE" ]; then
NEXT_NODE="${NODES[NODE_INDEX++ % ${#NODES[@]}]}"
echo "Migration required to $NEXT_NODE"
fi

if [[ "$ASK_FOR_CONFIRMATION" == true ]]; then
# confirm migration
read -p "Press Enter to continue..."
else
# count down AUTO_MIGRATE_TIME seconds and then continue
echo -n "Will migrate $NAME to $NEXT_NODE in"
for i in $(seq "$AUTO_MIGRATE_TIME" -1 1); do
echo -n " $i"
sleep 1
done
echo ""
fi

if [[ "$SIMULATE" == true ]]; then
# simulate migration
echo "Simulated crm res move ${VM}_vm $NEXT_NODE"
sleep 1
else
# request migration
crm res move "${VM}_vm" $NEXT_NODE > /dev/null

# observe migration process: poll once per second, up to MAX_MIGRATE_TIME times
for I in $( seq 1 $MAX_MIGRATE_TIME ); do
sleep 1
# third column of the matching `virsh list` line
STATUS=$( virsh list | grep $VM | awk '{print $3}')
virsh list | grep $VM > /dev/null

# stop waiting once the VM no longer appears in the local `virsh list`
if [ $? -eq 1 ]; then
echo " migrated!"
echo ""
sleep $WAIT_AFTER_MIGRATION
break
fi
# progress marker: first character of the status column
echo -n "${STATUS:0:1}"
done
fi
done
echo ""
echo "Solving service group $GROUP..."

# VM_CNT counts the group's XML files (running or not); UNUSED_NODES starts
# as every online non-arbiter node and is narrowed down in step 1
VM_CNT=0
UNUSED_NODES=( $( crm status | grep -oP 'Online: \[\K[^\]]+' | tr ', ' '\n' | grep -v '^$' | grep -v arbiter ) )

# step 1. identify which nodes are NOT running VMs from this group
for XML_FILE in $( grep "<service_group>$GROUP</service_group>" /var/lib/virtual/conf/*.xml -l ); do
VM_CNT=$(( $VM_CNT + 1 ))

# figure out vm name and uuid
NAME=$( cluster_vm_name_from_xml "$XML_FILE" )
VM=$( cluster_vm_id_from_xml "$XML_FILE" )

# figure out where is vm running
ACTIVE_NODE=$( cluster_vm_active_node "$VM" )
[ "$ACTIVE_NODE" == "" ] && warn "- $NAME ($VM) not running!" && continue

echo "- $NAME ($VM) running on $ACTIVE_NODE"
# drop the hosting node from the candidate list
# NOTE(review): the pattern is unquoted and matched as a substring regex, so
# "node1" would also remove "node10" - confirm node names cannot overlap.
UNUSED_NODES=($( echo "${UNUSED_NODES[@]}" | tr ' ' '\n' | grep -v $ACTIVE_NODE) )
#XXX: this produces empty strings in array :/
#UNUSED_NODES=("${UNUSED_NODES[@]/$ACTIVE_NODE}")
done

echo "Valid migration targets are:"
for NODE in "${UNUSED_NODES[@]}"; do
echo "- $NODE"
done

# reset node usage count (VMs of this group seen so far, per node)
declare -A USAGE
for NODE in ${ALL_NODES[@]}; do
USAGE[$NODE]=0
done

# calculate migration ratio
# RATIO is VM_CNT / NODE_CNT to two decimals; RATIO_N adds 0.99 and truncates
# to an integer, which in effect rounds RATIO up.
NODE_CNT=${#ALL_NODES[@]}
RATIO=$(echo "scale=2; $VM_CNT / $NODE_CNT" | bc)
RATIO_N=$(echo "scale=0; ($RATIO + 0.99) / 1" | bc)
echo "Migration ratio is $RATIO (rounded $RATIO_N)"

# step 2. if more than $RATIO_N vms are running, move them to empty nodes
for XML_FILE in $( grep "<service_group>$GROUP</service_group>" /var/lib/virtual/conf/*.xml -l ); do
# figure out vm name and uuid
NAME=$( cluster_vm_name_from_xml "$XML_FILE" )
VM=$( cluster_vm_id_from_xml "$XML_FILE" )

# figure out where is vm running
ACTIVE_NODE=$( cluster_vm_active_node "$VM" )
[ "$ACTIVE_NODE" == "" ] && continue

# count this VM against its node; only VMs beyond RATIO_N get evicted
USAGE[$ACTIVE_NODE]=$(( ${USAGE[$ACTIVE_NODE]} + 1 ))
if [ ${USAGE[$ACTIVE_NODE]} -gt $RATIO_N ]; then
# migration target is round-robin from nodes with no vms running
# NOTE(review): with an empty UNUSED_NODES the modulo divides by zero before
# the emptiness check on the next line can run - confirm and guard.
NEXT_NODE="${UNUSED_NODES[NODE_INDEX++ % ${#UNUSED_NODES[@]}]}"
[[ "$NEXT_NODE" == "" ]] && warn "NO MIGRATION TARGETS!!" && continue
echo "Evict $NAME ($VM) from $ACTIVE_NODE"
else
# skip migration if under migration ratio (#vms/#nodes)
echo "Leave $NAME ($VM) running on $ACTIVE_NODE"
continue
fi

if [[ "$ASK_FOR_CONFIRMATION" == true ]]; then
# confirm migration; "y", "Y" or an empty answer means yes
read -p "Move $NAME to $NEXT_NODE? [Y/n] " cont
if [[ $cont =~ ^[Yy]$ || $cont == "" ]]; then
# continue with migration
echo -n ""
else
# don't migrate
echo "VM $NAME ($VM) left running on $ACTIVE_NODE"
continue
fi
else
# count down AUTO_MIGRATE_TIME seconds and then continue
echo -n "Will migrate $NAME to $NEXT_NODE in"
for i in $(seq "$AUTO_MIGRATE_TIME" -1 1); do
echo -n " $i"
sleep 1
done
echo ""
fi

if [[ "$SIMULATE" == true ]]; then
# simulate migration
echo "Simulated crm res move ${VM}_vm $NEXT_NODE"
sleep 1
else
# make sure cluster is healthy
wait_for_healthy_cluster

# request migration (output and errors of crm are discarded)
echo "Start migration"
crm res move "${VM}_vm" $NEXT_NODE >/dev/null 2>/dev/null
sleep $WAIT_AFTER_MIGRATION

# make sure cluster is healthy (and wait for migration)
wait_for_healthy_cluster
fi
done
echo ""
done

# make sure cluster is clean when we're done
wait_for_healthy_cluster
52 changes: 6 additions & 46 deletions cluster/files/cluster-drain.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,7 @@ MAX_MIGRATE_TIME=300
AUTO_MIGRATE_TIME=5
WAIT_AFTER_MIGRATION=10

# Print all arguments, joined by single spaces, as one line on stderr.
# printf is used instead of echo so that a message that is exactly an echo
# option (e.g. "-n" or "-e") is printed verbatim instead of being swallowed.
# Arguments: $@ - message words
# Outputs:   the message on stderr; nothing on stdout
warn() {
  printf '%s\n' "$*" >&2
}

# Print all arguments, joined by single spaces, as one line on stderr and
# terminate the script with exit status 1.
# printf is used instead of echo so that a message that is exactly an echo
# option (e.g. "-n" or "-e") is printed verbatim instead of being swallowed.
# Arguments: $@ - message words
# Outputs:   the message on stderr; nothing on stdout
fail() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Inspect the output of `crm status` for signs that the cluster is not settled.
# Outputs: a single marker character (no trailing newline) on stdout when a
#          problem is seen:
#            f - a line matches "failed" (case-insensitive)
#            m - a line contains "Migrating"
#            o - a line contains "Monitoring"
#          The checks are made in that order; only the first match is reported.
# Returns: 1 when a marker was printed, 0 otherwise.
# NOTE: as before, empty output from `crm status` is reported as healthy (0).
check_status() {
  local status
  # One snapshot for all three checks, so they judge the same cluster state
  # and `crm status` is only run once per call.
  status=$(crm status)

  if grep -qi "failed" <<<"$status"; then
    echo -n "f"
    return 1
  fi

  if grep -q "Migrating" <<<"$status"; then
    echo -n "m"
    return 1
  fi

  if grep -q "Monitoring" <<<"$status"; then
    echo -n "o"
    return 1
  fi

  return 0
}
. /usr/local/bin/cluster-tools.sh

IGNORE_HOST=$( hostname -s )
ASK_FOR_CONFIRMATION=true
Expand Down Expand Up @@ -57,35 +37,15 @@ NODE_INDEX=0
echo "Migration targets are ${NODES[@]}"

for VM in $( virsh list | grep running | awk '{print $2}'); do
# observe cluster for at least 10 seconds and make sure everything is ok
echo -n "Checking cluster status..."
FAILS=$MIN_OBSERVE_TIME
ATTEMPTS=0
while [[ $FAILS -gt 0 ]]; do
((ATTEMPTS++))
((FAILS--))

sleep 1
check_status
if [[ $? -eq 1 ]]; then
# failure resets count-down to 10
FAILS=$MIN_OBSERVE_TIME
else
echo -n "."
fi

# terminate after 300 seconds
[[ $ATTEMPTS -ge $MAX_OBSERVE_TIME ]] && fail " UNCLEAN!"
done
echo " OK"
# make sure cluster is healthy
wait_for_healthy_cluster

# get VM name from config xml
UUID=$(virsh dumpxml "$VM" | xmllint --xpath 'string(/domain/uuid/text())' -)
[ -z "$UUID" ] && fail "$VM does not have uuid?"
XML_FILE=$(grep -l "<name>$VM</name>" /var/lib/virtual/conf/*.xml)
[ -f "$XML_FILE" ] || fail "$VM does not have XML file"
NAME=$(cat "$XML_FILE" | xmllint --xpath '/domain/metadata/fqdn/text()' - 2>/dev/null)
[ -z "$NAME" ] && fail "$VM does not have a fqdn defined in its metadata!"
NAME=$( cluster_vm_name_from_xml "$XML_FILE" )

# check if VM is defined in corosync
crm conf show | grep "${VM}_vm" > /dev/null
Expand Down Expand Up @@ -129,11 +89,11 @@ for VM in $( virsh list | grep running | awk '{print $2}'); do
sleep $WAIT_AFTER_MIGRATION
break
fi
echo -n "${STATUS:0:1}"
echo -n "status=$STATUS"
done

# if still running, migration failed
virsh list | grep $UUID > /dev/null
[ $? -eq 1 ] || echo " failed!"done
[ $? -eq 1 ] || echo " failed!"
fi
done
Loading

0 comments on commit 4610420

Please sign in to comment.