From c0066106be7149b9d6fd14cdf396edd55c5c2a13 Mon Sep 17 00:00:00 2001
From: Sofer Athlan-Guyot
Date: Fri, 26 Jul 2024 13:35:04 +0200
Subject: [PATCH] Introduce control plane testing.

Just before starting the update and for the entire update, this
continuously:
 1. creates a vm;
 2. attaches a volume (optional) and a floating IP (fip) to it;
 3. connects to it over ssh;
 4. destroys it;
 5. restarts from 1.

This gives a good level of confidence that the OpenStack API is still
reachable during the update.
---
 roles/update/README.md                        |   1 +
 roles/update/defaults/main.yml                |   2 +
 roles/update/files/continuous-test.sh         | 198 ++++++++++++++++++
 roles/update/tasks/create_test_files.yml      |  72 +++++--
 roles/update/tasks/main.yml                   |  16 +-
 .../templates/control_plane_test_start.sh.j2  |  12 ++
 .../templates/control_plane_test_stop.sh.j2   |  70 +++++++
 .../templates/workload_launch_k8s.sh.j2       |   8 +
 8 files changed, 358 insertions(+), 21 deletions(-)
 create mode 100644 roles/update/files/continuous-test.sh
 create mode 100644 roles/update/templates/control_plane_test_start.sh.j2
 create mode 100644 roles/update/templates/control_plane_test_stop.sh.j2
 create mode 100644 roles/update/templates/workload_launch_k8s.sh.j2

diff --git a/roles/update/README.md b/roles/update/README.md
index bcf8af1e1e..18f7e0ca47 100644
--- a/roles/update/README.md
+++ b/roles/update/README.md
@@ -11,5 +11,6 @@ Role to run update
 * `cifmw_update_create_volume`: (Boolean) Attach a volume to the test OS instance when set to true. Default to `False`
 * `cifmw_update_ping_loss_second` : (Integer) Number of seconds that the ping test is allowed to fail. Default to `0`. Note that 1 packet loss is always accepted to avoid false positive.
 * `cifmw_update_ping_loss_percent` : (Integer) Maximum percentage of ping loss accepted. Default to `0`. Only relevant when `cifmw_update_ping_loss_second` is not 0.
+* `cifmw_update_control_plane_check`: (Boolean) Activate continuous control plane testing. Default to `False`
 
 ## Examples
diff --git a/roles/update/defaults/main.yml b/roles/update/defaults/main.yml
index c290812a69..2fda964805 100644
--- a/roles/update/defaults/main.yml
+++ b/roles/update/defaults/main.yml
@@ -43,3 +43,5 @@ cifmw_update_ping_test: false
 cifmw_update_create_volume: false
 cifmw_update_ping_loss_second: 0
 cifmw_update_ping_loss_percent: 0
+
+cifmw_update_control_plane_check: false
diff --git a/roles/update/files/continuous-test.sh b/roles/update/files/continuous-test.sh
new file mode 100644
index 0000000000..0bdef5f956
--- /dev/null
+++ b/roles/update/files/continuous-test.sh
@@ -0,0 +1,198 @@
+#!/bin/bash
+set -eu
+## ---------------------------------------------------------------------
+## NAME:
+##     continuous-test.sh - run a script in a loop and gather the results.
+##
+## SYNOPSIS
+##     continuous-test.sh [OPTION] [SCRIPT]
+##
+## DESCRIPTION
+##     Run SCRIPT and collect date, time and exit status.
+##
+##     The SCRIPT will be continuously run until we get a SIGTERM or
+##     SIGUSR1 signal.  When the signal is caught, we wait for the last
+##     run to end and then write the done file.
+##
+##     The output of each command will be saved into "ct-PID/" under
+##     the output directory (see -o).
+##
+##     A pid file (see -p) will register the pid of the
+##     running process.
+##
+## OPTIONS
+##     -d  Enable debug mode.
+##     -l PREFIX  Prefix used for:
+##           - Logfile: Default to DIR/continuous-test-PID.log
+##           - Done file: Default to DIR/continuous-test-PID.done
+##
+##         The logfile will hold the result of each command run and the
+##         done file indicates that the last run is finished when we want
+##         to end the continuous test.
+##
+##         Both those files will have the PID added to the prefix so that
+##         multiple commands can be run in parallel if needed.
+##
+##         The pid can be found in the PIDFILE.
+##
+##     -p PIDFILE  Save the PID to that file.
+##         Default to DIR/continuous-test.pid
+##
+##     -o DIR  Directory where to save all those files.  Default to
+##         the directory where continuous-test.sh is.
+##
+## FILES
+##
+##     DIR/continuous-test.pid will hold the pid of the process
+##     DIR/continuous-test-PID.log has the result of each run
+##     DIR/ct-PID/ will hold the output of each command.
+##
+## ENVIRONMENT
+##     CT_SCRIPT_ARGS  A string holding any argument that should
+##                     be passed to SCRIPT.
+##
+## AUTHOR
+##     Athlan-Guyot Sofer
+## ---------------------------------------------------------------------
+FILE=$(basename "$0")
+
+CT_PARENT=${CT_PARENT:-true}
+CT_CHILD=${CT_CHILD:-false}
+
+CT_STOP=false
+
+## ---------------------------------------------------------------------
+## Function definitions.
+process_sig() {
+    echo "$$: received term signal" >&2
+    CT_STOP=true
+}
+
+process_sigterm_parent() {
+    echo "$$: Parent received term signal" >&2
+    if [ -n "${CT_PID:-}" ]; then
+        echo "$$: received term signal: killing $CT_PID" >&2
+        kill -s USR1 "$CT_PID"
+    else
+        # Should not happen.
+        echo "$$: received term signal: killing group" >&2
+        kill -s USR1 0
+    fi
+}
+
+# Daemonize the process. This will fork a process and detach it from the
+# console (stdin, stdout, stderr) after setting the environment from the options.
+if "${CT_PARENT}"; then
+    export DEBUG=false
+    while getopts :p:l:o:d OPT; do
+        case $OPT in
+            l|+l)
+                CT_PREFIX="$OPTARG"
+                ;;
+            p|+p)
+                CT_PIDFILE="$OPTARG"
+                ;;
+            o|+o)
+                CT_DIR="$OPTARG"
+                ;;
+            d|+d)
+                DEBUG=true
+                ;;
+            *)
+                echo "usage: ${0##*/} [-l PREFIX] [-p PIDFILE] [-o DIR] [-d] SCRIPT" >&2
+                exit 2
+        esac
+    done
+    shift $(( OPTIND - 1 ))
+    OPTIND=1
+    if [ -z "${CT_DIR:-}" ]; then
+        CT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+    fi
+    export CT_DIR
+    if $DEBUG; then
+        export CT_TTY=$(tty)
+    else
+        export CT_TTY=/dev/null
+    fi
+    exec 2>"$CT_TTY"
+    echo "entering parent $$ $FILE" >&2
+    export CT_SCRIPT_ARGS=${CT_SCRIPT_ARGS:-""}
+    export CT_SCRIPT="${@:?'SCRIPT cannot be empty.'}"
+    export CT_PREFIX="${CT_PREFIX:-}"
+    export CT_PIDFILE="${CT_PIDFILE:-}"
+    export CT_CHILD=true
+    export CT_PARENT=false
+    setsid "${CT_DIR}/${FILE}" "$@" < /dev/null > "$CT_TTY" 2> "$CT_TTY" &
+    CT_PID=$!
+    if $DEBUG ; then
+        trap process_sigterm_parent SIGTERM SIGINT
+        wait $CT_PID
+        echo "leaving parent $$ after waiting for $CT_PID/$FILE" >&2
+    else
+        echo "leaving parent $$ $FILE" >&2
+    fi
+    sync
+    exit 0
+fi
+
+if "${CT_CHILD}"; then
+    if [ -n "${CT_TTY:-}" ]; then
+        exec 2> "${CT_TTY}"
+        exec 1> "${CT_TTY}"
+    else
+        CT_TTY=/dev/null
+    fi
+    echo "entering child $$ running $FILE" >&2
+    if [ -z "${CT_PREFIX}" ]; then
+        CT_LOGFILE="${CT_DIR}/continuous-test-$$.log"
+    else
+        CT_LOGFILE="${CT_DIR}/${CT_PREFIX}-$$.log"
+    fi
+    if [ -z "${CT_PIDFILE}" ]; then
+        CT_PIDFILE="${CT_DIR}/continuous-test.pid"
+    fi
+    export CT_LOGFILE
+    export CT_PIDFILE
+    export CT_CMD_OUT_DIR="${CT_DIR}/ct-$$"
+    trap process_sig SIGTERM SIGUSR1
+    export CT_CHILD=false
+    export CT_PARENT=false
+    echo $$ > "${CT_PIDFILE}"
+    # Main loop: re-run this file in "run SCRIPT once" mode until a signal sets CT_STOP.
+    while ! $CT_STOP; do
+        setsid "${CT_DIR}/${FILE}" "$@" < /dev/null > "${CT_TTY}"
+    done
+    echo "Leaving child $$ running $FILE" >&2
+    if [ -z "${CT_PREFIX}" ]; then
+        CT_ENDFILE="${CT_DIR}/continuous-test-$$.done"
+    else
+        CT_ENDFILE="${CT_DIR}/${CT_PREFIX}-$$.done"
+    fi
+    date > "$CT_ENDFILE"
+    sync
+    exit 0
+fi
+
+exec >>"$CT_LOGFILE"
+mkdir -p "${CT_CMD_OUT_DIR}"
+echo "entering loop $$ $CT_SCRIPT" >&2
+# We cannot have two jobs in the same second, or else we will
+# overwrite the file. sleep 1 prevents this.
+sleep 1
+start_time="$(date +%s)"
+start_time_h="$(date -d@${start_time})"
+echo -n "${start_time_h} (${start_time}) "
+set +e
+"${CT_SCRIPT}" ${CT_SCRIPT_ARGS} &>> "${CT_CMD_OUT_DIR}/${start_time}.log"
+RC="${?}"
+set -e
+end_time="$(date +%s)"
+duration=$((end_time - start_time))
+echo -n "${duration}s "
+
+if [ $RC -eq 0 ]; then
+    echo "SUCCESS (0)"
+else
+    echo "FAILED (${RC})"
+fi
+echo "leaving loop $$" >&2
diff --git a/roles/update/tasks/create_test_files.yml b/roles/update/tasks/create_test_files.yml
index b1d466f127..e4118a2014 100644
--- a/roles/update/tasks/create_test_files.yml
+++ b/roles/update/tasks/create_test_files.yml
@@ -14,26 +14,58 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-- name: Ensure update log directory exists.
-  ansible.builtin.file:
-    path: "{{ cifmw_update_artifacts_basedir }}"
-    state: directory
-    mode: "0755"
+- name: Update testing related files
+  when: ( cifmw_update_ping_test | bool ) or ( cifmw_update_control_plane_check | bool )
+  block:
+    - name: Ensure update log directory exists.
+      ansible.builtin.file:
+        path: "{{ cifmw_update_artifacts_basedir }}"
+        state: directory
+        mode: "0755"
+    - name: Create workload launch script
+      ansible.builtin.template:
+        src: "workload_launch.sh.j2"
+        dest: "{{ cifmw_update_workload_launch_script }}"
+        mode: "0775"
 
-- name: Create workload launch script
-  ansible.builtin.template:
-    src: "workload_launch.sh.j2"
-    dest: "{{ cifmw_update_workload_launch_script }}"
-    mode: "0775"
+- name: Ping test related files
+  when: cifmw_update_ping_test | bool
+  block:
+    - name: Create start l3 agent connectivity check scripts
+      ansible.builtin.template:
+        src: "l3_agent_start_ping.sh.j2"
+        dest: "{{ cifmw_update_ping_start_script }}"
+        mode: "0775"
 
-- name: Create start l3 agent connectivity check scripts
-  ansible.builtin.template:
-    src: "l3_agent_start_ping.sh.j2"
-    dest: "{{ cifmw_update_ping_start_script }}"
-    mode: "0775"
+    - name: Create stop l3 agent connectivity check scripts
+      ansible.builtin.template:
+        src: "l3_agent_stop_ping.sh.j2"
+        dest: "{{ cifmw_update_ping_stop_script }}"
+        mode: "0775"
 
-- name: Create stop l3 agent connectivity check scripts
-  ansible.builtin.template:
-    src: "l3_agent_stop_ping.sh.j2"
-    dest: "{{ cifmw_update_ping_stop_script }}"
-    mode: "0775"
+- name: Control plane testing related files
+  when: cifmw_update_control_plane_check | bool
+  block:
+    - name: Create control plane wrapper
+      ansible.builtin.copy:
+        src: "continuous-test.sh"
+        dest: "{{ cifmw_update_artifacts_basedir }}/continuous-test.sh"
+        mode: "0775"
+
+    - name: Create control plane start script
+      ansible.builtin.template:
+        src: "control_plane_test_start.sh.j2"
+        dest: "{{ cifmw_update_artifacts_basedir }}/control_plane_test_start.sh"
+        mode: "0775"
+
+    - name: Create control plane stop script
+      ansible.builtin.template:
+        src: "control_plane_test_stop.sh.j2"
+        dest: "{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh"
+        mode: "0775"
+
+    - name: Create control plane workload launch wrapper
+      ansible.builtin.template:
+        src: "workload_launch_k8s.sh.j2"
+        dest: "{{ cifmw_update_artifacts_basedir }}/workload_launch_k8s.sh"
+        mode: "0775"
diff --git a/roles/update/tasks/main.yml b/roles/update/tasks/main.yml
index 004fc390a0..6fbd74e4a3 100644
--- a/roles/update/tasks/main.yml
+++ b/roles/update/tasks/main.yml
@@ -16,7 +16,6 @@
 
 - name: Create the support files for test
   ansible.builtin.include_tasks: create_test_files.yml
-  when: cifmw_update_ping_test | bool
 
 - name: Trigger the ping test
   when:
@@ -30,6 +29,14 @@
     - name: Start ping test
       ansible.builtin.include_tasks: l3_agent_connectivity_check_start.yml
 
+- name: Trigger the continuous control plane test
+  when:
+    - cifmw_update_control_plane_check | bool
+    - not cifmw_update_run_dryrun | bool
+  ansible.builtin.shell: |
+    {{ cifmw_update_artifacts_basedir }}/control_plane_test_start.sh
+
+
 - name: Set openstack_update_run Makefile environment variables
   tags:
     - always
@@ -60,3 +67,10 @@
   when:
     - cifmw_update_ping_test | bool
     - not cifmw_update_run_dryrun | bool
+
+- name: Stop the continuous control plane test
+  when:
+    - cifmw_update_control_plane_check | bool
+    - not cifmw_update_run_dryrun | bool
+  ansible.builtin.shell: |
+    {{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
diff --git a/roles/update/templates/control_plane_test_start.sh.j2 b/roles/update/templates/control_plane_test_start.sh.j2
new file mode 100644
index 0000000000..9271c5ca3f
--- /dev/null
+++ b/roles/update/templates/control_plane_test_start.sh.j2
@@ -0,0 +1,12 @@
+#!/bin/bash
+#
+# Script to test control plane by creating a vm in a loop during the
+# update. Start sequence.
+set -eu
+BASE_DIR="${1:-{{ cifmw_update_artifacts_basedir }}}"
+
+continuous_test_wrapper="${BASE_DIR}/continuous-test.sh"
+
+if [ -e "${continuous_test_wrapper}" ]; then
+    "${continuous_test_wrapper}" -o "${BASE_DIR}" -l control-plane-test -p "${BASE_DIR}/control-plane-test.pid" "${BASE_DIR}/workload_launch_k8s.sh"
+fi
diff --git a/roles/update/templates/control_plane_test_stop.sh.j2 b/roles/update/templates/control_plane_test_stop.sh.j2
new file mode 100644
index 0000000000..9eb3648c2a
--- /dev/null
+++ b/roles/update/templates/control_plane_test_stop.sh.j2
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Script to stop a previously started control plane testing.
+# Get the pid, kill it and wait for the end of the last run.
+set -eu
+
+MAX_CONS_FAIL=${1:-2}
+MAX_FAIL=${2:-3}
+BASE_DIR="${3:-{{ cifmw_update_artifacts_basedir }}}"
+STOP_MAX_TRIES=${4:-60}     # 5 seconds x MAX_TRIES = 5 min by default
+
+pid_file="${BASE_DIR}/control-plane-test.pid"
+
+if [ ! -e "${pid_file}" ]; then
+    echo "Not pid file: ${pid_file}"
+    exit 1
+fi
+
+PID=$(cat "${pid_file}")
+
+done_file=${BASE_DIR}/control-plane-test-${PID}.done
+result_file=${BASE_DIR}/control-plane-test-${PID}.log
+
+kill "${PID}"
+
+current_try=0
+until [ -e "${done_file}" ]; do
+    if [ $current_try -lt "${STOP_MAX_TRIES}" ]; then
+        sleep 5
+        current_try=$((current_try+1))
+    else
+        echo "Waited to long for ${PID} to finish. Aborting."
+        exit 1
+    fi
+done
+
+# Verify that we didn't get any workload issue.
+FAILURE=""
+# Longest run of successive failures in the result file.
+successive_failure=$(
+    awk 'BEGIN{fail=0; max=0}
+         # A FAILED line extends the current run; a SUCCESS line ends it.
+         NF>1 && $(NF-1)=="FAILED"{fail++; if (fail > max){max = fail}}
+         /SUCCESS/{fail=0}
+         END{print max}' "${result_file}"
+)
+if [ "${successive_failure}" -gt "${MAX_CONS_FAIL}" ]; then
+    echo "Max number of consecutive control plane failure (${MAX_CONS_FAIL}) reached."
+    echo "Found ${successive_failure} consecutive failures during update."
+    grep FAILED "${result_file}"
+    FAILURE="true"
+fi
+# Total number of failure
+failures=$(grep -Fc FAILED "${result_file}" ||:) # prevents exit 1 when no match
+if [ "${failures}" -gt "${MAX_FAIL}" ]; then
+    echo "Max number of control plan failure (${MAX_FAIL}) reached."
+    echo "Found ${failures} failures during update."
+    grep FAILED "${result_file}"
+    FAILURE="true"
+fi
+
+if [ -n "${FAILURE}" ]; then
+    echo "Concaneted files in ${BASE_DIR}/control-plane-testing-detailed.log"
+    tail -n +1 "${BASE_DIR}/ct-${PID}"/*.log > "${BASE_DIR}/control-plane-testing-detailed.log"
+    exit 1
+else
+    echo "$(date) No (or not enough) failure(s) during control plane testing"
+    echo "Successive failure: ${successive_failure}/${MAX_CONS_FAIL}"
+    echo "Total number of failures: ${failures}/${MAX_FAIL}"
+fi
diff --git a/roles/update/templates/workload_launch_k8s.sh.j2 b/roles/update/templates/workload_launch_k8s.sh.j2
new file mode 100644
index 0000000000..18179455f1
--- /dev/null
+++ b/roles/update/templates/workload_launch_k8s.sh.j2
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+
+export KUBECONFIG="{{ cifmw_openshift_kubeconfig }}"
+export PATH="{{ cifmw_path }}"
+
+oc rsh -n openstack openstackclient env WKL_MODE=sanityfast bash \
+    < "{{ cifmw_update_artifacts_basedir }}/workload_launch.sh"