Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the steps to reboot the computes after update. #2587

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion roles/update/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,7 @@ Role to run update
* `cifmw_update_ping_loss_percent` : (Integer) Maximum percentage of ping loss accepted. Default to `0`. Only relevant when `cifmw_update_ping_loss_second` is not 0.
* `cifmw_update_control_plane_check`: (Boolean) Activate a continuous control plane testing. Default to `False`
* `cifmw_update_openstackclient_pod_timeout`: (Integer) Maximum number of seconds to wait for the openstackclient Pod to be available during control plane testing, as it is being restarted during update. Default to `10` seconds.

* `cifmw_update_reboot_test`: (Boolean) Activate the reboot test after update. Default to `True`.
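* `cifmw_update_ansible_ssh_private_key_file`: (String) Path to the private key file used to SSH to the compute nodes. Default to `ansible_ssh_private_key_file` when it is defined, otherwise `{{ ansible_user_dir }}/.ssh/id_cifw`.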
* `cifmw_update_wait_retries_reboot`: (Integer) Number of retries to wait for a compute node reboot. One retry is done every five seconds. Default to 60, so five minutes.
## Examples
4 changes: 4 additions & 0 deletions roles/update/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_s
cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh"

## User facing
# Run the compute reboot sequence once the update has finished.
cifmw_update_reboot_test: true
# Private key used to reach the compute nodes over SSH.
# NOTE(review): this is a folded block scalar, so the surrounding double
# quotes are part of the rendered value (the path arrives wrapped in literal
# quotes). Consumers must tolerate that - confirm before reusing this variable
# outside of a shell command line.
cifmw_update_ansible_ssh_private_key_file: >-
"{{ ansible_ssh_private_key_file | default(ansible_user_dir ~ '/.ssh/id_cifw') }}"
# Number of polling attempts made while waiting for a compute node reboot.
cifmw_update_wait_retries_reboot: 60

cifmw_update_ping_test: false
cifmw_update_create_volume: false
Expand Down
3 changes: 2 additions & 1 deletion roles/update/molecule/default/prepare.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
- role: ci_setup
- role: install_yamls
tasks:
- name: Set custom cifmw PATH reusable fact
# Facts shared with the role under test: the PATH that exposes the CRC
# binaries, and the reboot test switched off for this scenario.
- name: Set some reusable custom facts
  ansible.builtin.set_fact:
    cifmw_path: "{{ ansible_user_dir }}/.crc/bin:{{ ansible_user_dir }}/.crc/bin/oc:{{ ansible_user_dir }}/bin:{{ ansible_env.PATH }}"
    cifmw_update_reboot_test: false
    cacheable: true
5 changes: 5 additions & 0 deletions roles/update/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,8 @@
- not cifmw_update_run_dryrun | bool
ansible.builtin.shell: |
{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh

# Hand over to reboot_computes.yml only when the reboot test is enabled.
- name: Reboot the compute nodes
  when: cifmw_update_reboot_test | bool
  ansible.builtin.include_tasks:
    file: reboot_computes.yml
75 changes: 75 additions & 0 deletions roles/update/tasks/reboot_computes.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
---
# Every OpenStack CLI call in the reboot tasks goes through the
# openstackclient pod of the update namespace; keep the prefix in one fact.
- name: Define command for OpenStack client interactions
  ansible.builtin.set_fact:
    cifmw_update_openstack_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack

# Record the Host field of every cinder-volume service. Later tasks look for
# the string 'ceph' in this output to decide whether to block-migrate.
# The query is read-only, hence changed_when: false.
- name: Register storage backend type
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} volume service list -f json |
    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
  register: storage_backend
  changed_when: false

# Read-only query: the JSON hypervisor listing is kept in hypervisor_list.
- name: Get the list of OpenStack hypervisors
  register: hypervisor_list
  changed_when: false
  ansible.builtin.shell: |
    {{ cifmw_update_openstack_cmd }} hypervisor list -f json
  environment:
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"

# Keep only the 'Hypervisor Hostname' column of the JSON listing.
- name: Parse the hypervisor list to extract hostnames
  ansible.builtin.set_fact:
    hypervisor_hostnames: >-
      {{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}

# Render the SSH reachability monitor into the artifacts directory.
- name: Create a reboot monitor servers script
  ansible.builtin.template:
    mode: "0775"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
    src: "monitor_servers.sh.j2"

# Launch the monitor in the background and print its PID; the PID is kept in
# monitor_servers_job.stdout so the script can be stopped later.
# The POSIX redirection form is used because the shell module runs /bin/sh:
# with a non-bash sh, `&>` would background the command early and `$!` would
# not be the monitor's PID.
- name: Start the monitor servers script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh > /dev/null 2>&1 &
    echo $!
  register: monitor_servers_job

# Render the script that logs which hypervisor hosts the test instance.
- name: Create the VM placement monitor script
  ansible.builtin.template:
    src: "monitor_vm_placement.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
    mode: "0775"

# Launch the placement monitor in the background and print its PID; the PID
# is kept in monitor_placement_job.stdout so the script can be stopped later.
# POSIX redirection is used so that `$!` is the monitor's PID under any
# /bin/sh, not only bash.
- name: Start the monitor placement script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh > /dev/null 2>&1 &
    echo $!
  register: monitor_placement_job

# One pass of reboot_hypervisor.yml per hostname; the current hostname is
# exposed to the included tasks as `hypervisor`.
- name: Iterate over each hypervisor for the reboot sequence
  loop_control:
    loop_var: hypervisor
  loop: "{{ hypervisor_hostnames }}"
  ansible.builtin.include_tasks:
    file: reboot_hypervisor.yml

# Kill the background monitor using the PID recorded at start time.
# `kill -0` only probes for the process; POSIX redirection is used so the
# probe behaves the same under any /bin/sh.
- name: Stop the monitor servers script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_servers_job.stdout }} >/dev/null 2>&1; then
      kill {{ monitor_servers_job.stdout }}
    fi
  register: kill_result
  # rc 1 is tolerated: the process can still exit between `kill -0` and
  # `kill`, even if that race is unlikely.
  failed_when: kill_result.rc not in [0, 1]

# Kill the background placement monitor using the PID recorded at start time.
# POSIX redirection keeps the `kill -0` probe portable across /bin/sh flavours.
- name: Stop the monitor placement script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_placement_job.stdout }} >/dev/null 2>&1; then
      kill {{ monitor_placement_job.stdout }}
    fi
  register: kill_result
  # rc 1 is tolerated: the process may exit between the probe and the kill.
  failed_when: kill_result.rc not in [0, 1]
84 changes: 84 additions & 0 deletions roles/update/tasks/reboot_hypervisor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
---
# `hypervisor` is the loop variable supplied by the including task.
- name: Extract short hostname from FQDN
  ansible.builtin.set_fact:
    cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.') | first }}"

- name: Current stage
  ansible.builtin.debug:
    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"

# Prefix used to run a quoted command line through bash inside the
# openstackclient pod.
- name: Define command for nova interaction
  ansible.builtin.set_fact:
    cifmw_update_bash_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c

# List the IDs of the ACTIVE or PAUSED servers hosted on this hypervisor.
# The result (one ID per line) drives the evacuation and the migrate-back
# steps. KUBECONFIG and PATH are set like in the other `oc` based tasks so the
# command does not depend on the caller's shell environment.
- name: Check active VMs on hypervisor
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
  register: active_vms
  changed_when: false

# Live-evacuate the hypervisor with `nova host-evacuate-live`, run through
# cifmw_update_bash_cmd after sourcing cloudrc. The task only runs when the
# earlier active-VM check produced output. --block-migrate is added when the
# string 'ceph' is absent from storage_backend.stdout.
# NOTE(review): the two lines following `nova host-evacuate-live` are
# review-thread residue from the page capture, not part of the command.
- name: Evacuate VMs if they are running
environment:
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
ansible.builtin.shell: >-
{{ cifmw_update_bash_cmd }} ". cloudrc &&
nova host-evacuate-live
sathlan marked this conversation as resolved.
Show resolved Hide resolved
{% if 'ceph' not in storage_backend.stdout %}
--block-migrate
{% endif %}
{{ hypervisor }}"
when: active_vms.stdout != ''
changed_when: true

# Poll until no ACTIVE, PAUSED or MIGRATING server is left on the hypervisor.
# jq prints the number of matching servers; the count is compared exactly with
# "0" because a substring search would also accept 10, 20, 30 and so on.
# KUBECONFIG and PATH are set like in the other `oc` based tasks.
- name: Wait for compute node to get quiesced
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '[.[] | select(.Status |
    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
    | length'
  register: compute_node_instances
  until: compute_node_instances.stdout | trim == "0"
  retries: 30
  delay: 5
  changed_when: false
  when:
    - active_vms.stdout != ''

# Reboot first, then run the post-reboot service checks for the same host.
- name: Reboot the hypervisor using CR
  ansible.builtin.include_tasks:
    file: reboot_hypervisor_using_cr.yml

- name: Perform sanity checks post-reboot
  vars:
    current_hypervisor: "{{ hypervisor }}"
  ansible.builtin.include_tasks:
    file: reboot_hypervisor_sanity_checks.yml

# Announce each server ID that is about to be migrated back.
- name: Current stage
  loop: "{{ active_vms.stdout_lines }}"
  ansible.builtin.debug:
    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."

# For every server ID recorded in active_vms, request a live migration back to
# this hypervisor, then print the server's OS-EXT-SRV-ATTR:host attribute.
# The task is retried (30 attempts, 5 seconds apart) until that attribute
# contains the hypervisor name. --block-migrate is added when the string
# 'ceph' is absent from storage_backend.stdout.
# NOTE(review): the migration command and the `server show` pipeline are
# separated by `;`, so each retry issues the migration request again and only
# the final pipeline appears to decide the shell's exit status - confirm this
# is intended.
- name: Migrate back VMs post-reboot
environment:
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
ansible.builtin.shell: >-
set -o pipefail;
{{ cifmw_update_bash_cmd }} ". cloudrc &&
nova live-migration
{% if 'ceph' not in storage_backend.stdout %}
--block-migrate
{% endif %}
{{ item }} {{ hypervisor }}";
{{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
register: instance_migration_result
until: instance_migration_result.stdout.find(hypervisor) > -1
retries: 30
delay: 5
with_items: "{{ active_vms.stdout_lines }}"
when:
- active_vms.stdout != ''
50 changes: 50 additions & 0 deletions roles/update/tasks/reboot_hypervisor_sanity_checks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
# `current_hypervisor` is provided by the including task.
- name: Current stage
  ansible.builtin.debug:
    msg: |
      Testing the status of the services for {{ current_hypervisor }} after reboot.

# Retry (30 attempts, 5 seconds apart) until the nova-compute service of the
# host reports the state 'up'.
- name: Verify nova-compute service
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} compute service list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[] | select(.Binary | contains("nova-compute")) | .State'
  environment:
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
  register: nova_compute_status
  retries: 30
  delay: 5
  until: nova_compute_status.stdout == 'up'

# Retry (30 attempts, 5 seconds apart) until the ovn-controller agent of the
# host reports Alive as 'true'.
- name: Verify ovn-controller service
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[] | select(.Binary | contains("ovn-controller")) | .Alive'
  environment:
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
  register: ovn_controller_status
  retries: 30
  delay: 5
  until: ovn_controller_status.stdout == 'true'

# Retry (30 attempts, 5 seconds apart) until the neutron-ovn-metadata-agent of
# the host reports Alive as 'true'.
- name: Verify networking-ovn-metadata-agent
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[] | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
  environment:
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
  register: networking_ovn_metadata_status
  retries: 30
  delay: 5
  until: networking_ovn_metadata_status.stdout == 'true'
62 changes: 62 additions & 0 deletions roles/update/tasks/reboot_hypervisor_using_cr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
---
# Namespace-scoped `oc` prefix shared by the tasks of this file.
- name: Define command prefix for OpenShift operations
  ansible.builtin.set_fact:
    cifmw_update_oc_cmd_prefix: >-
      oc -n {{ cifmw_update_namespace }}

# List the OpenStackDataPlaneNodeSet names of the namespace and turn each one
# into a YAML list item (awk keeps the part after the '/' of `-o name`).
# The output is pasted as-is under `nodeSets:` in the generated CR.
# NOTE(review): the whitespace inside the awk prefix has to match the
# indentation of that list in the CR content; the captured text may have
# collapsed it - verify against the real file.
- name: Fetch NodeSets for the OpenStackDataPlaneDeployment
environment:
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
ansible.builtin.shell: >-
set -o pipefail;
{{ cifmw_update_oc_cmd_prefix }} get openstackdataplanenodeset -o name
| awk -F'/' '{print " - " $2}'
register: cifmw_update_node_sets
changed_when: false

# Timestamp suffix used to build the reboot CR name. The format goes from
# year down to seconds and includes minutes (%M): without them, two runs made
# in the same hour at the same second would produce the same name.
- name: Construct date string for the CR name
  ansible.builtin.set_fact:
    cifmw_update_cr_date: "{{ lookup('pipe', 'date +%Y%m%d%H%M%S') }}"

# CR name pattern: reboot-<short hostname>-<date string>.
- name: Construct the CR name
  ansible.builtin.set_fact:
    cifmw_reboot_dep_name: "reboot-{{ cifmw_update_hypervisor_short_name }}-{{ cifmw_update_cr_date }}"

# Write the OpenStackDataPlaneDeployment manifest into the artifacts directory.
# The CR lists the node sets fetched earlier, overrides the services with
# `reboot-os`, sets edpm_reboot_strategy to force, and limits the run to the
# short hostname of the current hypervisor via ansibleLimit.
# NOTE(review): the indentation of the embedded manifest was lost in this
# capture; it is significant YAML - verify against the real file.
- name: Create the OpenStackDataPlaneDeployment CR YAML file
ansible.builtin.copy:
dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
content: |
apiVersion: dataplane.openstack.org/v1beta1
kind: OpenStackDataPlaneDeployment
metadata:
name: {{ cifmw_reboot_dep_name }}
namespace: {{ cifmw_update_namespace }}
spec:
nodeSets:
{{ cifmw_update_node_sets.stdout }}
servicesOverride:
- reboot-os
ansibleExtraVars:
edpm_reboot_strategy: force
ansibleLimit: {{ cifmw_update_hypervisor_short_name }}

# Submit the generated manifest with `oc create -f`.
# NOTE(review): the lines between the task name and `environment:` are a
# review-thread suggestion captured with the page (rename "Apply" to "Create"
# to match the `oc create` verb); they are not part of the task.
- name: Apply the OpenStackDataPlaneDeployment CR to trigger a reboot
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- name: Apply the OpenStackDataPlaneDeployment CR to trigger a reboot
- name: Create the OpenStackDataPlaneDeployment CR to trigger a reboot

environment:
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
ansible.builtin.command: >-
{{ cifmw_update_oc_cmd_prefix }}
create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml

# Poll `oc get openstackdataplanedeployment <name>` until its output contains
# 'Setup complete'. The number of attempts comes from
# cifmw_update_wait_retries_reboot, with 5 seconds between attempts.
# NOTE(review): the lines between the `get ...` command and `register:` are a
# review-thread suggestion captured with the page (use `oc wait` instead of
# polling); they are not part of the task.
- name: Check OpenStackDataPlaneDeployment status
environment:
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
ansible.builtin.command: >-
{{ cifmw_update_oc_cmd_prefix }}
get openstackdataplanedeployment {{ cifmw_reboot_dep_name }}
Copy link
Contributor

@ciecierski ciecierski Dec 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
get openstackdataplanedeployment {{ cifmw_reboot_dep_name }}
wait openstackdataplanedeployment {{ cifmw_reboot_dep_name }}
--for=condition=ready
--timeout={{ cifmw_update_timeout_reboot }}m

With oc wait ansible log is more readable, as there no retires logged to output.

register: deployment_status
until: deployment_status.stdout.find('Setup complete') > -1
retries: "{{ cifmw_update_wait_retries_reboot }}"
delay: 5
46 changes: 46 additions & 0 deletions roles/update/templates/monitor_servers.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
# Log every UP/DOWN transition of the compute nodes, probed over SSH.

set -e

# The server list is filled in at template rendering time: one short hostname
# (the part before the first dot) per entry of hypervisor_hostnames.
servers=(
{% for server in hypervisor_hostnames %}
{{ server.split('.')[0] }}
{% endfor %}
)
# The rendered result is a plain bash array; the script takes no arguments.

# Log file to store the status changes
log_file="{{ cifmw_update_artifacts_basedir }}/reboot_server_status.log"

# Function to check server status via SSH
# TODO: ping always replies even if server is down.
check_servers() {
for server in "${servers[@]}"; do
if ssh -i {{ cifmw_update_ansible_ssh_private_key_file }} -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
# Server is up
if [ "${server_status[$server]}" == "down" ]; then
echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
server_status[$server]="up"
fi
else
# Server is down
if [ "${server_status[$server]}" != "down" ]; then
echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
server_status[$server]="down"
fi
fi
done
}

# Initialize server status array: every server starts as "unknown", so the
# first failed probe is logged as DOWN.
declare -A server_status
for server in "${servers[@]}"; do
server_status[$server]="unknown"
done

# Main loop to continuously check server status; it never exits on its own.
while true; do
check_servers
sleep 1 # Pause one second between two polling rounds
done
23 changes: 23 additions & 0 deletions roles/update/templates/monitor_vm_placement.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Log the instance hypervisor. Useful when tracking compute reboot.
#
# Every second, ask OpenStack which host runs the test instance and append a
# line to the log file each time the answer differs from the previous one.
set -e

export KUBECONFIG="{{ cifmw_openshift_kubeconfig }}"
# Fall back to the controller PATH when cifmw_path is not defined, like the
# role tasks do.
export PATH="{{ cifmw_path | default(ansible_env.PATH) }}"

log_file="{{ cifmw_update_artifacts_basedir }}/instance_placement.log"
source_file="{{ cifmw_update_artifacts_basedir }}/workload_suffix"

# The sourced file is expected to define SUFFIX, used in the instance name.
. "$source_file"

instance_name="instance_${SUFFIX}"
previous_hypervisor=""

while true; do
    # Use the namespace of the update role instead of a hard-coded one, in
    # line with the `oc` calls made by the tasks.
    current_hypervisor=$(oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack server show "${instance_name}" -f json | jq -r -c '.["OS-EXT-SRV-ATTR:host"]')
    if [[ "$current_hypervisor" != "$previous_hypervisor" ]]; then
        echo "$(date) $instance_name $current_hypervisor" >> "$log_file"
        previous_hypervisor="$current_hypervisor"
    fi
    sleep 1
done
Loading