Skip to content

Commit

Permalink
Add the steps to reboot the computes after update.
Browse files Browse the repository at this point in the history
This sequence implements the reboot of the compute nodes after the update.

If one or more instances have been created, they will be live-migrated
to other hypervisors before the reboot and migrated back to the original
hypervisor after the reboot.

Some basic sanity checks are performed after the reboot and before the
migration back to ensure that the necessary services are up and
running.

Closes: https://issues.redhat.com/browse/OSPRH-8937
  • Loading branch information
sathlan committed Dec 5, 2024
1 parent 79a60e4 commit 09bc6f0
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 0 deletions.
3 changes: 3 additions & 0 deletions roles/update/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,6 @@
- not cifmw_update_run_dryrun | bool
ansible.builtin.shell: |
{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
# Reboot every compute node once the update is over. Guarded like the
# preceding task so that a dry run never reboots real hypervisors.
# NOTE(review): confirm that skipping the reboot in dry-run mode is intended.
- name: Reboot the compute nodes
  ansible.builtin.include_tasks: reboot_compute.yml
  when:
    - not cifmw_update_run_dryrun | bool
26 changes: 26 additions & 0 deletions roles/update/tasks/reboot_compute.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Command prefixes shared by the reboot tasks. Both go through `oc rsh`
# into `openstackclient` in the `openstack` namespace:
#   openstack_cmd - runs the `openstack` CLI directly
#   bash_cmd      - runs an arbitrary `bash -c` snippet
- name: Define command for OpenStack client interactions
  ansible.builtin.set_fact:
    openstack_cmd: "oc rsh -n openstack openstackclient openstack"
    bash_cmd: "oc rsh -n openstack openstackclient bash -c"

# Records the Host field of every cinder-volume service. The per-hypervisor
# tasks search this output for the substring 'ceph' to decide whether
# --block-migrate must be passed to nova.
# NOTE(review): pipefail is deliberately not set here; if the volume service
# query fails, the output is empty and the backend is treated as non-ceph.
- name: Register storage backend type
  ansible.builtin.shell: >-
    {{ openstack_cmd }} volume service list -f json |
    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
  register: storage_backend
  # Read-only query: never report a change.
  changed_when: false

# Fetch the hypervisor list as JSON; it is parsed by a later set_fact.
- name: Get list of OpenStack hypervisors
  ansible.builtin.shell:
    cmd: "{{ openstack_cmd }} hypervisor list -f json"
  changed_when: false
  register: hypervisor_list

# Turn the JSON output into a plain list of 'Hypervisor Hostname' values.
- name: Parse the hypervisor list to extract hostnames
  ansible.builtin.set_fact:
    hypervisor_hostnames: >-
      {{ hypervisor_list.stdout
         | from_json
         | map(attribute='Hypervisor Hostname')
         | list }}

# Run the reboot sequence once per hypervisor; the current hostname is
# exposed to the included tasks as `hypervisor`.
- name: Iterate over each hypervisor
  ansible.builtin.include_tasks:
    file: reboot_hypervisor.yml
  loop_control:
    loop_var: hypervisor
  loop: "{{ hypervisor_hostnames }}"
74 changes: 74 additions & 0 deletions roles/update/tasks/reboot_hypervisor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
---
# Keep only the first dot-separated label of the hypervisor name; it is
# used for messages and as the delegation target of the reboot.
- name: Extract short hostname from FQDN
  ansible.builtin.set_fact:
    hypervisor_short_name: "{{ hypervisor.split('.') | first }}"

# Progress message so the log shows which hypervisor is being handled.
- name: Announce the hypervisor about to be rebooted
  ansible.builtin.debug:
    msg: "Rebooting {{ hypervisor_short_name }}"

# Lists the IDs of the ACTIVE/PAUSED instances hosted on this hypervisor.
# The result drives the evacuation and the later migrate-back, so a failing
# `openstack` call must not be mistaken for "no instance": pipefail makes
# the task fail instead of silently producing an empty list.
- name: Check active VMs on hypervisor
  ansible.builtin.shell:
    # pipefail is a bash option; do not depend on what /bin/sh points to.
    executable: /bin/bash
    cmd: >-
      set -o pipefail;
      {{ openstack_cmd }} server list --all --host {{ hypervisor }} -f json
      | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
  register: active_vms
  changed_when: false

# Live-evacuate every instance away from this hypervisor. --block-migrate
# is added whenever 'ceph' does not appear in the recorded storage backend.
# Skipped entirely when no active instance was found.
- name: Evacuate VMs if they are running
  ansible.builtin.shell:
    cmd: >-
      {{ bash_cmd }} ". cloudrc &&
      nova host-evacuate-live
      {% if 'ceph' not in storage_backend.stdout %}
      --block-migrate
      {% endif %}
      {{ hypervisor }}"
  changed_when: true
  when:
    - active_vms.stdout != ''

# Polls until no ACTIVE/PAUSED/MIGRATING instance is left on the hypervisor.
# jq prints the number of matching instances; the loop ends only when that
# number is exactly 0. A substring search for "0" would also accept counts
# such as 10 or 20 and let the reboot start with instances still on the host.
- name: Wait for compute node to get quiesced
  ansible.builtin.shell: >-
    {{ openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '[.[] | select(.Status |
    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
    | length'
  register: compute_node_instances
  # trim tolerates stray whitespace around the count; an empty output
  # (failed query) does not match and triggers a retry.
  until: compute_node_instances.stdout | trim == '0'
  retries: 30
  delay: 5
  # Read-only poll: never report a change.
  changed_when: false
  when:
    - active_vms.stdout != ''

# Reboot the node itself (delegated to its short hostname, as root) and
# wait up to 1200 seconds for systemd to report running or degraded.
- name: Reboot the hypervisor
  become: true
  delegate_to: "{{ hypervisor_short_name }}"
  ansible.builtin.reboot:
    test_command: "systemctl is-system-running | grep -e running -e degraded"
    reboot_timeout: 1200

# Run the post-reboot checks for this hypervisor before migrating the
# instances back; the hostname is passed as `current_hypervisor`.
- name: Perform sanity checks post-reboot
  vars:
    current_hypervisor: "{{ hypervisor }}"
  ansible.builtin.include_tasks:
    file: reboot_hypervisor_sanity_checks.yml

# One message per instance that was active before the reboot; emits nothing
# when the list is empty.
- name: Announce the instances that will be migrated back
  ansible.builtin.debug:
    msg: "Migrate back {{ item }} to {{ hypervisor_short_name }}."
  loop: "{{ active_vms.stdout_lines }}"

# For each previously active instance: request a live migration back to
# this hypervisor, then print the host the instance currently runs on.
# The two commands are separated by ';', so the exit status of the
# migration request is not what decides success: the task is retried
# (re-running both commands) until the reported host contains the
# hypervisor name.
- name: Migrate back VMs post-reboot
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail;
      {{ bash_cmd }} ". cloudrc &&
      nova live-migration
      {% if 'ceph' not in storage_backend.stdout %}
      --block-migrate
      {% endif %}
      {{ item }} {{ hypervisor }}";
      {{ openstack_cmd }} server show {{ item }} -f json |
      jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
  loop: "{{ active_vms.stdout_lines }}"
  when: active_vms.stdout != ''
  register: instance_migration_result
  retries: 30
  delay: 5
  until: instance_migration_result.stdout.find(hypervisor) > -1
33 changes: 33 additions & 0 deletions roles/update/tasks/reboot_hypervisor_sanity_checks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
# Progress message identifying the hypervisor whose services are checked.
- name: Announce the hypervisor under test
  ansible.builtin.debug:
    msg: "Here I'm testing the reboot for {{ current_hypervisor }}."

# Polls the compute service list (5 retries, 30 seconds apart) until the
# nova-compute service on this hypervisor reports the state 'up'.
- name: Verify nova-compute service
  ansible.builtin.shell: >-
    {{ openstack_cmd }} compute service list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("nova-compute")) | .State'
  register: nova_compute_status
  until: nova_compute_status.stdout == 'up'
  retries: 5
  delay: 30
  # Read-only poll: never report a change.
  changed_when: false

# Polls the network agent list (5 retries, 30 seconds apart) until the
# ovn-controller agent on this hypervisor reports Alive == true.
- name: Verify ovn-controller service
  ansible.builtin.shell: >-
    {{ openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("ovn-controller")) | .Alive'
  register: ovn_controller_status
  until: ovn_controller_status.stdout == 'true'
  retries: 5
  delay: 30
  # Read-only poll: never report a change.
  changed_when: false

# Polls the network agent list (5 retries, 30 seconds apart) until the
# neutron-ovn-metadata-agent on this hypervisor reports Alive == true.
# The task name now matches the binary that is actually selected below.
- name: Verify neutron-ovn-metadata-agent service
  ansible.builtin.shell: >-
    {{ openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
  register: networking_ovn_metadata_status
  until: networking_ovn_metadata_status.stdout == 'true'
  retries: 5
  delay: 30
  # Read-only poll: never report a change.
  changed_when: false

0 comments on commit 09bc6f0

Please sign in to comment.