diff --git a/roles/update/tasks/main.yml b/roles/update/tasks/main.yml
index 6fbd74e4a3..b13bfee9e8 100644
--- a/roles/update/tasks/main.yml
+++ b/roles/update/tasks/main.yml
@@ -74,3 +74,6 @@
     - not cifmw_update_run_dryrun | bool
   ansible.builtin.shell: |
     {{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
+
+- name: Reboot the compute nodes
+  ansible.builtin.include_tasks: reboot_compute.yml
diff --git a/roles/update/tasks/reboot_compute.yml b/roles/update/tasks/reboot_compute.yml
new file mode 100644
index 0000000000..8a3d2f44ce
--- /dev/null
+++ b/roles/update/tasks/reboot_compute.yml
@@ -0,0 +1,32 @@
+---
+# Reboot, in turn, every host returned by `openstack hypervisor list`.
+- name: Define command for OpenStack client interactions
+  ansible.builtin.set_fact:
+    openstack_cmd: "oc rsh -n openstack openstackclient openstack"
+    bash_cmd: "oc rsh -n openstack openstackclient bash -c"
+
+# The cinder-volume host names are kept so that the per-hypervisor tasks can
+# look for 'ceph' in them when deciding whether to pass --block-migrate.
+- name: Register storage backend type
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} volume service list -f json |
+    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
+  register: storage_backend
+  changed_when: false
+
+- name: Get list of OpenStack hypervisors
+  ansible.builtin.shell: |
+    {{ openstack_cmd }} hypervisor list -f json
+  register: hypervisor_list
+  changed_when: false
+
+- name: Parse the hypervisor list to extract hostnames
+  ansible.builtin.set_fact:
+    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"
+
+# reboot_hypervisor.yml receives the current entry as `hypervisor`.
+- name: Iterate over each hypervisor
+  ansible.builtin.include_tasks: reboot_hypervisor.yml
+  loop: "{{ hypervisor_hostnames }}"
+  loop_control:
+    loop_var: hypervisor
diff --git a/roles/update/tasks/reboot_hypervisor.yml b/roles/update/tasks/reboot_hypervisor.yml
new file mode 100644
index 0000000000..5a12f3d8a5
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor.yml
@@ -0,0 +1,88 @@
+---
+# Tasks run once per hypervisor: drain it, reboot it, check it, and move the
+# previously running instances back. `hypervisor` is the loop variable set by
+# the including task.
+- name: Extract short hostname from FQDN
+  ansible.builtin.set_fact:
+    hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"
+
+- name: Announce the hypervisor being rebooted
+  ansible.builtin.debug:
+    msg: "Rebooting {{ hypervisor_short_name }}"
+
+- name: Check active VMs on hypervisor
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
+  register: active_vms
+  changed_when: false
+
+# --block-migrate is only passed when no cinder-volume host mentions 'ceph'.
+- name: Evacuate VMs if they are running
+  ansible.builtin.shell: >-
+    {{ bash_cmd }} ". cloudrc &&
+    nova host-evacuate-live
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ hypervisor }}"
+  when: active_vms.stdout != ''
+  changed_when: true
+
+# The count must be exactly 0; a substring match on "0" would also accept
+# 10, 20, ... instances still present on the host.
+- name: Wait for compute node to get quiesced
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '[.[] | select(.Status |
+    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
+    | length'
+  register: compute_node_instances
+  until: (compute_node_instances.stdout | trim) == '0'
+  retries: 30
+  delay: 5
+  changed_when: false
+  when:
+    - active_vms.stdout != ''
+
+- name: Reboot the hypervisor
+  ansible.builtin.reboot:
+    reboot_timeout: 1200
+    test_command: "systemctl is-system-running | grep -e running -e degraded"
+  delegate_to: "{{ hypervisor_short_name }}"
+  become: true
+
+- name: Perform sanity checks post-reboot
+  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
+  vars:
+    current_hypervisor: "{{ hypervisor }}"
+
+- name: Announce the instances being migrated back
+  ansible.builtin.debug:
+    msg: "Migrate back {{ item }} to {{ hypervisor_short_name }}."
+  with_items: "{{ active_vms.stdout_lines }}"
+
+# The exit status of `nova live-migration` is not checked (it is followed by
+# `;`); the task is retried until `server show` reports the instance on this
+# hypervisor. bash is requested explicitly because `set -o pipefail` is not
+# available in every /bin/sh.
+- name: Migrate back VMs post-reboot
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ bash_cmd }} ". cloudrc &&
+    nova live-migration
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ item }} {{ hypervisor }}";
+    {{ openstack_cmd }} server show {{ item }} -f json |
+    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
+  args:
+    executable: /bin/bash
+  register: instance_migration_result
+  until: instance_migration_result.stdout.find(hypervisor) > -1
+  retries: 30
+  delay: 5
+  with_items: "{{ active_vms.stdout_lines }}"
+  when:
+    - active_vms.stdout != ''
diff --git a/roles/update/tasks/reboot_hypervisor_sanity_checks.yml b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
new file mode 100644
index 0000000000..cb86d8f7ce
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
@@ -0,0 +1,39 @@
+---
+# Post-reboot checks for `current_hypervisor`; each check is retried up to
+# 5 times, 30 seconds apart.
+- name: Announce the hypervisor under test
+  ansible.builtin.debug:
+    msg: "Here I'm testing the reboot for {{ current_hypervisor }}."
+
+- name: Verify nova-compute service
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} compute service list --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("nova-compute")) | .State'
+  register: nova_compute_status
+  until: nova_compute_status.stdout == 'up'
+  retries: 5
+  delay: 30
+  changed_when: false
+
+- name: Verify ovn-controller service
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("ovn-controller")) | .Alive'
+  register: ovn_controller_status
+  until: ovn_controller_status.stdout == 'true'
+  retries: 5
+  delay: 30
+  changed_when: false
+
+- name: Verify networking-ovn-metadata-agent
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
+  register: networking_ovn_metadata_status
+  until: networking_ovn_metadata_status.stdout == 'true'
+  retries: 5
+  delay: 30
+  changed_when: false