diff --git a/roles/update/defaults/main.yml b/roles/update/defaults/main.yml
index 527c6361f2..2d6c6452b1 100644
--- a/roles/update/defaults/main.yml
+++ b/roles/update/defaults/main.yml
@@ -38,6 +38,8 @@ cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_s
 cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh"
 
 ## User facing
+cifmw_update_ansible_ssh_private_key_file: >-
+  {{ ansible_ssh_private_key_file | default(ansible_user_dir ~ '/.ssh/id_cifw') }}
 cifmw_update_ping_test: false
 cifmw_update_create_volume: false
 
diff --git a/roles/update/tasks/main.yml b/roles/update/tasks/main.yml
index 6fbd74e4a3..8e647ec4d9 100644
--- a/roles/update/tasks/main.yml
+++ b/roles/update/tasks/main.yml
@@ -74,3 +74,6 @@
     - not cifmw_update_run_dryrun | bool
   ansible.builtin.shell: |
     {{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
+
+- name: Reboot the compute nodes
+  ansible.builtin.include_tasks: reboot_computes.yml
diff --git a/roles/update/tasks/reboot_computes.yml b/roles/update/tasks/reboot_computes.yml
new file mode 100644
index 0000000000..149fd92935
--- /dev/null
+++ b/roles/update/tasks/reboot_computes.yml
@@ -0,0 +1,70 @@
+---
+- name: Define commands for OpenStack client interactions
+  ansible.builtin.set_fact:
+    cifmw_update_openstack_cmd: >-
+      oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack
+    cifmw_update_bash_cmd: >-
+      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c
+
+- name: Register storage backend type
+  ansible.builtin.shell: >-
+    {{ cifmw_update_openstack_cmd }} volume service list -f json |
+    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
+  register: storage_backend
+  changed_when: false
+
+- name: Get list of OpenStack hypervisors
+  ansible.builtin.shell: |
+    {{ cifmw_update_openstack_cmd }} hypervisor list -f json
+  register: hypervisor_list
+  changed_when: false
+
+- name: Parse the hypervisor list to extract hostnames
+  ansible.builtin.set_fact:
+    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"
+
+- name: Create a reboot monitor servers script
+  ansible.builtin.template:
+    src: "monitor_servers.sh.j2"
+    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
+    mode: "0775"
+
+- name: Start the monitor servers script
+  ansible.builtin.shell: |
+    nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh &> /dev/null &
+    echo $!
+  register: monitor_servers_job
+
+- name: Create a VM placement monitor script
+  ansible.builtin.template:
+    src: "monitor_vm_placement.sh.j2"
+    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
+    mode: "0775"
+
+- name: Start the monitor placement script
+  ansible.builtin.shell: |
+    nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh &> /dev/null &
+    echo $!
+  register: monitor_placement_job
+
+- name: Iterate over each hypervisor
+  ansible.builtin.include_tasks: reboot_hypervisor.yml
+  loop: "{{ hypervisor_hostnames }}"
+  loop_control:
+    loop_var: hypervisor
+
+- name: Stop the monitor servers script if running
+  ansible.builtin.shell: |
+    if kill -0 {{ monitor_servers_job.stdout }} &>/dev/null; then
+        kill {{ monitor_servers_job.stdout }}
+    fi
+  register: kill_result
+  # We can still have a race between `kill -0` and `kill`, even if
+  # unlikely, so tolerate rc 1.
+  failed_when: kill_result.rc not in [0, 1]
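+
+# Same probe-then-kill pattern as above; rc 1 is tolerated for the same
+# race between `kill -0` and `kill`.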
+
+- name: Stop the monitor placement script if running
+  ansible.builtin.shell: |
+    if kill -0 {{ monitor_placement_job.stdout }} &>/dev/null; then
+        kill {{ monitor_placement_job.stdout }}
+    fi
+  register: kill_result
+  failed_when: kill_result.rc not in [0, 1]
diff --git a/roles/update/tasks/reboot_hypervisor.yml b/roles/update/tasks/reboot_hypervisor.yml
new file mode 100644
index 0000000000..c911f00c37
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor.yml
@@ -0,0 +1,69 @@
+---
+- name: Extract short hostname from FQDN
+  ansible.builtin.set_fact:
+    cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"
+
+- name: Announce the hypervisor reboot
+  ansible.builtin.debug:
+    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"
+
+- name: Check active VMs on hypervisor
+  ansible.builtin.shell: >-
+    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
+  register: active_vms
+  changed_when: false
+
+- name: Evacuate VMs if they are running
+  ansible.builtin.shell: >-
+    {{ cifmw_update_bash_cmd }} ". cloudrc &&
+    nova host-evacuate-live
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ hypervisor }}"
+  when: active_vms.stdout != ''
+  changed_when: true
+
+- name: Wait for compute node to get quiesced
+  ansible.builtin.shell: >-
+    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '[.[] | select(.Status |
+    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
+    | length'
+  register: compute_node_instances
+  until: compute_node_instances.stdout | trim == '0'
+  retries: 30
+  delay: 5
+  changed_when: false
+  when:
+    - active_vms.stdout != ''
+
+- name: Reboot the hypervisor
+  ansible.builtin.include_tasks: reboot_hypervisor_using_cr.yml
+
+- name: Perform sanity checks post-reboot
+  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
+  vars:
+    current_hypervisor: "{{ hypervisor }}"
+
+- name: Announce the migration back
+  ansible.builtin.debug:
+    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
+  with_items: "{{ active_vms.stdout_lines }}"
+
+- name: Migrate back VMs post-reboot
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_bash_cmd }} ". cloudrc &&
+    nova live-migration
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ item }} {{ hypervisor }}";
+    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
+    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
+  register: instance_migration_result
+  until: instance_migration_result.stdout.find(hypervisor) > -1
+  retries: 30
+  delay: 5
+  with_items: "{{ active_vms.stdout_lines }}"
+  when:
+    - active_vms.stdout != ''
diff --git a/roles/update/tasks/reboot_hypervisor_sanity_checks.yml b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
new file mode 100644
index 0000000000..b68b6546fc
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
@@ -0,0 +1,36 @@
+---
+- name: Announce the sanity checks
+  ansible.builtin.debug:
+    msg: "Testing the status of the services for {{ current_hypervisor }}."
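+
+# Each check below polls the control plane until the service reports
+# healthy, giving the agents up to 5 x 30 seconds to reconnect after
+# the reboot.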
+
+- name: Verify nova-compute service
+  ansible.builtin.shell: >-
+    {{ cifmw_update_openstack_cmd }} compute service list
+    --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("nova-compute")) | .State'
+  register: nova_compute_status
+  until: nova_compute_status.stdout == 'up'
+  retries: 5
+  delay: 30
+  changed_when: false
+
+- name: Verify ovn-controller service
+  ansible.builtin.shell: >-
+    {{ cifmw_update_openstack_cmd }} network agent list
+    --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("ovn-controller")) | .Alive'
+  register: ovn_controller_status
+  until: ovn_controller_status.stdout == 'true'
+  retries: 5
+  delay: 30
+  changed_when: false
+
+- name: Verify neutron-ovn-metadata-agent service
+  ansible.builtin.shell: >-
+    {{ cifmw_update_openstack_cmd }} network agent list
+    --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
+  register: networking_ovn_metadata_status
+  until: networking_ovn_metadata_status.stdout == 'true'
+  retries: 5
+  delay: 30
+  changed_when: false
diff --git a/roles/update/tasks/reboot_hypervisor_using_cr.yml b/roles/update/tasks/reboot_hypervisor_using_cr.yml
new file mode 100644
index 0000000000..ec24fe6771
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor_using_cr.yml
@@ -0,0 +1,51 @@
+---
+- name: Define necessary command prefixes for kube operations
+  ansible.builtin.set_fact:
+    cifmw_update_oc_cmd_prefix: "oc -n {{ cifmw_update_namespace }}"
+
+- name: Fetch NodeSets for the OpenStackDataPlaneDeployment
+  ansible.builtin.shell: >-
+    {{ cifmw_update_oc_cmd_prefix }} get openstackdataplanenodeset -o name
+    | awk -F'/' '{print " - " $2}'
+  register: cifmw_update_node_sets
+  changed_when: false
+
+- name: Construct date string for CR name
+  ansible.builtin.set_fact:
+    cifmw_update_cr_date: "{{ lookup('pipe', 'date +%Y%m%d%H%M%S') }}"
+
+- name: Construct CR name
+  ansible.builtin.set_fact:
+    cifmw_reboot_dep_name: >-
+      reboot-{{ cifmw_update_hypervisor_short_name }}-{{ cifmw_update_cr_date }}
+
+- name: Create OpenStackDataPlaneDeployment CR YAML file
+  ansible.builtin.copy:
+    dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
+    content: |
+      apiVersion: dataplane.openstack.org/v1beta1
+      kind: OpenStackDataPlaneDeployment
+      metadata:
+        name: {{ cifmw_reboot_dep_name }}
+        namespace: {{ cifmw_update_namespace }}
+      spec:
+        nodeSets:
+      {{ cifmw_update_node_sets.stdout }}
+        servicesOverride:
+          - reboot-os
+        ansibleExtraVars:
+          edpm_reboot_strategy: force
+        ansibleLimit: {{ cifmw_update_hypervisor_short_name }}
+
+- name: Apply the OpenStackDataPlaneDeployment CR to trigger a reboot
+  ansible.builtin.shell: >-
+    {{ cifmw_update_oc_cmd_prefix }}
+    create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml
+
+- name: Check OpenStackDataPlaneDeployment status
+  ansible.builtin.command: >-
+    {{ cifmw_update_oc_cmd_prefix }} get openstackdataplanedeployment
+    {{ cifmw_reboot_dep_name }}
+  register: deployment_status
+  until: deployment_status.stdout.find('Setup complete') > -1
+  retries: 60
+  delay: 5
+  changed_when: false
diff --git a/roles/update/templates/monitor_servers.sh.j2 b/roles/update/templates/monitor_servers.sh.j2
new file mode 100644
index 0000000000..4dccbe0e85
--- /dev/null
+++ b/roles/update/templates/monitor_servers.sh.j2
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+set -e
+
+# List of servers to watch, templated from the hypervisor inventory.
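+# Status transitions are timestamped into the log file below, giving a
+# per-compute timeline of each reboot.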
+servers=(
+{% for server in hypervisor_hostnames %}
+{{ server.split('.')[0] }}
+{% endfor %}
+)
+# or, for a hardcoded list: servers=("server1" "server2" ...)
+
+# Log file to store the status changes
+log_file="{{ cifmw_update_artifacts_basedir }}/reboot_server_status.log"
+
+# Function to check server status via SSH.
+# Note: SSH rather than ping, as ping still gets replies even when the
+# server is down.
+check_servers() {
+    for server in "${servers[@]}"; do
+        if ssh -i {{ cifmw_update_ansible_ssh_private_key_file }} -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
+            # Server is up
+            if [ "${server_status[$server]}" == "down" ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
+                server_status[$server]="up"
+            fi
+        else
+            # Server is down
+            if [ "${server_status[$server]}" != "down" ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
+                server_status[$server]="down"
+            fi
+        fi
+    done
+}
+
+# Initialize server status array
+declare -A server_status
+for server in "${servers[@]}"; do
+    server_status[$server]="unknown"
+done
+
+# Main loop to continuously check server status
+while true; do
+    check_servers
+    sleep 1 # Wait for 1 second before re-checking
+done
diff --git a/roles/update/templates/monitor_vm_placement.sh.j2 b/roles/update/templates/monitor_vm_placement.sh.j2
new file mode 100644
index 0000000000..1c79162e49
--- /dev/null
+++ b/roles/update/templates/monitor_vm_placement.sh.j2
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Log the instance's hypervisor. Useful when tracking compute reboots.
+set -e
+
+export KUBECONFIG="{{ cifmw_openshift_kubeconfig }}"
+export PATH="{{ cifmw_path }}"
+
+log_file="{{ cifmw_update_artifacts_basedir }}/instance_placement.log"
+source_file="{{ cifmw_update_artifacts_basedir }}/workload_suffix"
+
+. "$source_file"
+
+instance_name="instance_${SUFFIX}"
+previous_hypervisor=""
+
+while true; do
+    current_hypervisor=$(oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack server show "${instance_name}" -f json | jq -r -c '.["OS-EXT-SRV-ATTR:host"]')
+    if [[ "$current_hypervisor" != "$previous_hypervisor" ]]; then
+        echo "$(date) $instance_name $current_hypervisor" >> "$log_file"
+        previous_hypervisor="$current_hypervisor"
+    fi
+    sleep 1
+done