-
Notifications
You must be signed in to change notification settings - Fork 110
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add the steps to reboot the computes after update.
This sequence implements the reboot of the compute nodes after the update. If one or more instances have been created on the hypervisor being rebooted, they are live-migrated to other hypervisors before the reboot and migrated back to the original hypervisor after the reboot. Some basic sanity checks are performed after the reboot and before the migration back to ensure that the necessary services are up and running. During the reboot we start two scripts. One monitors and logs the reboot of the hypervisors. The other logs where the instance is currently running. The log files can be found in `~/ci-framework-data/tests/update/` in `reboot_server_status.log` and `instance_placement.log` respectively. Closes: https://issues.redhat.com/browse/OSPRH-8937
- Loading branch information
Showing
8 changed files
with
300 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# Reboot the compute nodes one at a time after an update.
# Two background monitor scripts are started before looping over the
# hypervisors and are stopped once every hypervisor has been processed.
- name: Define command for OpenStack client interactions
  ansible.builtin.set_fact:
    cifmw_update_openstack_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack
    cifmw_update_bash_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c

# Records the Host field of the cinder-volume service. reboot_hypervisor.yml
# looks for 'ceph' in this output to decide whether --block-migrate is needed.
- name: Register storage backend type
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} volume service list -f json |
    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
  register: storage_backend
  changed_when: false

- name: Get list of OpenStack hypervisors
  ansible.builtin.shell: |
    {{ cifmw_update_openstack_cmd }} hypervisor list -f json
  register: hypervisor_list
  changed_when: false

- name: Parse the hypervisor list to extract hostnames
  ansible.builtin.set_fact:
    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"

- name: Create a reboot monitor servers script
  ansible.builtin.template:
    src: "monitor_servers.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
    mode: "0775"

# The script is detached with nohup and its PID (`echo $!`) is registered so
# that it can be stopped at the end. `> /dev/null 2>&1` is used instead of the
# bash-only `&>`, which a POSIX /bin/sh parses as "background, then redirect".
- name: Start the monitor servers script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh > /dev/null 2>&1 &
    echo $!
  register: monitor_servers_job

- name: Create a monitor placement monitor script
  ansible.builtin.template:
    src: "monitor_vm_placement.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
    mode: "0775"

# Same launch pattern as the monitor servers script above.
- name: Start the monitor placement script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh > /dev/null 2>&1 &
    echo $!
  register: monitor_placement_job

# reboot_hypervisor.yml is included once per hypervisor; the current entry is
# exposed to it as `hypervisor`.
- name: Iterate over each hypervisor
  ansible.builtin.include_tasks: reboot_hypervisor.yml
  loop: "{{ hypervisor_hostnames }}"
  loop_control:
    loop_var: hypervisor

# A return code of 1 is tolerated: we can still have a race between
# `kill -0` and `kill`, even if unlikely.
- name: Stop the monitor servers script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_servers_job.stdout }} > /dev/null 2>&1; then
      kill {{ monitor_servers_job.stdout }}
    fi
  register: kill_result
  failed_when: kill_result.rc not in [0, 1]

# Same tolerance for the kill race as for the monitor servers script.
- name: Stop the monitor placement script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_placement_job.stdout }} > /dev/null 2>&1; then
      kill {{ monitor_placement_job.stdout }}
    fi
  register: kill_result
  failed_when: kill_result.rc not in [0, 1]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
---
# Reboot a single hypervisor (loop variable `hypervisor`, an FQDN).
# Running instances are live-migrated away first, the node is rebooted and
# checked, and the instances recorded beforehand are then migrated back.
- name: Extract short hostname from FQDN
  ansible.builtin.set_fact:
    cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"

- name: Announce the hypervisor being rebooted
  ansible.builtin.debug:
    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"

# One instance ID per output line; the list is reused after the reboot to
# migrate the same instances back.
- name: Check active VMs on hypervisor
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
  register: active_vms
  changed_when: false

# --block-migrate is only passed when 'ceph' does not appear in the
# registered storage backend output.
- name: Evacuate VMs if they are running
  ansible.builtin.shell: >-
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova host-evacuate-live
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ hypervisor }}"
  when: active_vms.stdout != ''
  changed_when: true

# The jq filter prints the number of instances still ACTIVE, PAUSED or
# MIGRATING on the host. The count is compared exactly with "0": a substring
# search for "0" would also accept 10, 20, 100, ... remaining instances.
- name: Wait for compute node to get quiesced
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '[.[] | select(.Status |
    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
    | length'
  register: compute_node_instances
  until: (compute_node_instances.stdout | trim) == '0'
  retries: 30
  delay: 5
  changed_when: false
  when:
    - active_vms.stdout != ''

- name: Reboot the hypervisor
  ansible.builtin.include_tasks: reboot_hypervisor_using_cr.yml

- name: Perform sanity checks post-reboot
  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
  vars:
    current_hypervisor: "{{ hypervisor }}"

- name: Announce the instances being migrated back
  ansible.builtin.debug:
    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
  loop: "{{ active_vms.stdout_lines }}"

# The live-migration request and the host lookup are one shell command, so
# every retry issues the migration request again before checking where the
# instance runs. The first command is followed by `;`, so its exit status
# does not decide the task result.
- name: Migrate back VMs post-reboot
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova live-migration
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ item }} {{ hypervisor }}";
    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
  register: instance_migration_result
  until: instance_migration_result.stdout.find(hypervisor) > -1
  retries: 30
  delay: 5
  loop: "{{ active_vms.stdout_lines }}"
  when:
    - active_vms.stdout != ''
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
---
# Post-reboot sanity checks for `current_hypervisor`.
# Each check polls the OpenStack API up to 5 times, 30 seconds apart, until
# the service reports the expected state. The checks only read state, hence
# `changed_when: false`.
- name: Announce the hypervisor being checked
  ansible.builtin.debug:
    msg: "Testing the status of the services for {{ current_hypervisor }}."

- name: Verify nova-compute service
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} compute service list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("nova-compute")) | .State'
  register: nova_compute_status
  until: nova_compute_status.stdout == 'up'
  retries: 5
  delay: 30
  changed_when: false

- name: Verify ovn-controller service
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} network agent list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("ovn-controller")) | .Alive'
  register: ovn_controller_status
  until: ovn_controller_status.stdout == 'true'
  retries: 5
  delay: 30
  changed_when: false

- name: Verify networking-ovn-metadata-agent
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} network agent list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
  register: networking_ovn_metadata_status
  until: networking_ovn_metadata_status.stdout == 'true'
  retries: 5
  delay: 30
  changed_when: false
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
---
# Reboot one hypervisor by creating an OpenStackDataPlaneDeployment CR that
# runs the reboot-os service limited to that host, then wait for the
# deployment to report completion.
- name: Define necessary command prefixes for kube operations
  ansible.builtin.set_fact:
    cifmw_update_oc_cmd_prefix: "oc -n {{ cifmw_update_namespace }}"

# The output is pasted verbatim into the CR below. The leading spaces printed
# by awk provide the YAML indentation of each list item under spec.nodeSets.
- name: Fetch NodeSets for the OpenStackDataPlaneDeployment
  ansible.builtin.shell: >-
    {{ cifmw_update_oc_cmd_prefix }} get openstackdataplanenodeset -o name | awk -F'/' '{print "    - " $2}'
  register: cifmw_update_node_sets
  changed_when: false

# Year, month, day, hour, minute and second. The minute field keeps CR names
# unique when several reboots are triggered within the same hour.
- name: Construct date string for CR name
  ansible.builtin.set_fact:
    cifmw_update_cr_date: "{{ lookup('pipe', 'date +%Y%m%d%H%M%S') }}"

- name: Construct CR name
  ansible.builtin.set_fact:
    cifmw_reboot_dep_name: >-
      reboot-{{ cifmw_update_hypervisor_short_name }}-{{ cifmw_update_cr_date }}

# The nodeSets placeholder sits at the left margin of the content on purpose:
# the registered awk output already carries the indentation of every item.
- name: Create OpenStackDataPlaneDeployment CR YAML file
  ansible.builtin.copy:
    dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
    content: |
      apiVersion: dataplane.openstack.org/v1beta1
      kind: OpenStackDataPlaneDeployment
      metadata:
        name: {{ cifmw_reboot_dep_name }}
        namespace: {{ cifmw_update_namespace }}
      spec:
        nodeSets:
      {{ cifmw_update_node_sets.stdout }}
        servicesOverride:
          - reboot-os
        ansibleExtraVars:
          edpm_reboot_strategy: force
        ansibleLimit: {{ cifmw_update_hypervisor_short_name }}

- name: Apply the OpenStackDataPlaneDeployment CR to trigger a reboot
  ansible.builtin.shell: >-
    {{ cifmw_update_oc_cmd_prefix }}
    create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml

# Polls every 5 seconds, up to 60 times, until the `oc get` output contains
# the text 'Setup complete'.
- name: Check OpenStackDataPlaneDeployment status
  ansible.builtin.command: >-
    {{ cifmw_update_oc_cmd_prefix }} get openstackdataplanedeployment
    {{ cifmw_reboot_dep_name }}
  register: deployment_status
  until: deployment_status.stdout.find('Setup complete') > -1
  retries: 60
  delay: 5
  changed_when: false
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/bin/bash
# Monitor the reachability of the hypervisors and log every state change.
# This file is a Jinja2 template: the server list, the log location and the
# SSH key path are filled in when the template is rendered.

set -e

# Short hostnames (text before the first dot) of the hypervisors to watch,
# rendered from hypervisor_hostnames.
servers=(
{% for server in hypervisor_hostnames %}
  "{{ server.split('.')[0] }}"
{% endfor %}
)

# Log file to store the status changes
log_file="{{ cifmw_update_artifacts_basedir }}/reboot_server_status.log"
|
||
#######################################
# Probe every server over SSH once and log state transitions.
# A server counts as up when a non-interactive SSH login that runs `exit`
# succeeds within the 5 second connect timeout.
# Globals:
#   servers        (read)    array of hostnames to probe
#   server_status  (written) associative array: hostname -> last known state
#   log_file       (read)    file that transition messages are appended to
# Outputs:
#   Writes "<timestamp> - <server> is UP|DOWN" to stdout and to log_file.
#   UP is only reported for a server previously recorded as "down"; DOWN is
#   reported whenever the recorded state is anything other than "down".
#######################################
check_servers() {
  local server
  for server in "${servers[@]}"; do
    if ssh -i "{{ cifmw_update_ansible_ssh_private_key_file }}" \
      -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
      # Server is up
      if [[ "${server_status[$server]}" == "down" ]]; then
        echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
        server_status["$server"]="up"
      fi
    else
      # Server is down
      if [[ "${server_status[$server]}" != "down" ]]; then
        echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
        server_status["$server"]="down"
      fi
    fi
  done
}
|
||
# Initialize server status array
# Every server starts as "unknown"; check_servers only reports UP for a
# server that was previously recorded as "down".
declare -A server_status
for server in "${servers[@]}"; do
  server_status["$server"]="unknown"
done

# Main loop to continuously check server status
while true; do
  check_servers
  sleep 1 # Pause for one second between polling rounds.
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/bin/bash
# Log the instance hypervisor. Useful when tracking compute reboot.
#
# This file is a Jinja2 template. Once rendered, it polls the OpenStack API
# every second and appends a line to the log file each time the host that
# runs the instance changes.
set -e

export KUBECONFIG="{{ cifmw_openshift_kubeconfig }}"
export PATH="{{ cifmw_path }}"

log_file="{{ cifmw_update_artifacts_basedir }}/instance_placement.log"
source_file="{{ cifmw_update_artifacts_basedir }}/workload_suffix"

# The sourced file is expected to define SUFFIX, which is used below to build
# the instance name.
# shellcheck disable=SC1090
. "$source_file"

instance_name="instance_${SUFFIX}"
previous_hypervisor=""

while true; do
  # The namespace comes from cifmw_update_namespace, like the commands built
  # by the role's tasks. `|| true` keeps the monitor alive under `set -e` when
  # a query fails, which can happen while a compute node is rebooting.
  current_hypervisor=$(
    oc rsh -n "{{ cifmw_update_namespace }}" openstackclient \
      openstack server show "${instance_name}" -f json |
      jq -r -c '.["OS-EXT-SRV-ATTR:host"]'
  ) || true
  if [[ "$current_hypervisor" != "$previous_hypervisor" ]]; then
    echo "$(date) $instance_name $current_hypervisor" >> "$log_file"
    previous_hypervisor="$current_hypervisor"
  fi
  sleep 1
done