From 8039f1870a66a50a182b47d5cf26dcfce31f7c6d Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Thu, 5 Dec 2024 16:57:09 +0000 Subject: [PATCH] Staggered upgrade procedure --- doc/source/operations/upgrading-openstack.rst | 65 ++++++++++++++ etc/kayobe/ansible/neutron-l3-drain.yml | 23 +++++ .../neutron-namespace-drain/defaults/main.yml | 8 ++ .../tasks/add-dhcp.yml | 51 +++++++++++ .../tasks/add-new-l3.yml | 43 +++++++++ .../tasks/drain-dhcp.yml | 87 ++++++++++++++++++ .../tasks/drain-l3.yml | 90 +++++++++++++++++++ .../tasks/enable-l3.yml | 24 +++++ .../neutron-namespace-drain/tasks/main.yml | 19 ++++ .../neutron-namespace-drain/tasks/setup.yml | 31 +++++++ 10 files changed, 441 insertions(+) create mode 100644 etc/kayobe/ansible/neutron-l3-drain.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/defaults/main.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-dhcp.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-new-l3.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-dhcp.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-l3.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/enable-l3.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/main.yml create mode 100644 etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/setup.yml diff --git a/doc/source/operations/upgrading-openstack.rst b/doc/source/operations/upgrading-openstack.rst index 9923f3cc2..998d6f8c5 100644 --- a/doc/source/operations/upgrading-openstack.rst +++ b/doc/source/operations/upgrading-openstack.rst @@ -1063,6 +1063,12 @@ This will block the upgrade, but may be overridden by setting ``etc/kayobe/kolla/globals.yml`` or ``etc/kayobe/environments//kolla/globals.yml``. +Depending on the networking architecture of your cloud, the steps used +to upgrade the containerised services will differ. 
+ +OVN +^^^ + To upgrade the containerised control plane services: .. code-block:: console @@ -1076,6 +1082,65 @@ scope of the upgrade: kayobe overcloud service upgrade --tags config --kolla-tags keystone +OVS (w/ Dedicated network nodes) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You should first stop the Octavia health manager to prevent alerts during +the service upgrade. + +.. code-block:: console + + kayobe overcloud host command run --command "docker stop octavia_health_manager" --limit controllers --become + +Upgrade the control plane services: + +.. code-block:: console + + kayobe overcloud service upgrade --kolla-limit controllers + +To ensure L3 reliability during the upgrade, we will need to manually drain +the network nodes of all agents, and upgrade the nodes sequentially. + +Kolla credentials will need to be activated before running the neutron-namespace-drain +role. + +.. code-block:: console + + source $KOLLA_CONFIG_PATH/public-openrc.sh + +You should substitute <network0> with the first network node to be drained. To set +the node for maintenance and begin draining the agents: + +.. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/neutron-l3-drain.yml -e neutron_drain_host=<network0> -e maintenance=true -e drain_dhcp_agents=true + +You can monitor the L3/DHCP agents being drained from the node by running: + +.. code-block:: console + + ssh <network0> -t watch ip netns ls + +Once all agents have been drained, you can upgrade the containerised services +on the network node. + +.. code-block:: console + + kayobe overcloud service upgrade --kolla-limit <network0> + +Following the service upgrade, the agents can be restored on the node by disabling maintenance: + +.. 
code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/neutron-l3-drain.yml -e neutron_drain_host=<network0> -e maintenance=false -e drain_dhcp_agents=true + +The above steps should be repeated for the remaining network nodes. Once all network nodes have been upgraded +the remaining containerised services can be upgraded: + +.. code-block:: console + + kayobe overcloud service upgrade --kolla-tags common,nova,prometheus,openvswitch,neutron --skip-prechecks -kl controllers,compute --limit controllers,compute + Updating the Octavia Amphora Image ---------------------------------- diff --git a/etc/kayobe/ansible/neutron-l3-drain.yml b/etc/kayobe/ansible/neutron-l3-drain.yml new file mode 100644 index 000000000..7640a6920 --- /dev/null +++ b/etc/kayobe/ansible/neutron-l3-drain.yml @@ -0,0 +1,23 @@ +--- +- name: Drain neutron of l3 agents and dhcp agents + hosts: localhost + gather_facts: true + tags: + - neutron-l3-drain + vars: + maintenance: false + drain_ctrl1: false + drain_ctrl2: false + drain_ctrl3: false + tasks: + - import_role: + name: neutron-namespace-drain + tasks_from: main.yml + when: drain_ctrl1 | bool or drain_ctrl2 | bool or drain_ctrl3 | bool or neutron_drain_host is defined + - name: "Print Info" + debug: + msg: + - "{{ neutron_drain_host }} is ready for maintenance" + - "rerun this playbook with -e maintenance=false to re-add" + - "routers" + when: maintenance | bool diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/defaults/main.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/defaults/main.yml new file mode 100644 index 000000000..2d67ee960 --- /dev/null +++ b/etc/kayobe/ansible/roles/neutron-namespace-drain/defaults/main.yml @@ -0,0 +1,8 @@ +--- +neutron_drain_venv: "{{ virtualenv_path }}/openstack" +neutron_drain_host: "{% if drain_ctrl1 | bool %}{{ groups['controllers'][0] }}{% elif drain_ctrl2 | bool %}{{ groups['controllers'][1] }}{% elif drain_ctrl3 | bool %}{{ groups['controllers'][2] }}{% endif %}" 
+neutron_drain_venv_python: "{{ 'python' ~ ansible_facts.python.version.major ~ '.' ~ ansible_facts.python.version.minor }}" +drain_ctrl1: false +drain_ctrl2: false +drain_ctrl3: false +drain_dhcp_agents: false diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-dhcp.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-dhcp.yml new file mode 100644 index 000000000..ed58159e8 --- /dev/null +++ b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-dhcp.yml @@ -0,0 +1,51 @@ +--- +- name: Query source SRC_DHCP_ID + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent list --host {{ neutron_drain_host }} + --agent-type dhcp -f value -c ID + register: SRC_DHCP_ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set fact containing SRC_DHCP_ID + set_fact: + DHCP_SRC_ID: "{{ SRC_DHCP_ID.stdout }}" + +- name: Enable DHCP agent + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent set "{{ DHCP_SRC_ID }}" --enable + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Get Network IDs + command: > + {{ neutron_drain_venv }}/bin/openstack + network list -f value -c ID + register: network_ids + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set Network IDs + set_fact: + NETWORK_IDS: "{{ network_ids.stdout_lines }}" + +- name: Add DHCP agent + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent add network {{ DHCP_SRC_ID }} {{ item }} --dhcp + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + with_items: + - "{{ NETWORK_IDS }}" + 
loop_control: + pause: 10 diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-new-l3.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-new-l3.yml new file mode 100644 index 000000000..0e5fd4f17 --- /dev/null +++ b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/add-new-l3.yml @@ -0,0 +1,43 @@ +--- +- name: Query L3_IDs + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent list --agent-type l3 -f value -c ID + register: L3_ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set fact containing SRC_L3_IDs + set_fact: + L3_IDS: "{{ L3_ID.stdout_lines }}" + +- name: Get agents for each router + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent list --router {{ router_id }} --agent-type l3 -f value -c ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + failed_when: false + register: ROUTER_L3_IDS + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + loop: "{{ ROUTER_IDS }}" + loop_control: + loop_var: router_id + +- name: Add agent to router + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent add router --l3 {{ L3_ADD }} {{ item.router_id }} + loop: "{{ ROUTER_L3_IDS.results }}" + loop_control: + label: "{{ item.router_id }}" + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + L3_ADD_DIFF: "{{ L3_IDS | difference([L3_SRC_ID]) | difference(item.stdout_lines) }}" + L3_ADD: "{{ L3_ADD_DIFF[:1] | first }}" + when: L3_ADD_DIFF | length > 0 diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-dhcp.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-dhcp.yml new file mode 100644 index 000000000..ba28edc57 --- /dev/null +++ 
b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-dhcp.yml @@ -0,0 +1,87 @@ +--- +- name: Query source SRC_DHCP_ID + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent list --host {{ neutron_drain_host }} + --agent-type dhcp -f value -c ID + register: SRC_DHCP_ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set fact containing SRC_DHCP_ID + set_fact: + DHCP_SRC_ID: "{{ SRC_DHCP_ID.stdout }}" + +- name: Get DHCP agent network IDs + command: > + {{ neutron_drain_venv }}/bin/openstack + network list --agent {{ DHCP_SRC_ID }} -f value -c ID + register: dhcp_agent_ids + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set DHCP agent network IDs + set_fact: + DHCP_AGENT_IDS: "{{ dhcp_agent_ids.stdout_lines }}" + +- name: Remove DHCP agent + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent remove network {{ DHCP_SRC_ID }} {{ item }} --dhcp + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + with_items: + - "{{ DHCP_AGENT_IDS }}" + +- name: Wait for no more dhcp agents to be attached to the host + block: + - name: Retry count + set_fact: + retry_count: "{{ 0 if retry_count is undefined or retry_count == 'reset' else retry_count | int + 1 }}" + max_retries: 20 + + - name: Verify dhcp agents exist + command: > + {{ neutron_drain_venv }}/bin/openstack + network list --agent {{ DHCP_SRC_ID }} -f value -c ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + register: agent_status + + - name: Fail if DHCP agent still attached + fail: + msg: a DHCP agent is still attached to "{{ 
DHCP_SRC_ID }}" + when: agent_status.stdout | length > 0 + + - name: Reset retry count after success + set_fact: + retry_count: reset + rescue: + - fail: + msg: | + Maximum retries waiting for DHCP agents to be detached reached + when: retry_count | int == max_retries + + - name: Reset retry counter if max retries reached (exit loop) + set_fact: + retry_count: reset + failed_when: retry_count == 'reset' + when: retry_count | int >= max_retries | int + +- name: Disable DHCP agent + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent set "{{ DHCP_SRC_ID }}" --disable + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + when: agent_status.stdout | length == 0 diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-l3.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-l3.yml new file mode 100644 index 000000000..eebd99d60 --- /dev/null +++ b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/drain-l3.yml @@ -0,0 +1,90 @@ +--- +- name: Query source SRC_L3_ID + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent list --host {{ neutron_drain_host }} + --agent-type l3 -f value -c ID + register: SRC_L3_ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set fact containing SRC_L3_ID + set_fact: + L3_SRC_ID: "{{ SRC_L3_ID.stdout }}" + +- name: Get Router IDs + command: > + {{ neutron_drain_venv }}/bin/openstack + router list --agent {{ L3_SRC_ID }} -f value -c ID + register: router_ids + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set Router IDs + set_fact: + ROUTER_IDS: "{{ router_ids.stdout_lines }}" + +- name: Add agents to router + include_tasks: add-new-l3.yml 
+ when: ROUTER_IDS | length > 0 + +- name: Remove router + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent remove router {{ L3_SRC_ID }} {{ item }} --l3 + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + with_items: + - "{{ ROUTER_IDS }}" + +- name: Wait for no more routers to be attached to the host + block: + - name: Retry count + set_fact: + retry_count: "{{ 0 if retry_count is undefined or retry_count == 'reset' else retry_count | int + 1 }}" + max_retries: 20 + + - name: Verify routers exist + command: > + {{ neutron_drain_venv }}/bin/openstack router list --agent {{ L3_SRC_ID }} -f value -c ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + register: agent_status + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + + - name: Fail if routers still attached + fail: + msg: a Router is still attached to agent "{{ L3_SRC_ID }}" + when: agent_status.stdout | length > 0 + + - name: Reset retry count after success + set_fact: + retry_count: reset + rescue: + - fail: + msg: | + Maximum retries waiting for routers to be detached reached + when: retry_count | int == max_retries + + - name: Reset retry counter if max retries reached (exit loop) + set_fact: + retry_count: reset + failed_when: retry_count == 'reset' + when: retry_count | int >= max_retries | int + +- name: Disable L3 agent + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent set "{{ L3_SRC_ID }}" --disable + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + when: agent_status.stdout | length == 0 + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/enable-l3.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/enable-l3.yml new file mode 100644 index 000000000..e6679c40b --- 
/dev/null +++ b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/enable-l3.yml @@ -0,0 +1,24 @@ +--- +- name: Query source SRC_L3_ID + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent list --host {{ neutron_drain_host }} + --agent-type l3 -f value -c ID + register: SRC_L3_ID + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" + +- name: Set fact containing SRC_L3_ID + set_fact: + L3_SRC_ID: "{{ SRC_L3_ID.stdout }}" + +- name: Enable L3 agent + command: > + {{ neutron_drain_venv }}/bin/openstack + network agent set "{{ L3_SRC_ID }}" --enable + environment: "{{ openstack_auth_env }}" + delegate_to: "{{ neutron_drain_host }}" + vars: + ansible_host: "{{ hostvars[neutron_drain_host].ansible_host }}" diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/main.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/main.yml new file mode 100644 index 000000000..c33f72731 --- /dev/null +++ b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/main.yml @@ -0,0 +1,19 @@ +--- +- name: "Setup OpenStack venv" + import_tasks: setup.yml + +- name: "Drain L3 agents" + import_tasks: drain-l3.yml + when: maintenance | bool + +- name: "Enable L3 agent" + import_tasks: enable-l3.yml + when: not maintenance | bool + +- name: "Drain DHCP agents" + import_tasks: drain-dhcp.yml + when: maintenance | bool and drain_dhcp_agents | bool + +- name: "Add DHCP agents" + import_tasks: add-dhcp.yml + when: not maintenance | bool and drain_dhcp_agents | bool diff --git a/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/setup.yml b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/setup.yml new file mode 100644 index 000000000..909c4959f --- /dev/null +++ b/etc/kayobe/ansible/roles/neutron-namespace-drain/tasks/setup.yml @@ -0,0 +1,31 @@ +--- + +- name: Ensure the latest version of pip is installed + pip: + name: + - pip + state: latest + 
virtualenv: "{{ neutron_drain_venv }}" + virtualenv_command: "{{ neutron_drain_venv_python }} -m venv" + run_once: true + become: true + delegate_to: "{{ neutron_drain_host }}" + vars: + # NOTE: Without this, the delegate ansible_host variable will not + # be respected when using delegate_to. + ansible_host: "{{ neutron_drain_host }}" + +- name: Set up openstack cli virtualenv + pip: + virtualenv: "{{ neutron_drain_venv }}" + name: + - python-openstackclient + state: latest + extra_args: "{% if pip_upper_constraints_file %}-c {{ pip_upper_constraints_file }}{% endif %}" + run_once: true + become: true + delegate_to: "{{ neutron_drain_host }}" + vars: + # NOTE: Without this, the delegate ansible_host variable will not + # be respected when using delegate_to. + ansible_host: "{{ neutron_drain_host }}"