From fbdcf143c28cf0451baa669a27f75f2c98fa1f37 Mon Sep 17 00:00:00 2001 From: lixiaoyuner <35456895+lixiaoyuner@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:42:17 +0800 Subject: [PATCH] [k8s]: Bypass the systemd service restart limit and do immediately restart when change to local mode (#15432) Why I did it During the upgrade process via k8s, the feature's systemd service will restart as well, all of the feature systemd service has restart number limit, and the limit number is too small, only three times. if fallback happens when upgrade, the start count will be 2, just once again, the systemd service will be down. So, need to bypass this. This restart function will be called when do local -> kube, kube -> kube, kube ->local, each time call this function, we indeed need to restart successfully, so do reset-failed every time we do restart. When need to go back to local mode, we do systemd restart immediately without waiting the default restart interval time so that we can reduce the container down time. Work item tracking Microsoft ADO (number only): 24172368 How I did it Before every restart for upgrade, do reset feature's restart number. The restart number will be reset to 0 to bypass the restart limit. When need to go back to local mode, we do systemd restart immediately. How to verify it Feature's systemd service can be always restarted successfully during upgrade process via k8s. --- src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py | 15 +++++++++--- src/sonic-ctrmgrd/tests/ctrmgrd_test.py | 31 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py index 27648dd22d02..516db2ef05c4 100755 --- a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py +++ b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py @@ -6,7 +6,7 @@ import os import sys import syslog - +import subprocess from collections import defaultdict from ctrmgr.ctrmgr_iptables import iptable_proxy_rule_upd @@ -141,7 +141,7 @@ def ts_now(): def is_systemd_active(feat): if not UNIT_TESTING: - status = os.system('systemctl is-active --quiet {}'.format(feat)) + status = subprocess.call(['systemctl', 'is-active', '--quiet', str(feat)]) else: status = UNIT_TESTING_ACTIVE log_debug("system status for {}: {}".format(feat, str(status))) @@ -151,7 +151,8 @@ def is_systemd_active(feat): def restart_systemd_service(server, feat, owner): log_debug("Restart service {} to owner:{}".format(feat, owner)) if not UNIT_TESTING: - status = os.system("systemctl restart {}".format(feat)) + subprocess.call(["systemctl", "reset-failed", str(feat)]) + status = subprocess.call(["systemctl", "restart", str(feat)]) else: server.mod_db_entry(STATE_DB_NAME, FEATURE_TABLE, feat, {"restart": "true"}) @@ -551,6 +552,7 @@ def on_state_update(self, key, op, data): self.st_data[key] = _update_entry(dflt_st_feat, data) remote_state = self.st_data[key][ST_FEAT_REMOTE_STATE] + current_owner = self.st_data[key][ST_FEAT_OWNER] if (remote_state == REMOTE_RUNNING) and (old_remote_state != remote_state): # Tag latest @@ -563,6 +565,13 @@ def on_state_update(self, key, op, data): log_debug("try to tag latest label after {} seconds @{}".format( remote_ctr_config[TAG_IMAGE_LATEST], start_time)) + + # This is for going back to local without waiting the systemd restart time + # when k8s is down, can't deploy containers to worker and need to go back to local + # if current owner is already local, we don't do restart + if (current_owner != OWNER_LOCAL) and (remote_state == REMOTE_NONE) and (old_remote_state == REMOTE_STOPPED): + restart_systemd_service(self.server, key, OWNER_LOCAL) + return if (not init): if (old_remote_state == remote_state): diff --git a/src/sonic-ctrmgrd/tests/ctrmgrd_test.py b/src/sonic-ctrmgrd/tests/ctrmgrd_test.py index 0304985224ea..76651309ce6a 100755 --- a/src/sonic-ctrmgrd/tests/ctrmgrd_test.py +++ b/src/sonic-ctrmgrd/tests/ctrmgrd_test.py @@ -324,6 +324,37 @@ } } } + }, + 4: { + common_test.DESCR: "Restart immediately to go back to local when remote_state changes to none from stopped", + common_test.ARGS: "ctrmgrd", + common_test.PRE: { + common_test.STATE_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "remote_state": "stopped", + } + } + } + }, + common_test.UPD: { + common_test.STATE_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "remote_state": "none", + } + } + } + }, + common_test.POST: { + common_test.STATE_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "restart": "true" + } + } + } + } } }