From cc818fcf090efeb719aed47f74442fba0fd8c0d5 Mon Sep 17 00:00:00 2001 From: lixiaoyuner <35456895+lixiaoyuner@users.noreply.github.com> Date: Tue, 18 Jul 2023 05:20:48 +0800 Subject: [PATCH] [ctrmgr]: Container image clean up bug fix (#15772) (#15841) Why I did it When do clean up container images, current code has two bugs need to be fixed. And some variables' name maybe cause confused, change the variables' name. Work item tracking Microsoft ADO (number only): 24502294 How I did it We do clean up after tag latest successfully. But currently tag latest function only return 0 and 1, 0 means succeed and 1 means failed, when we get 1, we will retry, when we get 0, we will do clean up. Actually the code 0 includes another case we don't need to do clean up. The case is that when we are doing tag latest, the container image we want to tag maybe not running, so we can not tag latest and don't need to cleanup, we need to separate this case from 0, return -1 now. When local mode(v1) -> kube mode(v2) happens, one problem is how to handle the local image, there are two cases. one case is that there was one kube v1 container dry-run(cause we don't relace the local if kube version = local version), we will remove the kube v1 image and tag the local version with ACR prefix and remove local v1 local tag. Another case is that there was no kube v1 container dry-run, we remove the local v1 image directly, cause the local v1 image should not be the last desire version. About the docker_id variable, it may cause confused, it's actually docker image id, so rename the variable. About the two dicts and the list, rename them to be more readable. How to verify it Check tag latest and image clean up result. --- src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py | 8 ++- src/sonic-ctrmgrd/ctrmgr/kube_commands.py | 53 +++++++++++-------- src/sonic-ctrmgrd/tests/kube_commands_test.py | 23 +++++++- 3 files changed, 57 insertions(+), 27 deletions(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py index 028ced0c2176..aaee7ce3b322 100755 --- a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py +++ b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py @@ -581,7 +581,7 @@ def on_state_update(self, key, op, data): def do_tag_latest(self, feat, docker_id, image_ver): ret = kube_commands.tag_latest(feat, docker_id, image_ver) - if ret != 0: + if ret == 1: # Tag latest failed. Retry after an interval self.start_time = datetime.datetime.now() self.start_time += datetime.timedelta( @@ -590,7 +590,7 @@ def do_tag_latest(self, feat, docker_id, image_ver): log_debug("Tag latest as local failed retry after {} seconds @{}". format(remote_ctr_config[TAG_RETRY], self.start_time)) - else: + elif ret == 0: last_version = self.st_data[feat][ST_FEAT_CTR_STABLE_VER] if last_version == image_ver: last_version = self.st_data[feat][ST_FEAT_CTR_LAST_VER] @@ -600,6 +600,10 @@ def do_tag_latest(self, feat, docker_id, image_ver): self.st_data[ST_FEAT_CTR_LAST_VER] = last_version self.st_data[ST_FEAT_CTR_STABLE_VER] = image_ver self.do_clean_image(feat, image_ver, last_version) + elif ret == -1: + # This means the container we want to tag latest is not running + # so we don't need to do clean up + pass def do_clean_image(self, feat, current_version, last_version): ret = kube_commands.clean_image(feat, current_version, last_version) diff --git a/src/sonic-ctrmgrd/ctrmgr/kube_commands.py b/src/sonic-ctrmgrd/ctrmgr/kube_commands.py index fe37c0c8424c..0f94c9249d9d 100755 --- a/src/sonic-ctrmgrd/ctrmgr/kube_commands.py +++ b/src/sonic-ctrmgrd/ctrmgr/kube_commands.py @@ -478,7 +478,7 @@ def tag_latest(feat, docker_id, image_ver): else: log_error(err) elif ret == -1: - ret = 0 + log_debug(out) else: log_error(err) return ret @@ -487,31 +487,38 @@ def _do_clean(feat, current_version, last_version): err = "" out = "" ret = 0 - DOCKER_ID = "docker_id" + IMAGE_ID = "image_id" REPO = "repo" _, image_info, err = _run_command("docker images |grep {} |grep -v latest |awk '{{print $1,$2,$3}}'".format(feat)) if image_info: - version_dict = {} - version_dict_default = {} + remote_image_version_dict = {} + local_image_version_dict = {} for info in image_info.split("\n"): - rep, version, docker_id = info.split() + rep, version, image_id = info.split() if len(rep.split("/")) == 1: - version_dict_default[version] = {DOCKER_ID: docker_id, REPO: rep} + local_image_version_dict[version] = {IMAGE_ID: image_id, REPO: rep} else: - version_dict[version] = {DOCKER_ID: docker_id, REPO: rep} + remote_image_version_dict[version] = {IMAGE_ID: image_id, REPO: rep} - if current_version in version_dict: - image_prefix = version_dict[current_version][REPO] - del version_dict[current_version] + if current_version in remote_image_version_dict: + image_prefix = remote_image_version_dict[current_version][REPO] + del remote_image_version_dict[current_version] else: out = "Current version {} doesn't exist.".format(current_version) ret = 0 return ret, out, err - # should be only one item in version_dict_default - for k, v in version_dict_default.items(): - local_version, local_repo, local_docker_id = k, v[REPO], v[DOCKER_ID] - tag_res, _, err = _run_command("docker tag {} {}:{} && docker rmi {}:{}".format( - local_docker_id, image_prefix, local_version, local_repo, local_version)) + # should be only one item in local_image_version_dict + for k, v in local_image_version_dict.items(): + local_version, local_repo, local_image_id = k, v[REPO], v[IMAGE_ID] + # if there is a kube image with same version, need to remove the kube version + # and tag the local version to kube version for fallback preparation + # and remove the local version + if local_version in remote_image_version_dict: + tag_res, _, err = _run_command("docker rmi {}:{} && docker tag {} {}:{} && docker rmi {}:{}".format( + image_prefix, local_version, local_image_id, image_prefix, local_version, local_repo, local_version)) + # if there is no kube image with same version, just remove the local version + else: + tag_res, _, err = _run_command("docker rmi {}:{}".format(local_repo, local_version)) if tag_res == 0: msg = "Tag {} local version images successfully".format(feat) log_debug(msg) @@ -519,13 +526,13 @@ def _do_clean(feat, current_version, last_version): ret = 1 err = "Failed to tag {} local version images. Err: {}".format(feat, err) return ret, out, err - - if last_version in version_dict: - del version_dict[last_version] - - versions = [item[DOCKER_ID] for item in version_dict.values()] - if versions: - clean_res, _, err = _run_command("docker rmi {} --force".format(" ".join(versions))) + + if last_version in remote_image_version_dict: + del remote_image_version_dict[last_version] + + image_id_remove_list = [item[IMAGE_ID] for item in remote_image_version_dict.values()] + if image_id_remove_list: + clean_res, _, err = _run_command("docker rmi {} --force".format(" ".join(image_id_remove_list))) else: clean_res = 0 if clean_res == 0: @@ -534,7 +541,7 @@ def _do_clean(feat, current_version, last_version): err = "Failed to clean {} old version images. Err: {}".format(feat, err) ret = 1 else: - err = "Failed to docker images |grep {} |awk '{{print $3}}'".format(feat) + err = "Failed to docker images |grep {} |awk '{{print $3}}'. Error: {}".format(feat, err) ret = 1 return ret, out, err diff --git a/src/sonic-ctrmgrd/tests/kube_commands_test.py b/src/sonic-ctrmgrd/tests/kube_commands_test.py index 40b8ab180b9a..b224458b3676 100755 --- a/src/sonic-ctrmgrd/tests/kube_commands_test.py +++ b/src/sonic-ctrmgrd/tests/kube_commands_test.py @@ -266,7 +266,7 @@ }, 2: { common_test.DESCR: "Tag a unstable container", - common_test.RETVAL: 0, + common_test.RETVAL: -1, common_test.ARGS: ["snmp", "123456", "v1"], common_test.PROC_CMD: [ "docker ps |grep 123456" @@ -382,7 +382,7 @@ common_test.ARGS: ["snmp", "20201231.84", ""], common_test.PROC_CMD: [ "docker images |grep snmp |grep -v latest |awk '{print $1,$2,$3}'", - "docker tag 507f8d28bf6e sonick8scue.azurecr.io/docker-sonic-telemetry:20201231.74 && docker rmi docker-sonic-telemetry:20201231.74" + "docker rmi docker-sonic-telemetry:20201231.74" ], common_test.PROC_OUT: [ "docker-sonic-telemetry 20201231.74 507f8d28bf6e\n\ @@ -394,6 +394,25 @@ 0 ] }, + 5: { + common_test.DESCR: "Clean image successfuly(local to dry-kube to kube)", + common_test.RETVAL: 0, + common_test.ARGS: ["snmp", "20201231.84", "20201231.74"], + common_test.PROC_CMD: [ + "docker images |grep snmp |grep -v latest |awk '{print $1,$2,$3}'", + "docker rmi sonick8scue.azurecr.io/docker-sonic-telemetry:20201231.74 && docker tag 507f8d28bf6e sonick8scue.azurecr.io/docker-sonic-telemetry:20201231.74 && docker rmi docker-sonic-telemetry:20201231.74" + ], + common_test.PROC_OUT: [ + "docker-sonic-telemetry 20201231.74 507f8d28bf6e\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.74 507f8d28bf6f\n\ + sonick8scue.azurecr.io/docker-sonic-telemetry 20201231.84 507f8d28bf6g", + "" + ], + common_test.PROC_CODE: [ + 0, + 0 + ] + }, } class TestKubeCommands(object):