Skip to content

Commit

Permalink
Enhance dr_helpers and dr_workload to support VRG preservation in ODF…
Browse files Browse the repository at this point in the history
… 4.18 and AppSet workload health check (#11141)

* Update dr_helpers functions to skip waiting for VRG deletion from ODF 4.18 onward.
Improve VRG checks for discovered applications.

Signed-off-by: Sidhant Agrawal <[email protected]>

* Update AppSet workload deployment verification to check health status instead of phase

Signed-off-by: Sidhant Agrawal <[email protected]>

---------

Signed-off-by: Sidhant Agrawal <[email protected]>
  • Loading branch information
sidhant-agrawal authored Jan 17, 2025
1 parent 899036f commit 59bcd00
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 20 deletions.
34 changes: 25 additions & 9 deletions ocs_ci/helpers/dr_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,8 +524,13 @@ def check_vrg_state(state, namespace):

# Skip state check if resource was deleted
if len(vrg_list) == 0 and state.lower() == "secondary":
logger.info("VRG resource not found, skipping state check")
return True
ocs_version = version.get_semantic_ocs_version_from_config()
if ocs_version <= version.VERSION_4_17:
logger.info("VRG resource not found, skipping state check")
return True
else:
logger.info("VRG resource not found")
return False

vrg_name = vrg_list[0]["metadata"]["name"]
desired_state = vrg_list[0]["spec"]["replicationState"]
Expand Down Expand Up @@ -617,7 +622,9 @@ def wait_for_replication_resources_creation(
raise TimeoutExpiredError(error_msg)


def wait_for_replication_resources_deletion(namespace, timeout, check_state=True):
def wait_for_replication_resources_deletion(
namespace, timeout, check_state=True, discovered_apps=False
):
"""
Wait for replication resources to be deleted
Expand All @@ -626,11 +633,13 @@ def wait_for_replication_resources_deletion(namespace, timeout, check_state=True
timeout (int): time in seconds to wait for resources to reach expected
state or deleted
check_state (bool): True for checking resources state before deletion, False otherwise
discovered_apps (bool): If true then deployed workload is discovered_apps
Raises:
TimeoutExpiredError: In case replication resources not deleted
"""
vrg_namespace = constants.DR_OPS_NAMESAPCE if discovered_apps else namespace
# TODO: Improve the parameter for condition
if "cephfs" in namespace:
resource_kind = constants.REPLICATION_SOURCE
Expand Down Expand Up @@ -660,7 +669,7 @@ def wait_for_replication_resources_deletion(namespace, timeout, check_state=True
sleep=5,
func=check_vrg_state,
state="secondary",
namespace=namespace,
namespace=vrg_namespace,
)
if not sample.wait_for_func_status(result=True):
error_msg = (
Expand All @@ -669,10 +678,13 @@ def wait_for_replication_resources_deletion(namespace, timeout, check_state=True
logger.info(error_msg)
raise TimeoutExpiredError(error_msg)

if "cephfs" not in namespace:
ocs_version = version.get_semantic_ocs_version_from_config()
if not check_state or (
ocs_version <= version.VERSION_4_17 and "cephfs" not in namespace
):
logger.info("Waiting for VRG to be deleted")
sample = TimeoutSampler(
timeout=timeout, sleep=5, func=check_vrg_existence, namespace=namespace
timeout=timeout, sleep=5, func=check_vrg_existence, namespace=vrg_namespace
)
if not sample.wait_for_func_status(result=False):
error_msg = "VRG resource not deleted"
Expand Down Expand Up @@ -734,7 +746,10 @@ def wait_for_all_resources_creation(


def wait_for_all_resources_deletion(
namespace, check_replication_resources_state=True, timeout=1000
namespace,
check_replication_resources_state=True,
timeout=1000,
discovered_apps=False,
):
"""
Wait for workload and replication resources to be deleted
Expand All @@ -743,6 +758,7 @@ def wait_for_all_resources_deletion(
namespace (str): the namespace of the workload
check_replication_resources_state (bool): True for checking replication resources state, False otherwise
timeout (int): time in seconds to wait for resource deletion
discovered_apps (bool): If true then deployed workload is discovered_apps
"""
logger.info("Waiting for all pods to be deleted")
Expand All @@ -754,7 +770,7 @@ def wait_for_all_resources_deletion(
)

wait_for_replication_resources_deletion(
namespace, timeout, check_replication_resources_state
namespace, timeout, check_replication_resources_state, discovered_apps
)

if not (
Expand Down Expand Up @@ -1638,7 +1654,7 @@ def do_discovered_apps_cleanup(
config.switch_to_cluster_by_name(old_primary)
workload_path = constants.DR_WORKLOAD_REPO_BASE_DIR + "/" + workload_dir
run_cmd(f"oc delete -k {workload_path} -n {workload_namespace} --wait=false")
wait_for_all_resources_deletion(namespace=workload_namespace)
wait_for_all_resources_deletion(namespace=workload_namespace, discovered_apps=True)
config.switch_acm_ctx()
drpc_obj.wait_for_progression_status(status=constants.STATUS_COMPLETED)
config.switch_ctx(restore_index)
Expand Down
47 changes: 36 additions & 11 deletions ocs_ci/ocs/dr/dr_workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@
CommandFailed,
UnexpectedBehaviour,
ResourceNotDeleted,
ResourceWrongStatusException,
)
from ocs_ci.ocs.resources.pod import get_all_pods
from ocs_ci.ocs.utils import get_primary_cluster_config, get_non_acm_cluster_config
from ocs_ci.utility import templating
from ocs_ci.utility.utils import clone_repo, run_cmd
from ocs_ci.utility.utils import clone_repo, run_cmd, TimeoutSampler

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -614,13 +615,33 @@ def verify_workload_deployment(self):
)

if self.appset_model == "pull":
appset_pull_obj = ocp.OCP(
kind=constants.APPLICATION_ARGOCD,
resource_name=appset_resource_name,
namespace=constants.GITOPS_CLUSTER_NAMESPACE,
sampler = TimeoutSampler(
120, sleep=5, func=self.check_workload_health_status
)
appset_pull_obj._has_phase = True
appset_pull_obj.wait_for_phase(phase="Succeeded", timeout=120)
if not sampler.wait_for_func_status(True):
raise ResourceWrongStatusException(
f"{appset_resource_name} health status is not Healthy"
)

def check_workload_health_status(self):
    """
    Check whether the workload's Application resource reports a Healthy status.

    The resource of kind ``constants.APPLICATION_ARGOCD`` named
    "<applicationset name>-<preferred primary cluster>" is fetched from
    ``constants.GITOPS_CLUSTER_NAMESPACE`` and its ``status.health.status``
    field is compared against "Healthy".

    Returns:
        bool: True if the health status is "Healthy", False otherwise
            (including when the resource does not report any health
            status yet)

    """
    appset_resource_name = (
        self._get_applicaionset_name() + "-" + self.preferred_primary_cluster
    )
    appset_obj = ocp.OCP(
        kind=constants.APPLICATION_ARGOCD,
        resource_name=appset_resource_name,
        namespace=constants.GITOPS_CLUSTER_NAMESPACE,
    )
    # The "status" and "health" sections may be absent (or null) on the
    # fetched resource; a chained .get() would then raise AttributeError on
    # None. Treat a missing section as "not healthy" instead.
    resource = appset_obj.get() or {}
    health = (resource.get("status") or {}).get("health") or {}
    health_status = health.get("status")
    log.info(f"{appset_resource_name} health status: {health_status}")
    return health_status == "Healthy"

def check_pod_pvc_status(self, skip_replication_resources=False):
"""
Expand Down Expand Up @@ -1258,14 +1279,16 @@ def delete_workload(self):
)

for cluster in get_non_acm_cluster_config():
log.info(f"Deleting Workload from {cluster}")
config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
log.info(f"Deleting workload from {cluster.ENV_DATA['cluster_name']}")
run_cmd(
f"oc delete -k {self.workload_path} -n {self.workload_namespace}",
ignore_error=True,
)
dr_helpers.wait_for_all_resources_deletion(
namespace=self.workload_namespace
namespace=self.workload_namespace,
check_replication_resources_state=False,
discovered_apps=True,
)
run_cmd(f"oc delete project {self.workload_namespace}")

Expand Down Expand Up @@ -1569,13 +1592,15 @@ def delete_workload(self):
)

for cluster in get_non_acm_cluster_config():
log.info(f"Deleting Workload from {cluster}")
config.switch_ctx(cluster.MULTICLUSTER["multicluster_index"])
log.info(f"Deleting workload from {cluster.ENV_DATA['cluster_name']}")
run_cmd(
f"oc delete -k {self.workload_path} -n {self.workload_namespace}",
ignore_error=True,
)
dr_helpers.wait_for_all_resources_deletion(
namespace=self.workload_namespace
namespace=self.workload_namespace,
check_replication_resources_state=False,
discovered_apps=True,
)
run_cmd(f"oc delete project {self.workload_namespace}")

0 comments on commit 59bcd00

Please sign in to comment.