From a619d5cd34fd178458264ab83ac8b6efdd22511a Mon Sep 17 00:00:00 2001
From: Akarsha-rai
Date: Mon, 29 Apr 2024 16:27:59 +0530
Subject: [PATCH] Verify that the cluster remains accessible with no DU/DL after failures

Signed-off-by: Akarsha-rai
---
 ...no_data_loss_and_corruption_on_failures.py | 158 ++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py

diff --git a/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py
new file mode 100644
index 000000000000..d6f5fdb29630
--- /dev/null
+++ b/tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py
@@ -0,0 +1,158 @@
+import logging
+import pytest
+import time
+
+
+from ocs_ci.framework.pytest_customization.marks import tier4a, turquoise_squad
+from ocs_ci.framework import config
+from ocs_ci.ocs.dr.dr_workload import validate_data_integrity
+from ocs_ci.ocs import constants
+from ocs_ci.ocs.node import wait_for_nodes_status, get_node_objs
+from ocs_ci.ocs.resources.pod import restart_pods_having_label
+from ocs_ci.helpers.dr_helpers import (
+    set_current_primary_cluster_context,
+    set_current_secondary_cluster_context,
+    get_current_primary_cluster_name,
+    get_active_acm_index,
+)
+from ocs_ci.utility import vsphere
+from ocs_ci.utility.utils import ceph_health_check
+
+logger = logging.getLogger(__name__)
+
+
+@tier4a
+@turquoise_squad
+class TestNoDataLossAndDataCorruptionOnFailures:
+    """
+    The objective of this test case is to make sure that the MDR cluster remains accessible and
+    no DU/DL/DC is observed when the following failures are induced while supported applications are running:
+
+    1) Noobaa pod failures - repeated at least 5-7 times
+    2) Rolling reboot of the nodes in all zones, one at a time
+    3) RHCS node failures
+        a. 1 RHCS node in one zone
+        b. All the RHCS nodes in one zone
+        c. All the RHCS nodes in one zone - repeated to mimic the Santa Lucia issue
+
+    """
+
+    @pytest.mark.polarion_id("OCS-XXXX")
+    def test_no_data_loss_and_data_corruption_on_failures(
+        self, setup_acm_ui, nodes_multicluster, dr_workload
+    ):
+
+        # Deploy a Subscription-based application
+        sub = dr_workload(num_of_subscription=1)[0]
+        self.namespace = sub.workload_namespace
+        self.workload_type = sub.workload_type
+
+        # Deploy an ApplicationSet-based application
+        appset = dr_workload(num_of_subscription=0, num_of_appset=1)[0]
+
+        # Workloads list
+        workloads = [sub, appset]
+
+        # Get the primary managed cluster where the applications are running
+        set_current_primary_cluster_context(self.namespace)
+        self.primary_cluster_name = get_current_primary_cluster_name(
+            namespace=self.namespace
+        )
+
+        # Validate data integrity
+        for wl in workloads:
+            config.switch_to_cluster_by_name(self.primary_cluster_name)
+            validate_data_integrity(wl.workload_namespace)
+
+        # Restart the Noobaa pods at least 5 times and verify the data integrity
+        for _ in range(5):
+            restart_pods_having_label(label=constants.NOOBAA_APP_LABEL)
+        for wl in workloads:
+            config.switch_to_cluster_by_name(self.primary_cluster_name)
+            validate_data_integrity(wl.workload_namespace)
+
+        # Get the nodes from the active hub zone (hub, managed cluster and RHCS nodes)
+        config.switch_ctx(get_active_acm_index())
+        active_hub_index = config.cur_index
+        zone = config.ENV_DATA.get("zone")
+        active_hub_cluster_node_objs = get_node_objs()
+        set_current_primary_cluster_context(self.namespace)
+        if config.ENV_DATA.get("zone") == zone:
+            managed_cluster_index = config.cur_index
+            managed_cluster_node_objs = get_node_objs()
+        else:
+            set_current_secondary_cluster_context(self.namespace)
+            managed_cluster_index = config.cur_index
+            managed_cluster_node_objs = get_node_objs()
+        external_cluster_node_roles = config.EXTERNAL_MODE.get(
+            "external_cluster_node_roles"
+        )
+        ceph_node_ips = []
+        for ceph_node in external_cluster_node_roles:
+            if (
+                external_cluster_node_roles[ceph_node].get("location").get("datacenter")
+                != "zone-b"
+            ):
+                continue
+            else:
+                ceph_node_ips.append(
+                    external_cluster_node_roles[ceph_node].get("ip_address")
+                )
+
+        # Reboot all the nodes from the active hub zone, one zone at a time
+        wait_time = 120
+        logger.info("Restarting all the nodes from the active hub zone")
+        nodes_multicluster[managed_cluster_index].restart_nodes(
+            managed_cluster_node_objs
+        )
+        nodes_multicluster[active_hub_index].restart_nodes(active_hub_cluster_node_objs)
+        host = config.ENV_DATA["vsphere_server"]
+        user = config.ENV_DATA["vsphere_user"]
+        password = config.ENV_DATA["vsphere_password"]
+        vm_objs = vsphere.VSPHERE(host, user, password)
+        ceph_vms = [
+            vm_objs.get_vm_by_ip(ip=each_ip, dc="None") for each_ip in ceph_node_ips
+        ]
+        vm_objs.restart_vms(vms=ceph_vms)
+        logger.info(
+            "All the nodes from the active hub zone have been restarted. "
+            f"Waiting {wait_time} seconds for the nodes to come up"
+        )
+        time.sleep(wait_time)
+        wait_for_nodes_status([node.name for node in managed_cluster_node_objs])
+        wait_for_nodes_status([node.name for node in active_hub_cluster_node_objs])
+        # Validate that the Ceph health is OK
+        ceph_health_check(tries=40, delay=30)
+
+        # Verify the data integrity of the applications again
+        for wl in workloads:
+            config.switch_to_cluster_by_name(self.primary_cluster_name)
+            validate_data_integrity(wl.workload_namespace)
+
+        # RHCS node failures
+        # 1 RHCS node in one zone
+        vm_objs.restart_vms(vms=[ceph_vms[0]])
+        time.sleep(wait_time)
+        # Validate that the Ceph health is OK
+        ceph_health_check(tries=40, delay=30)
+
+        # All the RHCS nodes in one zone
+        vm_objs.restart_vms(vms=ceph_vms)
+        time.sleep(wait_time)
+        # Validate that the Ceph health is OK
+        ceph_health_check(tries=40, delay=30)
+
+        # All the RHCS nodes in one zone - repeated to mimic the Santa Lucia issue
+        for _ in range(10):
+            vm_objs.restart_vms(vms=ceph_vms)
+            logger.info(
+                f"Waiting {wait_time} seconds before the next restart of the RHCS nodes"
+            )
+            time.sleep(wait_time)
+        # Validate that the Ceph health is OK
+        ceph_health_check(tries=120, delay=30)
+
+        # Verify the data integrity of the applications again
+        for wl in workloads:
+            config.switch_to_cluster_by_name(self.primary_cluster_name)
+            validate_data_integrity(wl.workload_namespace)
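Reviewer note: the test above applies the same fault-injection pattern several times (restart something, wait, confirm Ceph health, then re-check data integrity). The sketch below condenses that cycle into a small standalone helper purely for illustration; it is not part of the patch, the helper name restart_and_verify and its parameters are hypothetical, and it relies only on the ceph_health_check utility already imported by the test.

    import logging
    import time

    from ocs_ci.utility.utils import ceph_health_check

    logger = logging.getLogger(__name__)


    def restart_and_verify(restart_fn, iterations=1, wait_seconds=120, health_tries=40):
        """Inject a restart-type failure repeatedly and confirm Ceph recovers.

        restart_fn   -- zero-argument callable performing one restart,
                        e.g. lambda: vm_objs.restart_vms(vms=ceph_vms)
        iterations   -- how many times to repeat the failure
        wait_seconds -- pause after each injection so the nodes can come back
        health_tries -- retry count passed through to ceph_health_check
        """
        for attempt in range(1, iterations + 1):
            logger.info("Failure injection %d of %d", attempt, iterations)
            restart_fn()
            time.sleep(wait_seconds)
        # After the last injection the cluster must report HEALTH_OK
        ceph_health_check(tries=health_tries, delay=30)

With such a helper, the repeated "Santa Lucia" loop in the test would reduce to a single call like restart_and_verify(lambda: vm_objs.restart_vms(vms=ceph_vms), iterations=10, health_tries=120).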