Commit
To verify that the cluster remains accessible and NO DU/DL after failures

Signed-off-by: Akarsha-rai <[email protected]>
1 parent 82ee168 commit a619d5c
Showing 1 changed file with 158 additions and 0 deletions.
tests/functional/disaster-recovery/metro-dr/test_no_data_loss_and_corruption_on_failures.py
@@ -0,0 +1,158 @@
import logging
import pytest
import time


from ocs_ci.framework.pytest_customization.marks import tier4a
from ocs_ci.framework import config
from ocs_ci.ocs.dr.dr_workload import validate_data_integrity
from ocs_ci.ocs import constants
from ocs_ci.ocs.node import wait_for_nodes_status, get_node_objs
from ocs_ci.ocs.resources.pod import restart_pods_having_label
from ocs_ci.helpers.dr_helpers import (
    set_current_primary_cluster_context,
    set_current_secondary_cluster_context,
    get_current_primary_cluster_name,
    get_active_acm_index,
)
from ocs_ci.framework.pytest_customization.marks import turquoise_squad
from ocs_ci.utility import vsphere
from ocs_ci.utility.utils import ceph_health_check


logger = logging.getLogger(__name__)


@tier4a
@turquoise_squad
class TestNoDataLossAndDataCorruptionOnFailures:
""" | ||
The Objective of this test cases is to make sure that the MDR cluster remains accessible | ||
and NO DU/DL/DC is observed when following Failures are induced with supported applications are running | ||
1) Noobaa pods failures - repeat at least 5-7 times | ||
2) Rolling reboot of the nodes in all zones one at a time | ||
3) RHCS nodes failures | ||
a. 1 RHCS node in one zone | ||
b. All the RHCS nodes in one zone | ||
c. All the RHCS nodes in one zone - Repeated to mimic Santa lucia issue | ||
""" | ||
|
||
@pytest.mark.polarion_id("OCS-XXXX") | ||
def test_no_data_loss_and_data_corruption_on_failures( | ||
self, setup_acm_ui, nodes_multicluster, dr_workload | ||
): | ||
|
||
# Deploy Subscription based application | ||
sub = dr_workload(num_of_subscription=1)[0] | ||
self.namespace = sub.workload_namespace | ||
self.workload_type = sub.workload_type | ||
|
||
# Deploy AppSet based application | ||
appset = dr_workload(num_of_subscription=0, num_of_appset=1)[0] | ||
|
||
# Workloads list | ||
workloads = [sub, appset] | ||
|
||
# Create application on Primary managed cluster | ||
set_current_primary_cluster_context(self.namespace) | ||
self.primary_cluster_name = get_current_primary_cluster_name( | ||
namespace=self.namespace | ||
) | ||
|
||
# Validate data integrity | ||
for wl in workloads: | ||
config.switch_to_cluster_by_name(self.primary_cluster_name) | ||
validate_data_integrity(wl.workload_namespace) | ||
|
||
        # Restart the Noobaa pods at least 5 times and verify the data integrity
        for _ in range(5):
            restart_pods_having_label(label=constants.NOOBAA_APP_LABEL)
        for wl in workloads:
            config.switch_to_cluster_by_name(self.primary_cluster_name)
            validate_data_integrity(wl.workload_namespace)

        # Get the nodes from one active zone
        config.switch_ctx(get_active_acm_index())
        active_hub_index = config.cur_index
        zone = config.ENV_DATA.get("zone")
        active_hub_cluster_node_objs = get_node_objs()
        set_current_primary_cluster_context(self.namespace)
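        # Select the managed cluster that shares the zone with the active hub:
        # use the current primary cluster if its zone matches, otherwise switch
        # to the secondary managed cluster context.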
        if config.ENV_DATA.get("zone") == zone:
            managed_cluster_index = config.cur_index
            managed_cluster_node_objs = get_node_objs()
        else:
            set_current_secondary_cluster_context(self.namespace)
            managed_cluster_index = config.cur_index
            managed_cluster_node_objs = get_node_objs()
        external_cluster_node_roles = config.EXTERNAL_MODE.get(
            "external_cluster_node_roles"
        )
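        # Collect the IP addresses of the external RHCS (Ceph) nodes located in
        # the "zone-b" datacenter; these are the VMs restarted later in the test.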
        ceph_node_ips = []
        for ceph_node in external_cluster_node_roles:
            if (
                external_cluster_node_roles[ceph_node].get("location").get("datacenter")
                == "zone-b"
            ):
                ceph_node_ips.append(
                    external_cluster_node_roles[ceph_node].get("ip_address")
                )

        # Rolling reboot of the nodes in all zones one at a time
        wait_time = 120
        logger.info("Restarting all the nodes from the active hub zone")
        nodes_multicluster[managed_cluster_index].restart_nodes(
            managed_cluster_node_objs
        )
        nodes_multicluster[active_hub_index].restart_nodes(active_hub_cluster_node_objs)
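        # Restart the external RHCS (Ceph) VMs of the same zone through vSphere,
        # using the vSphere credentials from ENV_DATA.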
        host = config.ENV_DATA["vsphere_server"]
        user = config.ENV_DATA["vsphere_user"]
        password = config.ENV_DATA["vsphere_password"]
        vm_objs = vsphere.VSPHERE(host, user, password)
        ceph_vms = [
            vm_objs.get_vm_by_ip(ip=each_ip, dc="None") for each_ip in ceph_node_ips
        ]
        vm_objs.restart_vms(vms=ceph_vms)
        logger.info(
            "All nodes from the active hub zone are rebooted/restarted. "
            f"Waiting {wait_time} seconds for the nodes to come up"
        )
        time.sleep(wait_time)
        wait_for_nodes_status([node.name for node in managed_cluster_node_objs])
        wait_for_nodes_status([node.name for node in active_hub_cluster_node_objs])
        # Validate ceph health OK
        ceph_health_check(tries=40, delay=30)

        # Verify the data integrity of the applications again
        for wl in workloads:
            config.switch_to_cluster_by_name(self.primary_cluster_name)
            validate_data_integrity(wl.workload_namespace)

        # RHCS nodes failures
        # 1 RHCS node in one zone
        vm_objs.restart_vms(vms=[ceph_vms[0]])
        time.sleep(wait_time)
        # Validate ceph health OK
        ceph_health_check(tries=40, delay=30)

        # All the RHCS nodes in one zone
        vm_objs.restart_vms(vms=ceph_vms)
        time.sleep(wait_time)
        # Validate ceph health OK
        ceph_health_check(tries=40, delay=30)

        # All the RHCS nodes in one zone - repeated to mimic the Santa Lucia issue
        for _ in range(10):
            vm_objs.restart_vms(vms=ceph_vms)
            logger.info(
                f"Waiting {wait_time} seconds before the next restart of the ceph nodes in the zone"
            )
            time.sleep(wait_time)
        # Validate ceph health OK
        ceph_health_check(tries=120, delay=30)

        # Verify the data integrity of the applications again
        for wl in workloads:
            config.switch_to_cluster_by_name(self.primary_cluster_name)
            validate_data_integrity(wl.workload_namespace)