Test and infra for replica1 feature
Signed-off-by: Aviadp <[email protected]>
AviadP committed Jun 24, 2024
1 parent 6e6d403 commit c137c61
Showing 5 changed files with 572 additions and 0 deletions.
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
@@ -204,6 +204,7 @@
CEPH_CLUSTER = "CephCluster"
EXTERNAL_CEPHCLUSTER_NAME = "ocs-external-storagecluster-cephcluster"
CEPH_CLUSTER_NAME = "ocs-storagecluster-cephcluster"
REPLICA1_STORAGECLASS = "ocs-storagecluster-ceph-non-resilient-rbd"
ENDPOINTS = "Endpoints"
WEBHOOK = "ValidatingWebhookConfiguration"
ROOK_CEPH_WEBHOOK = "rook-ceph-webhook"
286 changes: 286 additions & 0 deletions ocs_ci/ocs/replica_one.py
@@ -0,0 +1,286 @@
from logging import getLogger
from typing import List # To be removed when python 3.8 support is dropped

from ocs_ci.framework import config
from ocs_ci.ocs.resources.pod import (
get_pods_having_label,
get_ceph_tools_pod,
run_osd_removal_job,
)
from ocs_ci.ocs.ocp import OCP
from ocs_ci.ocs.constants import (
DEFAULT_CEPHBLOCKPOOL,
OSD_APP_LABEL,
CEPHBLOCKPOOL,
STORAGECLASS,
DEPLOYMENT,
STORAGECLUSTER,
STATUS_READY,
)
from ocs_ci.ocs.exceptions import CommandFailed


log = getLogger(__name__)

REPLICA1_STORAGECLASS = "ocs-storagecluster-ceph-non-resilient-rbd"


def get_failures_domain_name() -> List[str]:
"""
    Fetch failure domain names from the cephblockpool names
Returns:
list[str]: list with failure domain names
"""
cbp_object = OCP(kind=CEPHBLOCKPOOL, namespace=config.ENV_DATA["cluster_namespace"])
failure_domains = list()
cephblockpools_names = list()
prefix = DEFAULT_CEPHBLOCKPOOL
for i in range(0, len((cbp_object.data["items"]))):
cephblockpools_names.append(cbp_object.data["items"][i]["metadata"]["name"])

log.info(f"Cephblockpool names:{cephblockpools_names}")

for name in cephblockpools_names:
if name.startswith(prefix):
corrected_name = name[len(prefix) :].lstrip("-")
log.info(corrected_name)
if corrected_name:
failure_domains.append(corrected_name)

log.info(f"Failure domains:{failure_domains}")

return failure_domains


FAILURE_DOMAINS = config.ENV_DATA.get(
"worker_availability_zones", get_failures_domain_name()
)


def get_replica_1_osds() -> dict:
"""
    Gets the names and IDs of OSDs associated with replica1
Returns:
dict: osd name(str): osd id(str)
"""
replica1_osds = dict()
all_osds = get_pods_having_label(label=OSD_APP_LABEL)
for domain in FAILURE_DOMAINS:
for osd in all_osds:
if osd["metadata"]["labels"]["ceph.rook.io/DeviceSet"] == domain:
replica1_osds[osd["metadata"]["name"]] = osd["metadata"]["labels"][
"ceph-osd-id"
]
log.info(replica1_osds)
return replica1_osds


def get_replica1_osd_deployment() -> List[str]:
"""
Gets the names of OSD deployments associated with replica1
Returns:
list[str]: deployment names
"""
dep_obj = OCP(kind=DEPLOYMENT)
deployments = dep_obj.get()["items"]
replica1_osd_deployment = list()
osd_deployment = list()
for deployment in deployments:
if (
"metadata" in deployment
and "labels" in deployment["metadata"]
and "app.kubernetes.io/name" in deployment["metadata"]["labels"]
):
if deployment["metadata"]["labels"]["app.kubernetes.io/name"] == "ceph-osd":
osd_deployment.append(deployment)

for deployment in osd_deployment:
if (
deployment["metadata"]["labels"]["ceph.rook.io/DeviceSet"]
in FAILURE_DOMAINS
):
log.info(deployment["metadata"]["name"])
replica1_osd_deployment.append(deployment["metadata"]["name"])

return replica1_osd_deployment


def scaledown_deployment(deployments_name: List[str]) -> None:
"""
Scale down deployments to 0
Args:
deployments_name (list[str]): list of deployment names.
"""
deployment_obj = OCP(kind=DEPLOYMENT)
for deployment in deployments_name:
deployment_obj.exec_oc_cmd(f"scale deployment {deployment} --replicas=0")
log.info(f"scaling to 0: {deployment}")


def count_osd_pods() -> int:
"""
    Gets the OSD count in the cluster
Returns:
int: number of OSDs in cluster
"""
return len(get_pods_having_label(label=OSD_APP_LABEL))


def delete_replica_1_sc() -> None:
"""
Deletes storage class associated with replica1
"""
sc_obj = OCP(kind=STORAGECLASS, resource_name=REPLICA1_STORAGECLASS)
try:
sc_obj.delete(resource_name=REPLICA1_STORAGECLASS)
except CommandFailed as e:
if "Error is Error from server (NotFound)" in str(e):
log.info(
f"{REPLICA1_STORAGECLASS} not found, assuming it was already deleted"
)
else:
raise CommandFailed(f"Failed to delete storage class: {str(e)}")


def purge_replica1_osd():
"""
    Purge OSDs associated with replica1:
    1. Scale down their deployments to 0
    2. Run the OSD removal job template
"""
deployments_name = get_replica1_osd_deployment()
scaledown_deployment(deployments_name)
replica1_osds = get_replica_1_osds()
log.info(f"OSDS : {replica1_osds.keys()}")
log.info(f"OSD IDs: {replica1_osds.values()}")
run_osd_removal_job(osd_ids=replica1_osds.values())


def delete_replica1_cephblockpools_cr(cbp_object: OCP):
"""
Deletes CR of cephblockpools associated with replica1
Args:
cbp_object (OCP): OCP object with kind=CEPHBLOCKPOOL
"""
for i in range(0, len((cbp_object.data["items"]))):
cbp_cr_name = cbp_object.data["items"][i]["metadata"]["name"]
if cbp_cr_name != DEFAULT_CEPHBLOCKPOOL:
cbp_object.delete(resource_name=cbp_cr_name)


def delete_replica1_cephblockpools(cbp_object: OCP):
"""
Deletes cephblockpools associated with replica1
Args:
cbp_object (OCP): OCP object with kind=CEPHBLOCKPOOL
"""
toolbox_pod = get_ceph_tools_pod()
for i in range(0, len((cbp_object.data["items"]))):
replica1_pool_name = cbp_object.data["items"][i]["metadata"]["name"]
if replica1_pool_name != DEFAULT_CEPHBLOCKPOOL:
command = f"ceph osd pool rm {replica1_pool_name} {replica1_pool_name} --yes-i-really-really-mean-it"
toolbox_pod.exec_cmd_on_pod(command)

log.info(f"deleting {replica1_pool_name}")


def modify_replica1_osd_count(new_osd_count):
"""
    Modify the number of OSDs associated with replica1
    Args:
        new_osd_count (str): number representing the replication count of
            replica1 OSDs. For instance, selecting 2 creates 6 OSDs
"""
    storage_cluster = OCP(kind=STORAGECLUSTER, resource_name="ocs-storagecluster")
storage_cluster.exec_oc_cmd(
f"patch storagecluster ocs-storagecluster -n openshift-storage "
f'--type json --patch \'[{{"op": "replace", "path": '
f'"/spec/managedResources/cephNonResilientPools/count", "value": {new_osd_count} }}]\''
)

storage_cluster.wait_for_resource(condition=STATUS_READY)


def get_device_class_from_ceph() -> dict:
"""
    Gets device class from Ceph by executing 'ceph osd df tree'
Returns:
dict: device class ("osd name": "device class")
"""
ceph_pod = get_ceph_tools_pod()
output = ceph_pod.exec_cmd_on_pod("ceph osd df tree -f json-pretty")
device_class = dict()
nodes = output["nodes"]
for node in nodes:
if node["type"] == "osd":
device_class[node["name"]] = node.get("device_class", "unknown")
log.info(f"Device class: {device_class}")
return device_class


def get_all_osd_names_by_device_class(osd_dict: dict, device_class: str) -> list:
    """
    Filters OSD names by device class
    Returns:
        list: OSD names whose device class matches device_class
    """
    return [
        osd_name
        for osd_name, class_value in osd_dict.items()
        if class_value == device_class
    ]


def get_osd_kb_used_data() -> dict:
"""
Retrieves the KB used data for each OSD from the Ceph cluster.
Returns:
dict: kb_used_data("osd_name": kb_used_data)
"""
ceph_pod = get_ceph_tools_pod()
output = ceph_pod.exec_cmd_on_pod("ceph osd df tree -f json-pretty")
nodes = output["nodes"]
kb_used_data = dict()
for node in nodes:
if node["type"] == "osd":
kb_used_data[node["name"]] = node.get("kb_used_data", 0)
log.info(f"KB Used per OSD: {kb_used_data}")

return kb_used_data


def get_osd_pgs_used() -> dict:
"""
Retrieves the PG used for each OSD from the Ceph cluster.
Returns:
dict: pgs_used("osd_name": pg_used)
"""
ceph_pod = get_ceph_tools_pod()
output = ceph_pod.exec_cmd_on_pod("ceph osd df tree -f json-pretty")
nodes = output["nodes"]
pgs_used = dict()
for node in nodes:
if node["type"] == "osd":
pgs_used[node["name"]] = node.get("pgs", 0)
log.info(f"Placement Groups Used per OSD: {pgs_used}")

return pgs_used
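
A rough, hypothetical sketch (not part of this commit) of how a test teardown might chain the helpers above; the cleanup_replica1 name and the cleanup ordering are assumptions for illustration only.

# Hypothetical sketch (not part of this commit): one possible cleanup order for
# replica-1 resources, using only the helpers defined in this module.
from logging import getLogger

from ocs_ci.framework import config
from ocs_ci.ocs.constants import CEPHBLOCKPOOL
from ocs_ci.ocs.ocp import OCP
from ocs_ci.ocs.replica_one import (
    count_osd_pods,
    delete_replica1_cephblockpools_cr,
    delete_replica_1_sc,
    purge_replica1_osd,
)

logger = getLogger(__name__)


def cleanup_replica1():
    """Assumed flow: purge replica-1 OSDs, then remove the storage class and CRs."""
    osds_before = count_osd_pods()
    # Scales down the replica-1 OSD deployments and runs the OSD removal job
    purge_replica1_osd()
    # Ignores NotFound if the storage class was already deleted
    delete_replica_1_sc()
    cbp_object = OCP(
        kind=CEPHBLOCKPOOL, namespace=config.ENV_DATA["cluster_namespace"]
    )
    # Removes every cephblockpool CR except the default resilient pool
    delete_replica1_cephblockpools_cr(cbp_object)
    logger.info(f"OSD count before: {osds_before}, after: {count_osd_pods()}")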
46 changes: 46 additions & 0 deletions ocs_ci/ocs/resources/storage_cluster.py
@@ -1,6 +1,7 @@
"""
StorageCluster related functionalities
"""

import copy
import ipaddress
import logging
@@ -2768,3 +2769,48 @@ def resize_osd(new_osd_size, check_size=True):
format_type="json",
)
return res


def set_non_resilient_pool(
storage_cluster: StorageCluster, enable: bool = True
) -> None:
"""
Enable non-resilient ceph settings by patching the storage cluster
(Replica-1 feature)
Args:
storage_cluster (StorageCluster): StorageCluster object
enable (bool, optional): cephNonResilientPools value *** Setting False is not supported by ODF in 4.14 ***.
"""
cmd = f'[{{ "op": "replace", "path": "/spec/managedResources/cephNonResilientPools/enable", "value": {enable} }}]'
storage_cluster.patch(
resource_name=constants.DEFAULT_CLUSTERNAME, format_type="json", params=cmd
)


def validate_non_resilient_pool(storage_cluster: StorageCluster) -> bool:
"""
Validate non-resilient pools (replica-1) are enabled in storage cluster
Args:
storage_cluster (StorageCluster): StorageCluster object
Returns:
bool: True if replica-1 enabled, False otherwise
"""
storagecluster_yaml = storage_cluster.get(
resource_name=constants.DEFAULT_CLUSTERNAME
)
if (
str(
storagecluster_yaml["spec"]["managedResources"]["cephNonResilientPools"][
"enable"
]
).lower()
== "true"
):
return True

return False
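
A hedged usage sketch (not part of this commit) showing how a test might enable and verify replica-1 pools; it assumes StorageCluster is constructed with resource_name and namespace, as done elsewhere in ocs-ci.

# Hypothetical usage sketch (not part of this commit). Assumes the StorageCluster
# resource class accepts resource_name/namespace, as used elsewhere in ocs-ci.
from ocs_ci.framework import config
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.storage_cluster import (
    StorageCluster,
    set_non_resilient_pool,
    validate_non_resilient_pool,
)

storage_cluster = StorageCluster(
    resource_name=constants.DEFAULT_CLUSTERNAME,
    namespace=config.ENV_DATA["cluster_namespace"],
)
# Patch /spec/managedResources/cephNonResilientPools/enable to true ...
set_non_resilient_pool(storage_cluster, enable=True)
# ... and verify the storage cluster now reports replica-1 pools as enabled
assert validate_non_resilient_pool(storage_cluster)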
2 changes: 2 additions & 0 deletions tests/conftest.py
@@ -1201,6 +1201,7 @@ def factory(
deployment_config=False,
service_account=None,
security_context=None,
node_selector=None,
replica_count=1,
pod_name=None,
command=None,
@@ -1259,6 +1260,7 @@ def factory(
sa_name=sa_name,
replica_count=replica_count,
pod_name=pod_name,
node_selector=node_selector,
security_context=security_context,
command=command,
command_args=command_args,
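
The node_selector passthrough added to the pod factory could be exercised roughly as below; this is a hypothetical example (not part of this commit), and the pod_factory fixture name and the zone label value are assumptions.

# Hypothetical example (not part of this commit): passing the new node_selector
# argument through the pod factory; fixture name and label value are assumed.
def test_pod_pinned_to_zone(pod_factory):
    pod_obj = pod_factory(
        node_selector={"topology.kubernetes.io/zone": "us-east-2a"},
    )
    node_selector = pod_obj.get().get("spec", {}).get("nodeSelector")
    assert node_selector, "Expected nodeSelector to be set on the pod spec"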