Added a flag, mark_masters_schedulable, to mark master nodes schedulable #10181

Merged
64 changes: 44 additions & 20 deletions ocs_ci/deployment/baremetal.py
@@ -1251,12 +1251,16 @@ def destroy_cluster(self, log_level="DEBUG"):


 @retry(exceptions.CommandFailed, tries=10, delay=30, backoff=1)
-def clean_disk(worker, namespace=constants.DEFAULT_NAMESPACE):
+def disks_available_to_cleanup(worker, namespace=constants.DEFAULT_NAMESPACE):
     """
-    Perform disk cleanup
+    Get disks available for cleanup

     Args:
         worker (object): worker node object
         namespace (str): namespace where the oc_debug command will be executed
 
+    Returns:
+        disk_names_available_for_cleanup (list): The disk names available for cleanup on a node
 
     """
     ocp_obj = ocp.OCP()
@@ -1265,23 +1269,43 @@ def clean_disk(worker, namespace=constants.DEFAULT_NAMESPACE):
         node=worker.name, cmd_list=[cmd], namespace=namespace
     )
     disk_to_ignore_cleanup_raw = json.loads(str(out))
-    disk_to_ignore_cleanup_json = disk_to_ignore_cleanup_raw["blockdevices"]
-    selected_disks_to_ignore_cleanup = []
-    for disk_to_ignore_cleanup in disk_to_ignore_cleanup_json:
-        if disk_to_ignore_cleanup["mountpoint"] == "/boot":
-            logger.info(
-                f"Ignorning disk {disk_to_ignore_cleanup['pkname']} for cleanup because it's a root disk "
-            )
-            selected_disks_to_ignore_cleanup.append(
-                str(disk_to_ignore_cleanup["pkname"])
-            )
-        elif disk_to_ignore_cleanup["type"] == "rom":
-            logger.info(
-                f"Ignorning disk {disk_to_ignore_cleanup['kname']} for cleanup because it's a rom disk "
-            )
-            selected_disks_to_ignore_cleanup.append(
-                str(disk_to_ignore_cleanup["kname"])
-            )
+    disks_available = disk_to_ignore_cleanup_raw["blockdevices"]
+    boot_disks = set()
+    disks_available_for_cleanup = []
+    for disk in disks_available:
+        # First pass: identify boot disks and filter out ROM disks
+        if disk["type"] == "rom":
+            continue
+        if "nbd" in disk["kname"]:
+            continue
+        if disk["type"] == "part" and disk["mountpoint"] == "/boot":
+            boot_disks.add(disk["pkname"])
+        if disk["type"] == "disk":
+            disks_available_for_cleanup.append(disk)
+
+    # Second pass: filter out boot disks
+    disks_available_for_cleanup = [
+        disk for disk in disks_available_for_cleanup if disk["kname"] not in boot_disks
+    ]
+    disks_names_available_for_cleanup = [
+        disk["kname"] for disk in disks_available_for_cleanup
+    ]
+
+    return disks_names_available_for_cleanup


+@retry(exceptions.CommandFailed, tries=10, delay=30, backoff=1)
+def clean_disk(worker, namespace=constants.DEFAULT_NAMESPACE):
+    """
+    Perform disk cleanup
+
+    Args:
+        worker (object): worker node object
+        namespace (str): namespace where the oc_debug command will be executed
+
+    """
+    ocp_obj = ocp.OCP()
+    disks_available_on_worker_nodes_for_cleanup = disks_available_to_cleanup(worker)

     out = ocp_obj.exec_oc_debug_cmd(
         node=worker.name,
@@ -1292,7 +1316,7 @@ def clean_disk(worker, namespace=constants.DEFAULT_NAMESPACE):
     lsblk_devices = lsblk_output["blockdevices"]
 
     for lsblk_device in lsblk_devices:
-        if lsblk_device["name"] in selected_disks_to_ignore_cleanup:
+        if lsblk_device["name"] not in disks_available_on_worker_nodes_for_cleanup:
             logger.info(f'the disk cleanup is ignored for, {lsblk_device["name"]}')
             pass
         else:
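To make the renamed helper's two-pass selection easier to follow, here is a minimal standalone sketch of the same logic run against hypothetical `lsblk --json` output. The `kname`/`pkname`/`type`/`mountpoint` fields mirror the diff above; the device names and the exact lsblk invocation are illustrative assumptions, not taken from a real node.

```python
import json

# Hypothetical output of an `lsblk --json -o KNAME,PKNAME,TYPE,MOUNTPOINT`-style command
LSBLK_OUT = """
{"blockdevices": [
  {"kname": "sda",  "pkname": null,  "type": "disk", "mountpoint": null},
  {"kname": "sda1", "pkname": "sda", "type": "part", "mountpoint": "/boot"},
  {"kname": "sdb",  "pkname": null,  "type": "disk", "mountpoint": null},
  {"kname": "sr0",  "pkname": null,  "type": "rom",  "mountpoint": null},
  {"kname": "nbd0", "pkname": null,  "type": "disk", "mountpoint": null}
]}
"""

def disks_available_to_cleanup(lsblk_json):
    devices = json.loads(lsblk_json)["blockdevices"]
    boot_disks = set()
    candidates = []
    # First pass: remember which parent disks host /boot, drop ROM and nbd devices
    for dev in devices:
        if dev["type"] == "rom" or "nbd" in dev["kname"]:
            continue
        if dev["type"] == "part" and dev["mountpoint"] == "/boot":
            boot_disks.add(dev["pkname"])
        if dev["type"] == "disk":
            candidates.append(dev)
    # Second pass: drop the boot disks themselves, keep only cleanable device names
    return [d["kname"] for d in candidates if d["kname"] not in boot_disks]

print(disks_available_to_cleanup(LSBLK_OUT))  # ['sdb']: sda hosts /boot, sr0/nbd0 are skipped
```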
54 changes: 47 additions & 7 deletions ocs_ci/deployment/provider_client/storage_client_deployment.py
@@ -13,7 +13,7 @@
     check_phase_of_rados_namespace,
 )
 from ocs_ci.deployment.helpers.lso_helpers import setup_local_storage
-from ocs_ci.ocs.node import label_nodes, get_all_nodes, get_node_objs
+from ocs_ci.ocs.node import label_nodes, get_all_nodes, get_node_objs, get_nodes
 from ocs_ci.ocs.utils import (
     setup_ceph_toolbox,
     enable_console_plugin,
@@ -24,7 +24,7 @@
 )
 from ocs_ci.utility import templating, kms as KMS, version
 from ocs_ci.deployment.deployment import Deployment, create_catalog_source
-from ocs_ci.deployment.baremetal import clean_disk
+from ocs_ci.deployment.baremetal import clean_disk, disks_available_to_cleanup
 from ocs_ci.ocs.resources.storage_cluster import verify_storage_cluster
 from ocs_ci.ocs.resources.storage_client import StorageClient
 from ocs_ci.ocs.bucket_utils import check_pv_backingstore_type
@@ -78,6 +78,7 @@ def initial_function(self):
             namespace=config.ENV_DATA["cluster_namespace"],
         )
 
+        self.platform = config.ENV_DATA.get("platform").lower()
         self.deployment = Deployment()
         self.storage_clients = StorageClient()
 
@@ -95,13 +96,10 @@ def provider_and_native_client_installation(
         6. Disable ROOK_CSI_ENABLE_CEPHFS and ROOK_CSI_ENABLE_RBD
         7. Create storage profile
         """
-
-        # Allow ODF to be deployed on all nodes
-        nodes = get_all_nodes()
-        node_objs = get_node_objs(nodes)
-
-        log.info("labeling storage nodes")
-        label_nodes(nodes=node_objs, label=constants.OPERATOR_NODE_LABEL)
+        worker_node_objs = get_nodes(node_type=constants.WORKER_MACHINE)
+        no_of_worker_nodes = len(worker_node_objs)
 
         # Allow hosting cluster domain to be usable by hosted clusters
         path = "/spec/routeAdmission"
@@ -124,6 +122,31 @@
         wait_for_machineconfigpool_status(node_type="all")
         log.info("All the nodes are upgraded")
 
+        # Mark master nodes schedulable if mark_masters_schedulable: True
+        if config.ENV_DATA.get("mark_masters_schedulable", False):
+            path = "/spec/mastersSchedulable"
+            params = f"""[{{"op": "replace", "path": "{path}", "value": true}}]"""
+            assert self.scheduler_obj.patch(
+                params=params, format_type="json"
+            ), "Failed to run patch command to update control nodes as schedulable"
+            # Allow ODF to be deployed on all nodes
+            log.info("labeling all nodes as storage nodes")
+            node_objs = get_node_objs(get_all_nodes())
+            label_nodes(nodes=node_objs, label=constants.OPERATOR_NODE_LABEL)
+            worker_node_objs = get_nodes(node_type=constants.WORKER_MACHINE)
+            no_of_worker_nodes = len(worker_node_objs)
+        else:
+            log.info("labeling worker nodes as storage nodes")
+            label_nodes(nodes=worker_node_objs, label=constants.OPERATOR_NODE_LABEL)
+
+        disks_available_on_worker_nodes_for_cleanup = disks_available_to_cleanup(
+            worker_node_objs[0]
+        )
Comment on lines +141 to +143

Contributor:
Shouldn't we go over all the worker nodes (and not just one) to take all the available disks to clean up?

Contributor Author:
I executed this for one worker node because I am setting that value as ["spec"]["storageDeviceSets"][0]["count"] in the storage_cluster yaml. I have seen that, in general, the disk count is the same on all the worker nodes, so I thought I could collect this value from any one of them.
+        number_of_disks_available = len(disks_available_on_worker_nodes_for_cleanup)
+        log.info(
+            f"disks available for cleanup, {disks_available_on_worker_nodes_for_cleanup} "
+            f"number of disks available for cleanup, {number_of_disks_available}"
+        )

         # Install LSO, create LocalVolumeDiscovery and LocalVolumeSet
         is_local_storage_available = self.sc_obj.is_exist(
             resource_name=self.storageclass,
@@ -197,6 +220,15 @@
         storage_cluster_data = self.add_encryption_details_to_cluster_data(
             storage_cluster_data
         )
+        storage_cluster_data["spec"]["storageDeviceSets"][0][
+            "replica"
+        ] = no_of_worker_nodes
+
+        if self.platform in constants.HCI_PROVIDER_CLIENT_PLATFORMS:
+            storage_cluster_data["spec"]["storageDeviceSets"][0][
+                "count"
+            ] = number_of_disks_available
+
         templating.dump_data_to_temp_yaml(
             storage_cluster_data, constants.OCS_STORAGE_CLUSTER_YAML
         )
@@ -210,6 +242,14 @@
         storage_cluster_data = self.add_encryption_details_to_cluster_data(
             storage_cluster_data
         )
+        storage_cluster_data["spec"]["storageDeviceSets"][0][
+            "replica"
+        ] = no_of_worker_nodes
+
+        if self.platform in constants.HCI_PROVIDER_CLIENT_PLATFORMS:
+            storage_cluster_data["spec"]["storageDeviceSets"][0][
+                "count"
+            ] = number_of_disks_available
         templating.dump_data_to_temp_yaml(
             storage_cluster_data, constants.OCS_STORAGE_CLUSTER_UPDATED_YAML
         )
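For readers unfamiliar with StorageCluster sizing: the two fields patched above multiply, so the device set ends up requesting `replica * count` devices in total. A toy illustration with hypothetical values:

```python
# Toy illustration of the sizing logic applied above; the numbers are made up.
storage_cluster_data = {"spec": {"storageDeviceSets": [{"replica": 0, "count": 0}]}}

no_of_worker_nodes = 3         # e.g. len(get_nodes(node_type=WORKER_MACHINE))
number_of_disks_available = 4  # e.g. found on one worker by disks_available_to_cleanup()

device_set = storage_cluster_data["spec"]["storageDeviceSets"][0]
device_set["replica"] = no_of_worker_nodes       # one replica per worker node
device_set["count"] = number_of_disks_available  # devices consumed per replica

print(device_set["replica"] * device_set["count"])  # 12 devices requested in total
```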
16 changes: 16 additions & 0 deletions ocs_ci/ocs/resources/storage_client.py
@@ -16,6 +16,8 @@
 from ocs_ci.helpers.managed_services import (
     get_all_storageclassclaims,
 )
+from ocs_ci.ocs.resources.ocs import get_ocs_csv
+from ocs_ci.ocs.resources.storage_cluster import verify_storage_cluster
 from ocs_ci.utility.utils import TimeoutSampler
 
 log = logging.getLogger(__name__)
@@ -503,11 +505,25 @@ def verify_native_storageclient(self):
         storageclaims, associated storageclasses and storagerequests are created successfully.
 
         """
+        ocs_csv = get_ocs_csv()
+        client_csv_version = ocs_csv.data["spec"]["version"]
+        ocs_version = version.get_ocs_version_from_csv(only_major_minor=True)
+        log.info(
+            f"Check if OCS version: {ocs_version} matches with CSV: {client_csv_version}"
+        )
+        assert (
+            f"{ocs_version}" in client_csv_version
+        ), f"OCS version: {ocs_version} mismatch with CSV version {client_csv_version}"
         if self.ocs_version >= version.VERSION_4_16:
             namespace = config.ENV_DATA["cluster_namespace"]
         else:
             namespace = constants.OPENSHIFT_STORAGE_CLIENT_NAMESPACE
 
+        # Check ocs-storagecluster is in 'Ready' status
+        log.info("Verify storagecluster on Ready state")
+        verify_storage_cluster()
+
         # Fetch storage-client name
         storageclient_name = self.get_storageclient_name(namespace)
 
         # Verify storageclient is in Connected status
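The version assertion added above is a substring check of a major.minor version against the full CSV semver. A minimal sketch with made-up version strings:

```python
# Minimal sketch of the CSV version check above; both strings are made up.
ocs_version = "4.16"                      # get_ocs_version_from_csv(only_major_minor=True)
client_csv_version = "4.16.0-137.stable"  # ocs_csv.data["spec"]["version"]

assert (
    f"{ocs_version}" in client_csv_version
), f"OCS version: {ocs_version} mismatch with CSV version {client_csv_version}"
```

Substring matching is deliberately lenient here: any CSV whose version string contains the major.minor pair passes, regardless of patch level or build suffix.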
@@ -35,7 +35,7 @@ spec:
   providerAPIServerServiceType: ClusterIP
   storageDeviceSets:
   - config: {}
-    count: 2
+    count: 4
     dataPVCTemplate:
       metadata: {}
       spec:
@@ -52,5 +52,5 @@
     name: local-storage-deviceset
     placement: {}
     preparePlacement: {}
-    replica: 6
+    replica: 3
     resources: {}
@@ -33,7 +33,7 @@ spec:
   providerAPIServerServiceType: NodePort
   storageDeviceSets:
   - config: {}
-    count: 2
+    count: 4
    dataPVCTemplate:
       metadata: {}
       spec:
@@ -50,5 +50,5 @@
     name: local-storage-deviceset
     placement: {}
     preparePlacement: {}
-    replica: 6
+    replica: 3
     resources: {}
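Note that the template defaults move from 6 replicas of 2 devices to 3 replicas of 4 devices, so the total stays at 3 * 4 = 6 * 2 = 12 devices per device set. On HCI provider-client platforms both fields are now overwritten at deploy time anyway: replica from the worker count and count from the per-node disk count.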