From ff96f111b8964d1895ee6e8a75c614d4b15a68d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Marten=20Br=C3=BCggemann?= Date: Mon, 18 Mar 2024 15:00:59 +0100 Subject: [PATCH 1/8] add create cluster module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan-Marten Brüggemann --- .../modules/create_cluster/__init__.py | 10 + src/rookify/modules/create_cluster/main.py | 127 +++++++ .../create_cluster/templates/cluster.yaml.j2 | 347 ++++++++++++++++++ 3 files changed, 484 insertions(+) create mode 100644 src/rookify/modules/create_cluster/__init__.py create mode 100644 src/rookify/modules/create_cluster/main.py create mode 100644 src/rookify/modules/create_cluster/templates/cluster.yaml.j2 diff --git a/src/rookify/modules/create_cluster/__init__.py b/src/rookify/modules/create_cluster/__init__.py new file mode 100644 index 0000000..8bb0ef5 --- /dev/null +++ b/src/rookify/modules/create_cluster/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# type: ignore + +from .main import CreateClusterHandler + +MODULE_NAME = "create_cluster" +HANDLER_CLASS = CreateClusterHandler +REQUIRES = [] +AFTER = [] +PREFLIGHT_REQUIRES = ["analyze_ceph"] diff --git a/src/rookify/modules/create_cluster/main.py b/src/rookify/modules/create_cluster/main.py new file mode 100644 index 0000000..8b20a0e --- /dev/null +++ b/src/rookify/modules/create_cluster/main.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- + +from ..module import ModuleHandler, ModuleException + +from typing import Any + +import kubernetes + + +class CreateClusterHandler(ModuleHandler): + def __create_cluster_definition(self) -> Any: + try: + node_ls_data = self._data["analyze_ceph"]["node"]["ls"] + + # Get monitor count + mon_count = 0 + for node, mons in node_ls_data["mon"].items(): + mon_count += 1 + if len(mons) > 1: + raise ModuleException( + f"There are more than 1 mon running on node {node}" + ) + + # Get manager count + mgr_count = 0 + for node, mgrs in node_ls_data["mgr"].items(): + mgr_count += 1 + if len(mons) > 1: + raise ModuleException( + f"There are more than 1 mgr running on node {node}" + ) + + # Render cluster config from template + self.__cluster_name = self._config["rook"]["cluster"]["name"] + self.__cluster_namespace = self._config["rook"]["cluster"]["namespace"] + self.__cluster_image = self._config["rook"]["ceph"]["image"] + self.__mon_placement_label = ( + self._config["rook"]["cluster"]["mon_placement_label"] + if "mon_placement_label" in self._config["rook"]["cluster"] + else f"placement-{self.__cluster_name}-mon" + ) + self.__mgr_placement_label = ( + self._config["rook"]["cluster"]["mgr_placement_label"] + if "mgr_placement_label" in self._config["rook"]["cluster"] + else f"placement-{self.__cluster_name}-mgr" + ) + self.__cluster_definition = self.load_template( + "cluster.yaml.j2", + cluster_name=self.__cluster_name, + cluster_namespace=self.__cluster_namespace, + ceph_image=self.__cluster_image, + mon_count=mon_count, + mgr_count=mgr_count, + mon_placement_label=self.__mon_placement_label, + mgr_placement_label=self.__mgr_placement_label, + ) + + except KeyError: + raise ModuleException("Ceph monitor data is incomplete") + + def __check_k8s_prerequisites(self) -> None: + # We have to check, if our placement labels are disabled or unset + nodes = self.k8s.core_v1_api.list_node().items + for node in nodes: + node_labels = node.metadata.labels + if ( + self.__mon_placement_label in node_labels + and node_labels[self.__mon_placement_label] == "enabled" + ): + raise 
ModuleException( + f"Label {self.__mon_placement_label} is set on node {node.metadata.name}" + ) + if ( + self.__mgr_placement_label in node_labels + and node_labels[self.__mgr_placement_label] == "enabled" + ): + raise ModuleException( + f"Label {self.__mon_placement_label} is set on node {node.metadata.name}" + ) + + # We have to check if our namespace exists + namespace_exists = False + namespaces = self.k8s.core_v1_api.list_namespace().items + for namespace in namespaces: + if namespace.metadata.name == self.__cluster_namespace: + namespace_exists = True + if not namespace_exists: + raise ModuleException( + f"Namespace {self.__cluster_namespace} does not exist" + ) + + def preflight(self) -> None: + self.__create_cluster_definition() + self.__check_k8s_prerequisites() + + def run(self) -> Any: + # Create CephCluster + self.k8s.crd_api_apply(self.__cluster_definition.yaml) + + # Wait for CephCluster to get into Progressing phase + watcher = kubernetes.watch.Watch() + stream = watcher.stream( + self.k8s.custom_objects_api.list_namespaced_custom_object, + "ceph.rook.io", + "v1", + self.__cluster_namespace, + "cephclusters", + timeout_seconds=60, + ) + for event in stream: + event_object = event["object"] + + if event_object["metadata"]["name"] != self.__cluster_name: + continue + + try: + if event_object["status"]["phase"] == "Progressing": + result = event_object + break + except KeyError: + pass + watcher.stop() + + try: + return result + except NameError: + raise ModuleException("CephCluster did not come up") diff --git a/src/rookify/modules/create_cluster/templates/cluster.yaml.j2 b/src/rookify/modules/create_cluster/templates/cluster.yaml.j2 new file mode 100644 index 0000000..f49ac6b --- /dev/null +++ b/src/rookify/modules/create_cluster/templates/cluster.yaml.j2 @@ -0,0 +1,347 @@ +################################################################################################################# +# Define the settings for the rook-ceph cluster with common settings for a production cluster. +# All nodes with available raw devices will be used for the Ceph cluster. At least three nodes are required +# in this example. See the documentation for more details on storage settings available. + +# For example, to create the cluster: +# kubectl create -f crds.yaml -f common.yaml -f operator.yaml +# kubectl create -f cluster.yaml +################################################################################################################# + +apiVersion: ceph.rook.io/v1 +kind: CephCluster +metadata: + name: {{ cluster_name }} + namespace: {{ cluster_namespace }} +spec: + cephVersion: + # The container image used to launch the Ceph daemon pods (mon, mgr, osd, mds, rgw). + # v17 is Quincy, v18 is Reef. + # RECOMMENDATION: In production, use a specific version tag instead of the general v17 flag, which pulls the latest release and could result in different + # versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/. + # If you want to be more precise, you can always use a timestamp tag such as quay.io/ceph/ceph:v18.2.1-20240103 + # This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities + image: {{ ceph_image }} + # Whether to allow unsupported versions of Ceph. Currently `quincy` and `reef` are supported. + # Future versions such as `squid` (v19) would require this to be set to `true`. + # Do not set to true in production. 
+ allowUnsupported: false + # The path on the host where configuration files will be persisted. Must be specified. + # Important: if you reinstall the cluster, make sure you delete this directory from each host or else the mons will fail to start on the new cluster. + # In Minikube, the '/data' directory is configured to persist across reboots. Use "/data/rook" in Minikube environment. + dataDirHostPath: /var/lib/rook + # Whether or not upgrade should continue even if a check fails + # This means Ceph's status could be degraded and we don't recommend upgrading but you might decide otherwise + # Use at your OWN risk + # To understand Rook's upgrade process of Ceph, read https://rook.io/docs/rook/latest/ceph-upgrade.html#ceph-version-upgrades + skipUpgradeChecks: false + # Whether or not continue if PGs are not clean during an upgrade + continueUpgradeAfterChecksEvenIfNotHealthy: false + # WaitTimeoutForHealthyOSDInMinutes defines the time (in minutes) the operator would wait before an OSD can be stopped for upgrade or restart. + # If the timeout exceeds and OSD is not ok to stop, then the operator would skip upgrade for the current OSD and proceed with the next one + # if `continueUpgradeAfterChecksEvenIfNotHealthy` is `false`. If `continueUpgradeAfterChecksEvenIfNotHealthy` is `true`, then operator would + # continue with the upgrade of an OSD even if its not ok to stop after the timeout. This timeout won't be applied if `skipUpgradeChecks` is `true`. + # The default wait timeout is 10 minutes. + waitTimeoutForHealthyOSDInMinutes: 10 + mon: + # Set the number of mons to be started. Generally recommended to be 3. + # For highest availability, an odd number of mons should be specified. + count: {{ mon_count }} + # The mons should be on unique nodes. For production, at least 3 nodes are recommended for this reason. + # Mons should only be allowed on the same node for test environments where data loss is acceptable. + allowMultiplePerNode: false + mgr: + # When higher availability of the mgr is needed, increase the count to 2. + # In that case, one mgr will be active and one in standby. When Ceph updates which + # mgr is active, Rook will update the mgr services to match the active mgr. + count: {{ mgr_count }} + allowMultiplePerNode: false + modules: + # List of modules to optionally enable or disable. + # Note the "dashboard" and "monitoring" modules are already configured by other settings in the cluster CR. + # - name: rook + # enabled: true + # enable the ceph dashboard for viewing cluster status + dashboard: + enabled: true + # serve the dashboard under a subpath (useful when you are accessing the dashboard via a reverse proxy) + # urlPrefix: /ceph-dashboard + # serve the dashboard at the given port. + # port: 8443 + # serve the dashboard using SSL + ssl: true + # The url of the Prometheus instance + # prometheusEndpoint: ://: + # Whether SSL should be verified if the Prometheus server is using https + # prometheusEndpointSSLVerify: false + # enable prometheus alerting for cluster + monitoring: + # requires Prometheus to be pre-installed + enabled: false + # Whether to disable the metrics reported by Ceph. If false, the prometheus mgr module and Ceph exporter are enabled. + # If true, the prometheus mgr module and Ceph exporter are both disabled. Default is false. + metricsDisabled: false + network: + connections: + # Whether to encrypt the data in transit across the wire to prevent eavesdropping the data on the network. + # The default is false. 
When encryption is enabled, all communication between clients and Ceph daemons, or between Ceph daemons will be encrypted. + # When encryption is not enabled, clients still establish a strong initial authentication and data integrity is still validated with a crc check. + # IMPORTANT: Encryption requires the 5.11 kernel for the latest nbd and cephfs drivers. Alternatively for testing only, + # you can set the "mounter: rbd-nbd" in the rbd storage class, or "mounter: fuse" in the cephfs storage class. + # The nbd and fuse drivers are *not* recommended in production since restarting the csi driver pod will disconnect the volumes. + encryption: + enabled: false + # Whether to compress the data in transit across the wire. The default is false. + # See the kernel requirements above for encryption. + compression: + enabled: false + # Whether to require communication over msgr2. If true, the msgr v1 port (6789) will be disabled + # and clients will be required to connect to the Ceph cluster with the v2 port (3300). + # Requires a kernel that supports msgr v2 (kernel 5.11 or CentOS 8.4 or newer). + requireMsgr2: false + # enable host networking + provider: host + # enable the Multus network provider + #provider: multus + #selectors: + # The selector keys are required to be `public` and `cluster`. + # Based on the configuration, the operator will do the following: + # 1. if only the `public` selector key is specified both public_network and cluster_network Ceph settings will listen on that interface + # 2. if both `public` and `cluster` selector keys are specified the first one will point to 'public_network' flag and the second one to 'cluster_network' + # + # In order to work, each selector value must match a NetworkAttachmentDefinition object in Multus + # + # public: public-conf --> NetworkAttachmentDefinition object name in Multus + # cluster: cluster-conf --> NetworkAttachmentDefinition object name in Multus + # Provide internet protocol version. IPv6, IPv4 or empty string are valid options. Empty string would mean IPv4 + #ipFamily: "IPv6" + # Ceph daemons to listen on both IPv4 and Ipv6 networks + #dualStack: false + # Enable multiClusterService to export the mon and OSD services to peer cluster. + # This is useful to support RBD mirroring between two clusters having overlapping CIDRs. + # Ensure that peer clusters are connected using an MCS API compatible application, like Globalnet Submariner. + #multiClusterService: + # enabled: false + + # enable the crash collector for ceph daemon crash collection + crashCollector: + disable: false + # Uncomment daysToRetain to prune ceph crash entries older than the + # specified number of days. + #daysToRetain: 30 + # enable log collector, daemons will log on files and rotate + logCollector: + enabled: true + periodicity: daily # one of: hourly, daily, weekly, monthly + maxLogSize: 500M # SUFFIX may be 'M' or 'G'. Must be at least 1M. + # automate [data cleanup process](https://github.com/rook/rook/blob/master/Documentation/Storage-Configuration/ceph-teardown.md#delete-the-data-on-hosts) in cluster destruction. + cleanupPolicy: + # Since cluster cleanup is destructive to data, confirmation is required. + # To destroy all Rook data on hosts during uninstall, confirmation must be set to "yes-really-destroy-data". + # This value should only be set when the cluster is about to be deleted. After the confirmation is set, + # Rook will immediately stop configuring the cluster and only wait for the delete command. 
+ # If the empty string is set, Rook will not destroy any data on hosts during uninstall. + confirmation: "" + # sanitizeDisks represents settings for sanitizing OSD disks on cluster deletion + sanitizeDisks: + # method indicates if the entire disk should be sanitized or simply ceph's metadata + # in both case, re-install is possible + # possible choices are 'complete' or 'quick' (default) + method: quick + # dataSource indicate where to get random bytes from to write on the disk + # possible choices are 'zero' (default) or 'random' + # using random sources will consume entropy from the system and will take much more time then the zero source + dataSource: zero + # iteration overwrite N times instead of the default (1) + # takes an integer value + iteration: 1 + # allowUninstallWithVolumes defines how the uninstall should be performed + # If set to true, cephCluster deletion does not wait for the PVs to be deleted. + allowUninstallWithVolumes: false + # To control where various services will be scheduled by kubernetes, use the placement configuration sections below. + # The example under 'all' would have all services scheduled on kubernetes nodes labeled with 'role=storage-node' and + # tolerate taints with a key of 'storage-node'. + # placement: + # all: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: role + # operator: In + # values: + # - storage-node + # podAffinity: + # podAntiAffinity: + # topologySpreadConstraints: + # tolerations: + # - key: storage-node + # operator: Exists + # The above placement information can also be specified for mon, osd, and mgr components + # mon: + placement: + mon: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ mon_placement_label }} + operator: In + values: + - enabled + mgr: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ mgr_placement_label }} + operator: In + values: + - enabled + + # Monitor deployments may contain an anti-affinity rule for avoiding monitor + # collocation on the same node. This is a required rule when host network is used + # or when AllowMultiplePerNode is false. Otherwise this anti-affinity rule is a + # preferred rule with weight: 50. + # osd: + # prepareosd: + # mgr: + # cleanup: + annotations: + # all: + # mon: + # osd: + # cleanup: + # prepareosd: + # clusterMetadata annotations will be applied to only `rook-ceph-mon-endpoints` configmap and the `rook-ceph-mon` and `rook-ceph-admin-keyring` secrets. + # And clusterMetadata annotations will not be merged with `all` annotations. + # clusterMetadata: + # kubed.appscode.com/sync: "true" + # If no mgr annotations are set, prometheus scrape annotations will be set by default. + # mgr: + labels: + # all: + # mon: + # osd: + # cleanup: + # mgr: + # prepareosd: + # monitoring is a list of key-value pairs. It is injected into all the monitoring resources created by operator. 
+ # These labels can be passed as LabelSelector to Prometheus + # monitoring: + # crashcollector: + resources: + #The requests and limits set here, allow the mgr pod to use half of one CPU core and 1 gigabyte of memory + # mgr: + # limits: + # memory: "1024Mi" + # requests: + # cpu: "500m" + # memory: "1024Mi" + # The above example requests/limits can also be added to the other components + # mon: + # osd: + # For OSD it also is a possible to specify requests/limits based on device class + # osd-hdd: + # osd-ssd: + # osd-nvme: + # prepareosd: + # mgr-sidecar: + # crashcollector: + # logcollector: + # cleanup: + # exporter: + # The option to automatically remove OSDs that are out and are safe to destroy. + removeOSDsIfOutAndSafeToRemove: false + priorityClassNames: + #all: rook-ceph-default-priority-class + mon: system-node-critical + osd: system-node-critical + mgr: system-cluster-critical + #crashcollector: rook-ceph-crashcollector-priority-class + storage: # cluster level storage configuration and selection + useAllNodes: false + useAllDevices: false + #deviceFilter: + config: + # crushRoot: "custom-root" # specify a non-default root label for the CRUSH map + # metadataDevice: "md0" # specify a non-rotational storage so ceph-volume will use it as block db device of bluestore. + # databaseSizeMB: "1024" # uncomment if the disks are smaller than 100 GB + osdsPerDevice: "1" # this value can be overridden at the node or device level + encryptedDevice: "true" # the default value for this option is "false" + # Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named + # nodes below will be used as storage resources. Each node's 'name' field should match their 'kubernetes.io/hostname' label. + # nodes: + # - name: "172.17.4.201" + # devices: # specific devices to use for storage can be specified for each node + # - name: "sdb" + # - name: "nvme01" # multiple osds can be created on high performance devices + # config: + # osdsPerDevice: "5" + # - name: "/dev/disk/by-id/ata-ST4000DM004-XXXX" # devices can be specified using full udev paths + # config: # configuration can be specified at the node level which overrides the cluster level config + # - name: "172.17.4.301" + # deviceFilter: "^sd." + # when onlyApplyOSDPlacement is false, will merge both placement.All() and placement.osd + onlyApplyOSDPlacement: false + # Time for which an OSD pod will sleep before restarting, if it stopped due to flapping + # flappingRestartIntervalHours: 24 + # The section for configuring management of daemon disruptions during upgrade or fencing. + disruptionManagement: + # If true, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically + # via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph/ceph-managed-disruptionbudgets.md). The operator will + # block eviction of OSDs by default and unblock them safely when drains are detected. + managePodBudgets: true + # A duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the + # default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes. + osdMaintenanceTimeout: 30 + # A duration in minutes that the operator will wait for the placement groups to become healthy (active+clean) after a drain was completed and OSDs came back up. 
+ # Operator will continue with the next drain if the timeout exceeds. It only works if `managePodBudgets` is `true`. + # No values or 0 means that the operator will wait until the placement groups are healthy before unblocking the next drain. + pgHealthCheckTimeout: 0 + + # csi defines CSI Driver settings applied per cluster. + csi: + readAffinity: + # Enable read affinity to enable clients to optimize reads from an OSD in the same topology. + # Enabling the read affinity may cause the OSDs to consume some extra memory. + # For more details see this doc: + # https://rook.io/docs/rook/latest/Storage-Configuration/Ceph-CSI/ceph-csi-drivers/#enable-read-affinity-for-rbd-volumes + enabled: false + + # cephfs driver specific settings. + cephfs: + # Set CephFS Kernel mount options to use https://docs.ceph.com/en/latest/man/8/mount.ceph/#options. + # kernelMountOptions: "" + # Set CephFS Fuse mount options to use https://docs.ceph.com/en/quincy/man/8/ceph-fuse/#options. + # fuseMountOptions: "" + + # healthChecks + # Valid values for daemons are 'mon', 'osd', 'status' + healthCheck: + daemonHealth: + mon: + disabled: false + interval: 45s + osd: + disabled: false + interval: 60s + status: + disabled: false + interval: 60s + # Change pod liveness probe timing or threshold values. Works for all mon,mgr,osd daemons. + livenessProbe: + mon: + disabled: false + mgr: + disabled: false + osd: + disabled: false + # Change pod startup probe timing or threshold values. Works for all mon,mgr,osd daemons. + startupProbe: + mon: + disabled: false + mgr: + disabled: false + osd: + disabled: false From 59a41a16a78a7a7f13f426a47e753c2a15725734 Mon Sep 17 00:00:00 2001 From: Tobias Wolf Date: Wed, 29 May 2024 12:09:21 +0200 Subject: [PATCH 2/8] Adapt "CreateClusterHandler" for our new workflow handler Signed-off-by: Tobias Wolf --- setup.cfg | 4 +- .../modules/create_cluster/__init__.py | 9 +- src/rookify/modules/create_cluster/main.py | 101 +++++++++++------- 3 files changed, 64 insertions(+), 50 deletions(-) diff --git a/setup.cfg b/setup.cfg index 0b4de7f..f180753 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,4 +9,6 @@ tests = pytest==8.0.2 [options.package_data] -rookify=config.schema.yaml +rookify= + config.schema.yaml + **/*.j2 diff --git a/src/rookify/modules/create_cluster/__init__.py b/src/rookify/modules/create_cluster/__init__.py index 8bb0ef5..53cdfdd 100644 --- a/src/rookify/modules/create_cluster/__init__.py +++ b/src/rookify/modules/create_cluster/__init__.py @@ -1,10 +1,3 @@ # -*- coding: utf-8 -*- -# type: ignore -from .main import CreateClusterHandler - -MODULE_NAME = "create_cluster" -HANDLER_CLASS = CreateClusterHandler -REQUIRES = [] -AFTER = [] -PREFLIGHT_REQUIRES = ["analyze_ceph"] +from .main import CreateClusterHandler as ModuleHandler # noqa diff --git a/src/rookify/modules/create_cluster/main.py b/src/rookify/modules/create_cluster/main.py index 8b20a0e..b744546 100644 --- a/src/rookify/modules/create_cluster/main.py +++ b/src/rookify/modules/create_cluster/main.py @@ -1,16 +1,40 @@ # -*- coding: utf-8 -*- +import kubernetes +from typing import Any +from ..machine import Machine from ..module import ModuleHandler, ModuleException -from typing import Any -import kubernetes +class CreateClusterHandler(ModuleHandler): + REQUIRES = [ + "analyze_ceph", + "cephx_auth_config", + "k8s_prerequisites_check", + "create_configmap", + ] + + @property + def __mon_placement_label(self) -> str: + return ( + self._config["rook"]["cluster"]["mon_placement_label"] + if "mon_placement_label" in 
self._config["rook"]["cluster"] + else f"placement-{self._config["rook"]["cluster"]["name"]}-mon" + ) + @property + def __mgr_placement_label(self) -> str: + return ( + self._config["rook"]["cluster"]["mgr_placement_label"] + if "mgr_placement_label" in self._config["rook"]["cluster"] + else f"placement-{self._config["rook"]["cluster"]["name"]}-mgr" + ) + + def __create_cluster_definition(self) -> None: + state_data = self.machine.get_preflight_state("AnalyzeCephHandler").data -class CreateClusterHandler(ModuleHandler): - def __create_cluster_definition(self) -> Any: try: - node_ls_data = self._data["analyze_ceph"]["node"]["ls"] + node_ls_data = state_data["node"]["ls"] # Get monitor count mon_count = 0 @@ -31,30 +55,20 @@ def __create_cluster_definition(self) -> Any: ) # Render cluster config from template - self.__cluster_name = self._config["rook"]["cluster"]["name"] - self.__cluster_namespace = self._config["rook"]["cluster"]["namespace"] - self.__cluster_image = self._config["rook"]["ceph"]["image"] - self.__mon_placement_label = ( - self._config["rook"]["cluster"]["mon_placement_label"] - if "mon_placement_label" in self._config["rook"]["cluster"] - else f"placement-{self.__cluster_name}-mon" - ) - self.__mgr_placement_label = ( - self._config["rook"]["cluster"]["mgr_placement_label"] - if "mgr_placement_label" in self._config["rook"]["cluster"] - else f"placement-{self.__cluster_name}-mgr" - ) - self.__cluster_definition = self.load_template( + cluster_definition = self.load_template( "cluster.yaml.j2", - cluster_name=self.__cluster_name, - cluster_namespace=self.__cluster_namespace, - ceph_image=self.__cluster_image, + cluster_name=self._config["rook"]["cluster"]["name"], + cluster_namespace=self._config["rook"]["cluster"]["namespace"], + ceph_image=self._config["rook"]["ceph"]["image"], mon_count=mon_count, mgr_count=mgr_count, mon_placement_label=self.__mon_placement_label, mgr_placement_label=self.__mgr_placement_label, ) + self.machine.get_preflight_state( + "CreateClusterHandler" + ).cluster_definition = cluster_definition.yaml except KeyError: raise ModuleException("Ceph monitor data is incomplete") @@ -78,39 +92,37 @@ def __check_k8s_prerequisites(self) -> None: f"Label {self.__mon_placement_label} is set on node {node.metadata.name}" ) - # We have to check if our namespace exists - namespace_exists = False - namespaces = self.k8s.core_v1_api.list_namespace().items - for namespace in namespaces: - if namespace.metadata.name == self.__cluster_namespace: - namespace_exists = True - if not namespace_exists: - raise ModuleException( - f"Namespace {self.__cluster_namespace} does not exist" - ) - def preflight(self) -> None: - self.__create_cluster_definition() self.__check_k8s_prerequisites() + self.__create_cluster_definition() - def run(self) -> Any: + def execute(self) -> None: # Create CephCluster - self.k8s.crd_api_apply(self.__cluster_definition.yaml) + cluster_definition = self.machine.get_preflight_state( + "CreateClusterHandler" + ).cluster_definition + + self.k8s.crd_api_apply(cluster_definition) + + cluster_name = self._config["rook"]["cluster"]["name"] # Wait for CephCluster to get into Progressing phase + result = None watcher = kubernetes.watch.Watch() + stream = watcher.stream( self.k8s.custom_objects_api.list_namespaced_custom_object, "ceph.rook.io", "v1", - self.__cluster_namespace, + self._config["rook"]["cluster"]["namespace"], "cephclusters", timeout_seconds=60, ) + for event in stream: event_object = event["object"] - if event_object["metadata"]["name"] != 
self.__cluster_name: + if event_object["metadata"]["name"] != cluster_name: continue try: @@ -119,9 +131,16 @@ def run(self) -> Any: break except KeyError: pass + watcher.stop() - try: - return result - except NameError: + if result == None: raise ModuleException("CephCluster did not come up") + + @staticmethod + def register_preflight_state( + machine: Machine, state_name: str, handler: ModuleHandler, **kwargs: Any + ) -> None: + ModuleHandler.register_preflight_state( + machine, state_name, handler, tags=["cluster_definition"] + ) From dcf7855ab81dff7c9c1052598f5b9184f99f71ac Mon Sep 17 00:00:00 2001 From: Tobias Wolf Date: Wed, 29 May 2024 11:31:46 +0200 Subject: [PATCH 3/8] Add support to validate required k8s prerequisites in a separate module. Signed-off-by: Tobias Wolf --- .../modules/k8s_prerequisites_check/__init__.py | 3 +++ .../modules/k8s_prerequisites_check/main.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 src/rookify/modules/k8s_prerequisites_check/__init__.py create mode 100644 src/rookify/modules/k8s_prerequisites_check/main.py diff --git a/src/rookify/modules/k8s_prerequisites_check/__init__.py b/src/rookify/modules/k8s_prerequisites_check/__init__.py new file mode 100644 index 0000000..79adceb --- /dev/null +++ b/src/rookify/modules/k8s_prerequisites_check/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +from .main import K8sPrerequisitesCheckHandler as ModuleHandler # noqa diff --git a/src/rookify/modules/k8s_prerequisites_check/main.py b/src/rookify/modules/k8s_prerequisites_check/main.py new file mode 100644 index 0000000..0ac4c9f --- /dev/null +++ b/src/rookify/modules/k8s_prerequisites_check/main.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- + +from ..module import ModuleException, ModuleHandler + + +class K8sPrerequisitesCheckHandler(ModuleHandler): + def preflight(self) -> None: + namespace = self._config["rook"]["cluster"]["namespace"] + + namespaces = [ + namespace.metadata.name + for namespace in self.k8s.core_v1_api.list_namespace().items + ] + + if namespace not in namespaces: + raise ModuleException("Namespace {0} does not exist".format(namespace)) From db380fed5b807bd7068d3bfa6c03d090fe9d8eed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Marten=20Br=C3=BCggemann?= Date: Wed, 20 Mar 2024 15:28:53 +0100 Subject: [PATCH 4/8] add create configmap module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan-Marten Brüggemann --- .../modules/create_configmap/__init__.py | 10 ++++ src/rookify/modules/create_configmap/main.py | 58 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 src/rookify/modules/create_configmap/__init__.py create mode 100644 src/rookify/modules/create_configmap/main.py diff --git a/src/rookify/modules/create_configmap/__init__.py b/src/rookify/modules/create_configmap/__init__.py new file mode 100644 index 0000000..91d7f95 --- /dev/null +++ b/src/rookify/modules/create_configmap/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# type: ignore + +from .main import CreateConfigMapHandler + +MODULE_NAME = "create_configmap" +HANDLER_CLASS = CreateConfigMapHandler +REQUIRES = [] +AFTER = [] +PREFLIGHT_REQUIRES = ["analyze_ceph"] diff --git a/src/rookify/modules/create_configmap/main.py b/src/rookify/modules/create_configmap/main.py new file mode 100644 index 0000000..d09e646 --- /dev/null +++ b/src/rookify/modules/create_configmap/main.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- + +from ..module import ModuleHandler, 
ModuleException +import kubernetes + +from typing import Any + + +class CreateConfigMapHandler(ModuleHandler): + def __create_configmap_definition(self) -> Any: + pass + + def preflight(self) -> None: + self.__cluster_name = self._config["rook"]["cluster"]["name"] + self.__fsid = self._data["analyze_ceph"]["mon"]["dump"]["fsid"] + + # If the secret already exists, we have to abort to not override it + try: + self.k8s.core_v1_api.read_namespaced_secret( + "rook-ceph-mon", self._config["rook"]["cluster"]["namespace"] + ) + except kubernetes.client.exceptions.ApiException: + pass + else: + raise ModuleException("Secret rook-ceph-mon already exists") + + def run(self) -> Any: + # Get or create needed auth keys + admin_auth = self.ceph.mon_command( + "auth get-or-create-key", + entity="client.admin", + mon="allow *", + mgr="allow *", + mds="allow *", + ) + + mon_auth = self.ceph.mon_command( + "auth get-or-create-key", entity="mon.", mon="allow *" + ) + + metadata = kubernetes.client.V1ObjectMeta(name="rook-ceph-mon") + + string_data = { + "admin-secret": admin_auth["key"], + "cluster-name": self.__cluster_name, + "fsid": self.__fsid, + "mon-secret": mon_auth["key"], + } + + secret = kubernetes.client.V1Secret( + api_version="v1", kind="Secret", metadata=metadata, string_data=string_data + ) + + secret = self.k8s.core_v1_api.create_namespaced_secret( + self._config["rook"]["cluster"]["namespace"], body=secret + ) + + return secret.to_dict() From a110d666d5c27853069c7c3179605b0e7811d623 Mon Sep 17 00:00:00 2001 From: Tobias Wolf Date: Wed, 29 May 2024 11:37:14 +0200 Subject: [PATCH 5/8] Add actual configmap definition to the module Signed-off-by: Tobias Wolf --- .../modules/create_configmap/__init__.py | 8 +- src/rookify/modules/create_configmap/main.py | 94 ++++++++++++++++--- 2 files changed, 84 insertions(+), 18 deletions(-) diff --git a/src/rookify/modules/create_configmap/__init__.py b/src/rookify/modules/create_configmap/__init__.py index 91d7f95..0988cbb 100644 --- a/src/rookify/modules/create_configmap/__init__.py +++ b/src/rookify/modules/create_configmap/__init__.py @@ -1,10 +1,4 @@ # -*- coding: utf-8 -*- # type: ignore -from .main import CreateConfigMapHandler - -MODULE_NAME = "create_configmap" -HANDLER_CLASS = CreateConfigMapHandler -REQUIRES = [] -AFTER = [] -PREFLIGHT_REQUIRES = ["analyze_ceph"] +from .main import CreateConfigMapHandler as ModuleHandler # noqa diff --git a/src/rookify/modules/create_configmap/main.py b/src/rookify/modules/create_configmap/main.py index d09e646..028f9bf 100644 --- a/src/rookify/modules/create_configmap/main.py +++ b/src/rookify/modules/create_configmap/main.py @@ -1,18 +1,58 @@ # -*- coding: utf-8 -*- -from ..module import ModuleHandler, ModuleException import kubernetes +from ..machine import Machine +from ..module import ModuleHandler, ModuleException -from typing import Any +from typing import Any, Dict class CreateConfigMapHandler(ModuleHandler): - def __create_configmap_definition(self) -> Any: - pass + REQUIRES = ["analyze_ceph", "k8s_prerequisites_check"] + + def __create_configmap_definition(self) -> None: + state_data = self.machine.get_preflight_state("AnalyzeCephHandler").data + + metadata = kubernetes.client.V1ObjectMeta(name="rook-ceph-mon-endpoints") + configmap_mon_list = "" + + for mon in state_data["mon"]["dump"]["mons"]: + if configmap_mon_list != "": + configmap_mon_list += "," + + configmap_mon_list += "{0}:{1}".format( + mon["name"], mon["public_addr"].rsplit("/", 1)[0] + ) + + configmap_data = { + "data": configmap_mon_list, 
+ "mapping": "{}", + "maxMonId": "-1", + } + + configmap = kubernetes.client.V1ConfigMap( + api_version="v1", kind="ConfigMap", metadata=metadata, data=configmap_data + ) + + self.machine.get_preflight_state( + "CreateConfigMapHandler" + ).configmap = configmap.to_dict() def preflight(self) -> None: self.__cluster_name = self._config["rook"]["cluster"]["name"] - self.__fsid = self._data["analyze_ceph"]["mon"]["dump"]["fsid"] + + state_data = self.machine.get_preflight_state("AnalyzeCephHandler").data + self.__fsid = state_data["mon"]["dump"]["fsid"] + + # If the configmap or secret already exists, we have to abort to not override it + try: + self.k8s.core_v1_api.read_namespaced_config_map( + "rook-ceph-mon-endpoints", self._config["rook"]["cluster"]["namespace"] + ) + except kubernetes.client.exceptions.ApiException: + pass + else: + raise ModuleException("Configmap rook-ceph-mon-endpoints already exists") # If the secret already exists, we have to abort to not override it try: @@ -24,19 +64,33 @@ def preflight(self) -> None: else: raise ModuleException("Secret rook-ceph-mon already exists") - def run(self) -> Any: + self.__create_configmap_definition() + + def execute(self) -> None: + configmap = kubernetes.client.V1ConfigMap( + **self.machine.get_preflight_state("CreateConfigMapHandler").configmap + ) + + configmap = self.k8s.core_v1_api.create_namespaced_config_map( + self._config["rook"]["cluster"]["namespace"], body=configmap + ) + + self.machine.get_execution_state( + "CreateConfigMapHandler" + ).configmap = configmap.to_dict() + # Get or create needed auth keys - admin_auth = self.ceph.mon_command( + admin_auth: Dict[str, Any] = self.ceph.mon_command( "auth get-or-create-key", entity="client.admin", mon="allow *", mgr="allow *", mds="allow *", - ) + ) # type: ignore - mon_auth = self.ceph.mon_command( + mon_auth: Dict[str, Any] = self.ceph.mon_command( "auth get-or-create-key", entity="mon.", mon="allow *" - ) + ) # type: ignore metadata = kubernetes.client.V1ObjectMeta(name="rook-ceph-mon") @@ -55,4 +109,22 @@ def run(self) -> Any: self._config["rook"]["cluster"]["namespace"], body=secret ) - return secret.to_dict() + self.machine.get_execution_state( + "CreateConfigMapHandler" + ).secret = secret.to_dict() + + @staticmethod + def register_execution_state( + machine: Machine, state_name: str, handler: ModuleHandler, **kwargs: Any + ) -> None: + ModuleHandler.register_execution_state( + machine, state_name, handler, tags=["secret"] + ) + + @staticmethod + def register_preflight_state( + machine: Machine, state_name: str, handler: ModuleHandler, **kwargs: Any + ) -> None: + ModuleHandler.register_preflight_state( + machine, state_name, handler, tags=["configmap"] + ) From 0dcf376867b0a5059cea7c27f44e8246625eccce Mon Sep 17 00:00:00 2001 From: Tobias Wolf Date: Mon, 10 Jun 2024 14:49:02 +0200 Subject: [PATCH 6/8] Satisfy mypy by forcing config values to be returned as strings. 
Signed-off-by: Tobias Wolf --- src/rookify/modules/create_cluster/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rookify/modules/create_cluster/main.py b/src/rookify/modules/create_cluster/main.py index b744546..1a49351 100644 --- a/src/rookify/modules/create_cluster/main.py +++ b/src/rookify/modules/create_cluster/main.py @@ -17,7 +17,7 @@ class CreateClusterHandler(ModuleHandler): @property def __mon_placement_label(self) -> str: return ( - self._config["rook"]["cluster"]["mon_placement_label"] + str(self._config["rook"]["cluster"]["mon_placement_label"]) if "mon_placement_label" in self._config["rook"]["cluster"] else f"placement-{self._config["rook"]["cluster"]["name"]}-mon" ) @@ -25,7 +25,7 @@ def __mon_placement_label(self) -> str: @property def __mgr_placement_label(self) -> str: return ( - self._config["rook"]["cluster"]["mgr_placement_label"] + str(self._config["rook"]["cluster"]["mgr_placement_label"]) if "mgr_placement_label" in self._config["rook"]["cluster"] else f"placement-{self._config["rook"]["cluster"]["name"]}-mgr" ) From c9b24a9125084a7481bcfc880b9f578f2fb1f90c Mon Sep 17 00:00:00 2001 From: Tobias Wolf Date: Wed, 12 Jun 2024 09:13:45 +0200 Subject: [PATCH 7/8] Add logging of important steps in `CreateClusterHandler` Signed-off-by: Tobias Wolf --- src/rookify/modules/create_cluster/main.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/rookify/modules/create_cluster/main.py b/src/rookify/modules/create_cluster/main.py index 1a49351..45e8fd2 100644 --- a/src/rookify/modules/create_cluster/main.py +++ b/src/rookify/modules/create_cluster/main.py @@ -54,6 +54,15 @@ def __create_cluster_definition(self) -> None: f"There are more than 1 mgr running on node {node}" ) + self.logger.debug( + "Rook cluster definition values: {0} {1} with mon label {2} and mgr label {3}".format( + self._config["rook"]["cluster"]["namespace"], + self._config["rook"]["cluster"]["name"], + self.__mon_placement_label, + self.__mgr_placement_label, + ) + ) + # Render cluster config from template cluster_definition = self.load_template( "cluster.yaml.j2", @@ -97,6 +106,8 @@ def preflight(self) -> None: self.__create_cluster_definition() def execute(self) -> None: + self.logger.info("Creating Rook cluster definition") + # Create CephCluster cluster_definition = self.machine.get_preflight_state( "CreateClusterHandler" @@ -104,6 +115,8 @@ def execute(self) -> None: self.k8s.crd_api_apply(cluster_definition) + self.logger.info("Waiting for Rook cluster created") + cluster_name = self._config["rook"]["cluster"]["name"] # Wait for CephCluster to get into Progressing phase From 4a0256761bccc87624dbb42446ae8544058553ef Mon Sep 17 00:00:00 2001 From: Tobias Wolf Date: Wed, 12 Jun 2024 09:15:44 +0200 Subject: [PATCH 8/8] Fix comparision to `None` in `CreateClusterHandler` Signed-off-by: Tobias Wolf --- src/rookify/modules/create_cluster/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rookify/modules/create_cluster/main.py b/src/rookify/modules/create_cluster/main.py index 45e8fd2..06af6b0 100644 --- a/src/rookify/modules/create_cluster/main.py +++ b/src/rookify/modules/create_cluster/main.py @@ -147,7 +147,7 @@ def execute(self) -> None: watcher.stop() - if result == None: + if result is None: raise ModuleException("CephCluster did not come up") @staticmethod
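
The monitor and manager counting in `__create_cluster_definition` (PATCH 1/8) derives `mon_count` and `mgr_count` from the stored `ceph node ls` output and aborts when a host runs more than one daemon of the same type. As posted, the manager loop re-tests `len(mons)` from the preceding loop instead of `len(mgrs)`, so two managers colocated on one host would slip through. Below is a minimal sketch of the intended check, written against a plain dictionary shaped like the `node ls` data the patch consumes; the hostnames in the usage comment are invented.

from typing import Any, Dict, Tuple


def count_daemons(node_ls_data: Dict[str, Any]) -> Tuple[int, int]:
    """Count mon and mgr hosts, rejecting more than one daemon per host.

    `node_ls_data` is assumed to be shaped like `ceph node ls` output:
    {"mon": {"host-a": ["a"], ...}, "mgr": {"host-a": ["host-a"], ...}, ...}
    """
    counts: Dict[str, int] = {}

    for daemon_type in ("mon", "mgr"):
        counts[daemon_type] = 0

        for node, daemons in node_ls_data.get(daemon_type, {}).items():
            counts[daemon_type] += 1

            # Check the list that belongs to the current daemon type.
            if len(daemons) > 1:
                raise ValueError(
                    f"There is more than 1 {daemon_type} running on node {node}"
                )

    return counts["mon"], counts["mgr"]


# Hypothetical input:
# count_daemons({"mon": {"ceph-1": ["a"], "ceph-2": ["b"]},
#                "mgr": {"ceph-1": ["ceph-1"]}})  ->  (2, 1)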
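
PATCH 2/8 turns the mon and mgr placement labels into properties with a naming fallback, and PATCH 6/8 wraps the configured value in `str()` for mypy. The fallback convention is easy to lose in the diff noise, so here is a small stand-alone sketch of the same lookup; the `osism-ceph` cluster name in the usage comment is a hypothetical example, not a value from the patches.

from typing import Any, Dict


def placement_label(config: Dict[str, Any], daemon: str) -> str:
    """Return the node label used to place `daemon` ("mon" or "mgr") pods.

    Mirrors the properties from PATCH 2/8: an explicit
    `rook.cluster.<daemon>_placement_label` setting wins, otherwise the
    label defaults to "placement-<cluster name>-<daemon>".
    """
    cluster: Dict[str, Any] = config["rook"]["cluster"]
    key = f"{daemon}_placement_label"

    if key in cluster:
        return str(cluster[key])  # str() keeps mypy satisfied, as in PATCH 6/8

    return "placement-{0}-{1}".format(cluster["name"], daemon)


# config = {"rook": {"cluster": {"name": "osism-ceph"}}}   # hypothetical
# placement_label(config, "mon")  ->  "placement-osism-ceph-mon"
# placement_label(config, "mgr")  ->  "placement-osism-ceph-mgr"

The preflight check from PATCH 1/8 refuses to proceed while any node already carries one of these labels with the value `enabled`; the rendered `cluster.yaml.j2` later schedules mons and mgrs only onto nodes labelled `enabled`, which is how the migration controls where the Rook daemons may start.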
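
PATCH 5/8 assembles the `rook-ceph-mon-endpoints` ConfigMap from the preflight `ceph mon dump` data. Its `data` field is a comma-separated list of `name:address` pairs with the trailing `/nonce` stripped from each `public_addr`. A sketch of just that string construction follows; the monitor names and addresses in the usage comment are invented.

from typing import Any, Dict, List


def build_mon_endpoints(mon_dump: Dict[str, Any]) -> str:
    """Join `ceph mon dump` entries into the ConfigMap `data` string.

    Each entry becomes "<name>:<public_addr without the /nonce suffix>",
    matching the loop in PATCH 5/8.
    """
    parts: List[str] = []

    for mon in mon_dump["mons"]:
        address = mon["public_addr"].rsplit("/", 1)[0]  # drop trailing "/0"
        parts.append("{0}:{1}".format(mon["name"], address))

    return ",".join(parts)


# Hypothetical `ceph mon dump` excerpt:
# build_mon_endpoints({"mons": [
#     {"name": "a", "public_addr": "10.0.0.10:6789/0"},
#     {"name": "b", "public_addr": "10.0.0.11:6789/0"},
# ]})  ->  "a:10.0.0.10:6789,b:10.0.0.11:6789"

The companion `rook-ceph-mon` Secret created in `execute()` carries the cluster FSID plus the `client.admin` and `mon.` keys fetched via `auth get-or-create-key`, the intent being that the Rook operator adopts the existing cluster identity rather than bootstrapping a new one.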
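
The `execute()` flow (introduced as `run()` in PATCH 1/8, renamed in PATCH 2/8, with its `None` comparison fixed in PATCH 8/8) applies the rendered CephCluster and then watches for the resource to reach the `Progressing` phase. Isolated from the handler, the watch pattern looks roughly like the sketch below; it uses only the `kubernetes` client, takes the group/version/plural values from the patch, and returns `None` on timeout so the caller can raise `ModuleException` as the handler does.

from typing import Any, Dict, Optional

import kubernetes


def wait_for_cephcluster_phase(
    custom_objects_api: kubernetes.client.CustomObjectsApi,
    namespace: str,
    name: str,
    phase: str = "Progressing",
    timeout_seconds: int = 60,
) -> Optional[Dict[str, Any]]:
    """Watch CephCluster objects until `name` reports `phase` or the timeout hits."""
    watcher = kubernetes.watch.Watch()
    stream = watcher.stream(
        custom_objects_api.list_namespaced_custom_object,
        "ceph.rook.io",
        "v1",
        namespace,
        "cephclusters",
        timeout_seconds=timeout_seconds,
    )

    try:
        for event in stream:
            event_object = event["object"]

            if event_object["metadata"]["name"] != name:
                continue

            # `status` is absent until the operator has picked the resource up.
            if event_object.get("status", {}).get("phase") == phase:
                return event_object
    finally:
        watcher.stop()

    return None  # caller raises ModuleException, as in PATCH 8/8

Returning an explicit `None` keeps the final `if result is None:` check from PATCH 8/8 straightforward; the original `try: return result` / `except NameError:` construct from PATCH 1/8 depended on `result` never being assigned, which is easy to misread.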