wip to add better support for node groups/autoscaling on gke

This is still risky to do - the GKE clusters self-update and often end up in a state that is not usable. But this is still better than the current setup. When I am brave I will try bringing this up again. Another issue here is that by using a region, we set a min/max for EACH zone in the region, so it can result in 3-4x the number of nodes that are expected (not a great surprise imho). Signed-off-by: vsoch <[email protected]>
converged-computing · Jan 7, 2024 · 1a0a294 · 1a0a294
1 parent 7e695cd
commit 1a0a294
Show file tree

Hide file tree

Showing 3 changed files with 142 additions and 63 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
 The versions coincide with releases on pip. Only major versions will be released as tags on Github.
 
 ## [0.0.x](https://github.com/converged-computing/kubescaler/tree/main) (0.0.x)
+ - support for GKE autoscaling (0.0.19)
  - do not use the waiter for nodegroup_active it does not work! (0.0.18)
  - support adding one-off node groups to a cluster (0.0.17)
  - allow manual customization and timing of nodegroup (e.g., for spot) (0.0.16)

diff --git a/kubescaler/scaler/google.py b/kubescaler/scaler/google.py
@@ -27,13 +27,18 @@ class GKECluster(Cluster):
     A scaler for a Google Kubernetes Engine (GKE) cluster
     """
 
-    default_region = "us-central1-a"
+    default_region = "us-central1"
 
     def __init__(
         self,
         project,
-        machine_type_memory_gb=32,
-        machine_type_vcpu=8,
+        default_pool_name="default-pool",
+        zone="us-central1-a",
+        spot=False,
+        max_vcpu=8,
+        max_memory=32,
+        # Initial labels for the default cluster
+        labels=None,
         **kwargs,
     ):
         """
@@ -47,10 +52,14 @@ def __init__(
         self.client = container_v1.ClusterManagerClient()
         self.project = project
         self.machine_type = self.machine_type or "c2-standard-8"
-        self.machine_type_vcpu = machine_type_vcpu
-        self.machine_type_memory_gb = machine_type_memory_gb
         self.tags = self.tags or ["kubescaler-cluster"]
+        self.default_pool = default_pool_name
         self.configuration = None
+        self.labels = labels
+        self.zone = zone
+        self.max_vcpu = max_vcpu
+        self.max_memory = max_memory
+        self.spot = False
 
     @timed
     def delete_cluster(self):
@@ -63,13 +72,6 @@ def delete_cluster(self):
         self.configuration = None
         self.wait_for_delete()
 
-    @property
-    def zone(self):
-        """
-        The region is the zone minus the last letter!
-        """
-        return self.region.rsplit("-", 1)[0]
-
     @property
     def data(self):
         """
@@ -85,24 +87,25 @@ def data(self):
             "description": self.description,
         }
 
-    def scale_up(self, count, pool_name="default-pool"):
+    def scale_up(self, count, pool_name=None):
         """
         Make a request to scale the cluster
         """
         return self.scale(count, count, count + 1, pool_name=pool_name)
 
-    def scale_down(self, count, pool_name="default-pool"):
+    def scale_down(self, count, pool_name=None):
         """
         Make a request to scale the cluster
         """
         return self.scale(
             count, max(count - 1, self.min_nodes), count, pool_name=pool_name
         )
 
-    def scale(self, count, min_count, max_count, pool_name="default-pool"):
+    def scale(self, count, min_count, max_count, pool_name=None):
         """
         Make a request to scale the cluster
         """
+        pool_name = pool_name or self.default_pool
         node_pool_name = f"{self.cluster_name}/nodePools/{pool_name}"
 
         # Always make the max node count one more than we want
@@ -111,8 +114,6 @@ def scale(self, count, min_count, max_count, pool_name="default-pool"):
             enabled=True,
             min_node_count=min_count,
             max_node_count=max_count,
-            #            total_min_node_count=count,
-            #            total_max_node_count=count,
         )
 
         # https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#L3884
@@ -140,24 +141,16 @@ def resize_cluster(self, count, node_pool_name):
         )
         return self.client.set_node_pool_size(request=request)
 
-    @property
-    def node_config(self):
+    def get_node_config(self, machine_type=None, spot=False, labels=None):
         """
-        Create the node config
-
-        Note that instead of initial_node_count + node_config above,
-        we could just use node_pool. I think the first creates the second,
-        and I'm not sure about pros/cons.
+        Get a node config for a specific machine type, and spot.
         """
-        # Note that if you use GKE Autopilot you need to use a different class, see the link:
-        # https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#L448
         node_config = container_v1.NodeConfig(
-            machine_type=self.machine_type,
-            tags=self.tags
-            # metadata = {"startup-script": my_startup_script,
-            #            "user-data": my_user_data}
+            machine_type=machine_type or self.machine_type,
+            tags=self.tags,
+            spot=spot or self.spot,
+            labels=labels,
         )
-        print("\n🥣️ cluster node_config")
         print(node_config)
         return node_config
 
@@ -190,52 +183,100 @@ def get_k8s_client(self):
             self.configuration = configuration
 
         # This has .api_client for just the api client
-        return kubernetes_client.CoreV1Api(
-            kubernetes_client.ApiClient(self.configuration)
+        return kubernetes_client.CoreV1Api(self.get_api_client())
+
+    def get_api_client(self):
+        return kubernetes_client.ApiClient(self.configuration)
+
+    def get_existing_cluster(self, cluster_name=None):
+        """
+        Get a cluster after it's been created.
+        """
+        name = cluster_name or self.cluster_name
+        request = container_v1.GetClusterRequest(name=name)
+        try:
+            return self.client.get_cluster(request=request)
+        except NotFound:
+            pass
+
+    @timed
+    def create_cluster_nodes(
+        self, name, node_count, machine_type=None, spot=False, labels=None
+    ):
+        """
+        Create a node pool to add to the cluster.
+
+        https://github.com/googleapis/google-cloud-python/blob/main/
+        packages/google-cloud-container/google/cloud/container_v1/
+        services/cluster_manager/client.py#L3131
+        """
+        machine_type = self.machine_type or machine_type
+        node_config = self.get_node_config(machine_type, spot=spot, labels=labels)
+
+        # The min/max node counts are provided with the NodePoolAutoscaling
+        # For now we assume min == max for a constant number of nodes
+        autoscaling = container_v1.NodePoolAutoscaling(
+            enabled=True,
+            min_node_count=node_count,
+            max_node_count=node_count,
+        )
+        node_pool = container_v1.types.NodePool(
+            name=name,
+            config=node_config,
+            initial_node_count=node_count,
+            autoscaling=autoscaling,
+            # not specifying network_config uses cluster defaults
+            # Note that we can define placement_policy
+            # (google.cloud.container_v1.types.NodePool.PlacementPolicy)
         )
+        request = container_v1.CreateNodePoolRequest(
+            parent=self.cluster_name,
+            node_pool=node_pool,
+        )
+        response = self.client.create_node_pool(request=request)
+        print(response)
+        print(f"⏱️   Waiting for node pool {name} to be ready...")
+        return self.wait_for_status(2)
 
-    @property
-    def cluster(self):
+    def delete_nodegroup(self, name=None):
+        """
+        Delete a named node group.
+        """
+        node_pool = name or self.default_pool
+        request = container_v1.DeleteNodePoolRequest(
+            parent=self.cluster_name,
+            node_pool=node_pool,
+        )
+        # Make the request
+        return self.client.delete_node_pool(request=request)
+
+    def get_cluster(self, node_pools=None):
         """
         Get the cluster proto with our defaults
         """
         # Design our initial cluster!
-        # https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#LL2119C1-L2124C1
-
-        # Command for comparison
-        # TODO I don't see where dataplane is, or cloud dns, can add later!
-        # gcloud container clusters create flux-cluster \
-        #   --region=us-central1-a --project $GOOGLE_PROJECT \
-        #   --machine-type c2d-standard-112 --num-nodes=32 \
-        #   --cluster-dns=clouddns --cluster-dns-scope=cluster \
-        #   --tags=flux-cluster  --enable-dataplane-v2 \
-        #   --threads-per-core=1
-        # Note: we can add node_config for customizing node pools further
-
-        # TODO these look useful / interesting
-        # autoscaling (google.cloud.container_v1.types.ClusterAutoscaling):
-        #  Cluster-level autoscaling configuration.
-
         # Autoscaling - try optimizing
         # PROFILE_UNSPECIFIED = 0
         # OPTIMIZE_UTILIZATION = 1
         # BALANCED = 2
         autoscaling_profile = container_v1.ClusterAutoscaling.AutoscalingProfile(1)
 
-        # These are hard coded for c2-standard-8
+        # These are required, you get an error without them.
         # https://cloud.google.com/compute/docs/compute-optimized-machines
         resource_limits = [
             container_v1.ResourceLimit(
                 resource_type="cpu",
-                minimum=self.machine_type_vcpu,
-                maximum=self.machine_type_vcpu * self.max_nodes,
+                minimum=0,
+                maximum=self.max_vcpu * self.node_count,
             ),
             container_v1.ResourceLimit(
                 resource_type="memory",
-                minimum=self.machine_type_memory_gb,
-                maximum=self.machine_type_memory_gb * self.max_nodes,
+                minimum=0,
+                maximum=self.max_memory * self.node_count,
             ),
         ]
+
+        # Note that I removed resource_limits, no limits!
         cluster_autoscaling = container_v1.ClusterAutoscaling(
             enable_node_autoprovisioning=True,
             autoprovisioning_locations=[self.zone],
@@ -249,10 +290,20 @@ def cluster(self):
         cluster = container_v1.Cluster(
             name=self.name,
             description=self.description,
-            initial_node_count=self.node_count,
-            node_config=self.node_config,
             autoscaling=cluster_autoscaling,
         )
+
+        # We can either provide our own node pools, or a node count and initial size
+        # Keep in mind this doesn't allow setting a min or max!
+        if node_pools is not None:
+            cluster.node_pools = node_pools
+        else:
+            node_config = self.get_node_config(
+                self.machine_type, spot=self.spot, labels=self.labels
+            )
+            cluster.initial_node_count = self.node_count
+            cluster.node_config = node_config
+
         print("\n🥣️ cluster spec")
         print(cluster)
         return cluster
@@ -261,13 +312,40 @@ def cluster(self):
     def create_cluster(self):
         """
         Create a cluster, with hard coded variables for now.
+
+        Since we can't create an empty cluster, and the API doesn't allow you
+        to create one from scratch setting a min/max count, what we are going
+        to do is create the NodePool (with our preferences) first, and then
+        give it to the new cluster.
         """
-        # https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#L3527
+        node_config = self.get_node_config(
+            self.machine_type, spot=self.spot, labels=self.labels
+        )
+
+        # If you don't set this, your cluster will grow as it pleases.
+        autoscaling = container_v1.NodePoolAutoscaling(
+            enabled=True,
+            min_node_count=self.min_nodes,
+            max_node_count=self.max_nodes,
+        )
+        node_pool = container_v1.types.NodePool(
+            name=self.default_pool,
+            config=node_config,
+            initial_node_count=self.node_count,
+            autoscaling=autoscaling,
+            # not specifying network_config uses cluster defaults
+            # Note that we can define placement_policy
+            # (google.cloud.container_v1.types.NodePool.PlacementPolicy)
+        )
+
+        # Get a cluster with the given node pool
+        cluster = self.get_cluster([node_pool])
+
+        # https://github.com/googleapis/google-cloud-python/blob/461c76bbc6bd7cda3ef6da0a0ec7e2418c1532aa/packages/google-cloud-container/google/cloud/container_v1/services/cluster_manager/client.py#L708
         request = container_v1.CreateClusterRequest(
             parent=f"projects/{self.project}/locations/{self.region}",
-            cluster=self.cluster,
+            cluster=cluster,
         )
-
         print("\n🥣️ cluster creation request")
         print(request)
 

diff --git a/kubescaler/version.py b/kubescaler/version.py
@@ -1,9 +1,9 @@
-# Copyright 2023 Lawrence Livermore National Security, LLC and other
+# Copyright 2023-2024 Lawrence Livermore National Security, LLC and other
 # HPCIC DevTools Developers. See the top-level COPYRIGHT file for details.
 #
 # SPDX-License-Identifier: (MIT)
 
-__version__ = "0.0.18"
+__version__ = "0.0.19"
 AUTHOR = "Vanessa Sochat"
 EMAIL = "[email protected]"
 NAME = "kubescaler"