Skip to content

Commit

Permalink
wip to add better support for node groups/autoscaling on gke
Browse files Browse the repository at this point in the history
This is still risky to do - GKE clusters self-update
and often end up in a state that is not usable. But this is still better than
the current setup. When I am brave I will try bringing this up again.
Another issue here is that by using a region, we set a min/max for EACH zone
in that region, so it can result in 3-4x the number of nodes that are expected (not
a great surprise imho).

Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Jan 7, 2024
1 parent 7e695cd commit 1a0a294
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 63 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip. Only major versions will be released as tags on Github.

## [0.0.x](https://github.com/converged-computing/kubescaler/tree/main) (0.0.x)
- support for GKE autoscaling (0.0.19)
- do not use the waiter for nodegroup_active, it does not work! (0.0.18)
- support adding one-off node groups to a cluster (0.0.17)
- allow manual customization and timing of nodegroup (e.g., for spot) (0.0.16)
Expand Down
200 changes: 139 additions & 61 deletions kubescaler/scaler/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,18 @@ class GKECluster(Cluster):
A scaler for a Google Kubernetes Engine (GKE) cluster
"""

default_region = "us-central1-a"
default_region = "us-central1"

def __init__(
self,
project,
machine_type_memory_gb=32,
machine_type_vcpu=8,
default_pool_name="default-pool",
zone="us-central1-a",
spot=False,
max_vcpu=8,
max_memory=32,
# Initial labels for the default cluster
labels=None,
**kwargs,
):
"""
Expand All @@ -47,10 +52,14 @@ def __init__(
self.client = container_v1.ClusterManagerClient()
self.project = project
self.machine_type = self.machine_type or "c2-standard-8"
self.machine_type_vcpu = machine_type_vcpu
self.machine_type_memory_gb = machine_type_memory_gb
self.tags = self.tags or ["kubescaler-cluster"]
self.default_pool = default_pool_name
self.configuration = None
self.labels = labels
self.zone = zone
self.max_vcpu = max_vcpu
self.max_memory = max_memory
self.spot = False

@timed
def delete_cluster(self):
Expand All @@ -63,13 +72,6 @@ def delete_cluster(self):
self.configuration = None
self.wait_for_delete()

@property
def zone(self):
"""
The region is the zone minus the last letter!
"""
return self.region.rsplit("-", 1)[0]

@property
def data(self):
"""
Expand All @@ -85,24 +87,25 @@ def data(self):
"description": self.description,
}

def scale_up(self, count, pool_name="default-pool"):
def scale_up(self, count, pool_name=None):
"""
Make a request to scale the cluster
"""
return self.scale(count, count, count + 1, pool_name=pool_name)

def scale_down(self, count, pool_name="default-pool"):
def scale_down(self, count, pool_name=None):
"""
Make a request to scale the cluster
"""
return self.scale(
count, max(count - 1, self.min_nodes), count, pool_name=pool_name
)

def scale(self, count, min_count, max_count, pool_name="default-pool"):
def scale(self, count, min_count, max_count, pool_name=None):
"""
Make a request to scale the cluster
"""
pool_name = pool_name or self.default_pool
node_pool_name = f"{self.cluster_name}/nodePools/{pool_name}"

# Always make the max node count one more than we want
Expand All @@ -111,8 +114,6 @@ def scale(self, count, min_count, max_count, pool_name="default-pool"):
enabled=True,
min_node_count=min_count,
max_node_count=max_count,
# total_min_node_count=count,
# total_max_node_count=count,
)

# https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#L3884
Expand Down Expand Up @@ -140,24 +141,16 @@ def resize_cluster(self, count, node_pool_name):
)
return self.client.set_node_pool_size(request=request)

@property
def node_config(self):
def get_node_config(self, machine_type=None, spot=False, labels=None):
"""
Create the node config
Note that instead of initial_node_count + node_config above,
we could just use node_pool. I think the first creates the second,
and I'm not sure about pros/cons.
Get a node config for a specific machine type, and spot.
"""
# Note that if you use GKE Autopilot you need to use a different class, see the link:
# https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#L448
node_config = container_v1.NodeConfig(
machine_type=self.machine_type,
tags=self.tags
# metadata = {"startup-script": my_startup_script,
# "user-data": my_user_data}
machine_type=machine_type or self.machine_type,
tags=self.tags,
spot=spot or self.spot,
labels=labels,
)
print("\n🥣️ cluster node_config")
print(node_config)
return node_config

Expand Down Expand Up @@ -190,52 +183,100 @@ def get_k8s_client(self):
self.configuration = configuration

# This has .api_client for just the api client
return kubernetes_client.CoreV1Api(
kubernetes_client.ApiClient(self.configuration)
return kubernetes_client.CoreV1Api(self.get_api_client())

def get_api_client(self):
    """
    Get a kubernetes ApiClient built from our stored configuration.
    """
    api_client = kubernetes_client.ApiClient(self.configuration)
    return api_client

def get_existing_cluster(self, cluster_name=None):
    """
    Look up a cluster by name, returning None if it is not found.

    When cluster_name is not provided we fall back to self.cluster_name.
    """
    lookup = self.cluster_name if not cluster_name else cluster_name
    request = container_v1.GetClusterRequest(name=lookup)
    try:
        found = self.client.get_cluster(request=request)
    except NotFound:
        # A missing cluster is not an error here, the caller gets None
        return None
    return found

@timed
def create_cluster_nodes(
    self, name, node_count, machine_type=None, spot=False, labels=None
):
    """
    Create a node pool to add to the cluster.

    https://github.com/googleapis/google-cloud-python/blob/main/
    packages/google-cloud-container/google/cloud/container_v1/
    services/cluster_manager/client.py#L3131

    name: the name of the new node pool
    node_count: used as the initial size and as both the autoscaling
      minimum and maximum, so the pool stays at a constant size
    machine_type: machine type for the pool, falling back to
      self.machine_type when not provided
    spot, labels: passed on to get_node_config

    Returns the result of self.wait_for_status(2)
    """
    # An explicitly requested machine type must win over the cluster default.
    # The reverse order silently ignored the argument, because
    # self.machine_type is always set.
    machine_type = machine_type or self.machine_type
    node_config = self.get_node_config(machine_type, spot=spot, labels=labels)

    # The min/max node counts are provided with the NodePoolAutoscaling
    # For now we assume min == max for a constant number of nodes
    autoscaling = container_v1.NodePoolAutoscaling(
        enabled=True,
        min_node_count=node_count,
        max_node_count=node_count,
    )
    node_pool = container_v1.types.NodePool(
        name=name,
        config=node_config,
        initial_node_count=node_count,
        autoscaling=autoscaling,
        # not specifying network_config uses cluster defaults
        # Note that we can define placement_policy
        # (google.cloud.container_v1.types.NodePool.PlacementPolicy)
    )
    request = container_v1.CreateNodePoolRequest(
        parent=self.cluster_name,
        node_pool=node_pool,
    )
    response = self.client.create_node_pool(request=request)
    print(response)
    print(f"⏱️ Waiting for node pool {name} to be ready...")
    # NOTE(review): 2 is presumably the RUNNING status code - confirm
    # against wait_for_status
    return self.wait_for_status(2)

@property
def cluster(self):
def delete_nodegroup(self, name=None):
    """
    Delete a named node group.

    name: the node pool to delete, falling back to self.default_pool
      when not provided

    Returns the response of the delete_node_pool request.
    """
    node_pool = name or self.default_pool

    # DeleteNodePoolRequest does not have parent / node_pool fields (those
    # belong to CreateNodePoolRequest). The pool to delete is addressed by
    # its full resource name, the same way we build it for scaling.
    node_pool_name = f"{self.cluster_name}/nodePools/{node_pool}"
    request = container_v1.DeleteNodePoolRequest(name=node_pool_name)

    # Make the request
    return self.client.delete_node_pool(request=request)

def get_cluster(self, node_pools=None):
"""
Get the cluster proto with our defaults
"""
# Design our initial cluster!
# https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#LL2119C1-L2124C1

# Command for comparison
# TODO I don't see where dataplane is, or cloud dns, can add later!
# gcloud container clusters create flux-cluster \
# --region=us-central1-a --project $GOOGLE_PROJECT \
# --machine-type c2d-standard-112 --num-nodes=32 \
# --cluster-dns=clouddns --cluster-dns-scope=cluster \
# --tags=flux-cluster --enable-dataplane-v2 \
# --threads-per-core=1
# Note: we can add node_config for customizing node pools further

# TODO these look useful / interesting
# autoscaling (google.cloud.container_v1.types.ClusterAutoscaling):
# Cluster-level autoscaling configuration.

# Autoscaling - try optimizing
# PROFILE_UNSPECIFIED = 0
# OPTIMIZE_UTILIZATION = 1
# BALANCED = 2
autoscaling_profile = container_v1.ClusterAutoscaling.AutoscalingProfile(1)

# These are hard coded for c2-standard-8
# These are required, you get an error without them.
# https://cloud.google.com/compute/docs/compute-optimized-machines
resource_limits = [
container_v1.ResourceLimit(
resource_type="cpu",
minimum=self.machine_type_vcpu,
maximum=self.machine_type_vcpu * self.max_nodes,
minimum=0,
maximum=self.max_vcpu * self.node_count,
),
container_v1.ResourceLimit(
resource_type="memory",
minimum=self.machine_type_memory_gb,
maximum=self.machine_type_memory_gb * self.max_nodes,
minimum=0,
maximum=self.max_memory * self.node_count,
),
]

# Note that I removed resource_limits, no limits!
cluster_autoscaling = container_v1.ClusterAutoscaling(
enable_node_autoprovisioning=True,
autoprovisioning_locations=[self.zone],
Expand All @@ -249,10 +290,20 @@ def cluster(self):
cluster = container_v1.Cluster(
name=self.name,
description=self.description,
initial_node_count=self.node_count,
node_config=self.node_config,
autoscaling=cluster_autoscaling,
)

# We can either provide our own node pools, or a node count and initial size
# Keep in mind this doesn't allow setting a min or max!
if node_pools is not None:
cluster.node_pools = node_pools
else:
node_config = self.get_node_config(
self.machine_type, spot=self.spot, labels=self.labels
)
cluster.initial_node_count = self.node_count
cluster.node_config = node_config

print("\n🥣️ cluster spec")
print(cluster)
return cluster
Expand All @@ -261,13 +312,40 @@ def cluster(self):
def create_cluster(self):
"""
Create a cluster, with hard coded variables for now.
Since we can't create an empty cluster, and the API doesn't allow you
to create one from scratch setting a min/max count, what we are going
to do is create the NodePool (with our preferences) first, and then
give it to the new cluster.
"""
# https://github.com/googleapis/python-container/blob/main/google/cloud/container_v1/types/cluster_service.py#L3527
node_config = self.get_node_config(
self.machine_type, spot=self.spot, labels=self.labels
)

# If you don't set this, your cluster will grow as it pleases.
autoscaling = container_v1.NodePoolAutoscaling(
enabled=True,
min_node_count=self.min_nodes,
max_node_count=self.max_nodes,
)
node_pool = container_v1.types.NodePool(
name=self.default_pool,
config=node_config,
initial_node_count=self.node_count,
autoscaling=autoscaling,
# not specifying network_config uses cluster defaults
# Note that we can define placement_policy
# (google.cloud.container_v1.types.NodePool.PlacementPolicy)
)

# Get a cluster with the given node pool
cluster = self.get_cluster([node_pool])

# https://github.com/googleapis/google-cloud-python/blob/461c76bbc6bd7cda3ef6da0a0ec7e2418c1532aa/packages/google-cloud-container/google/cloud/container_v1/services/cluster_manager/client.py#L708
request = container_v1.CreateClusterRequest(
parent=f"projects/{self.project}/locations/{self.region}",
cluster=self.cluster,
cluster=cluster,
)

print("\n🥣️ cluster creation request")
print(request)

Expand Down
4 changes: 2 additions & 2 deletions kubescaler/version.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Copyright 2023 Lawrence Livermore National Security, LLC and other
# Copyright 2023-2024 Lawrence Livermore National Security, LLC and other
# HPCIC DevTools Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (MIT)

__version__ = "0.0.18"
__version__ = "0.0.19"
AUTHOR = "Vanessa Sochat"
EMAIL = "[email protected]"
NAME = "kubescaler"
Expand Down

0 comments on commit 1a0a294

Please sign in to comment.