Skip to content

Commit

Permalink
allow aws eks cluster to have multiple node groups
Browse files Browse the repository at this point in the history
We are doing experiments with spot instances, and it is a very
bad idea to install operators on spot nodes that are going to
be destroyed. Instead I am testing the ability to create a one-off
node group (also associated with the cluster) that could run
a persistent (smaller) node for the operators.

Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch committed Dec 2, 2023
1 parent 1dbf926 commit 9209cc9
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 24 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
The versions coincide with releases on pip. Only major versions will be released as tags on Github.

## [0.0.x](https://github.com/converged-computing/kubescaler/tree/main) (0.0.x)
- support adding one-off node groups to a cluster (0.0.17)
- allow manual customization and timing of nodegroup (e.g., for spot) (0.0.16)
- extensive changes to aws client (thanks to @rajibhossen!) (0.0.15)
- use api client with consistent token to associate nodes to cluster (0.0.14)
Expand Down
82 changes: 59 additions & 23 deletions kubescaler/scaler/aws/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def __init__(
self.vpc_id = None
self.set_roles()

# switch for eks managed nodegroup or cloudformation
# switch for eks managed nodegroup (True) or cloudformation (False)
self.eks_nodegroup = eks_nodegroup

if self.eks_nodegroup:
Expand Down Expand Up @@ -196,6 +196,7 @@ def create_cluster(self, machine_types=None, create_nodes=True):
return self.cluster
return self.create_cluster_nodes(machine_types)

@timed
@timed
def create_cluster_nodes(self, machine_types=None):
"""
Expand All @@ -207,6 +208,7 @@ def create_cluster_nodes(self, machine_types=None):
if self.eks_nodegroup:
self.set_or_create_nodegroup(machine_types=machine_types)
else:
# This uses the node group / workers stack associated with the cluster
self.set_workers_stack()

# enabling cluster autoscaler. we will create an oidc provider and a cluster autoscaler role to be used by serviceaccount
Expand Down Expand Up @@ -490,19 +492,24 @@ def set_workers_stack(self):
if output["OutputKey"] == "NodeAutoScalingGroup":
self.node_autoscaling_group_name = output["OutputValue"]

def set_or_create_nodegroup(self, machine_types=None):
def set_or_create_nodegroup(self, machine_types=None, node_group_name=None):
"""
Get or create the workers stack, or the nodes for the cluster.
If the nodegroup is not created yet, you can set a custom set of machine_types.
This is intended for the spot instance creation case.
This is intended for the spot instance creation case. This allows customizing
the node group name for more advanced use cases (e.g., adding a separate node
group on your own)!
"""
node_group_name = node_group_name or self.node_group_name
try:
self.nodegroup = self.eks.describe_nodegroup(
clusterName=self.cluster_name, nodegroupName=self.node_group_name
clusterName=self.cluster_name, nodegroupName=node_group_name
)
except Exception:
self.nodegroup = self.create_nodegroup(machine_types=machine_types)
self.nodegroup = self.create_nodegroup(
machine_types=machine_types, node_group_name=node_group_name
)

def set_oidc_provider(self):
"""
Expand Down Expand Up @@ -589,6 +596,9 @@ def create_oidc_provider(self):
def create_workers_stack(self):
"""
Create the workers stack (the nodes for the EKS cluster)
Note that this currently just supports the node group directly
associated with the cluster (not one created manually).
"""
stack = self.cf.create_stack(
StackName=self.workers_name,
Expand Down Expand Up @@ -634,24 +644,43 @@ def create_workers_stack(self):
return self._create_stack(stack, self.workers_name)

@timed
def create_nodegroup(self, machine_types=None):
def create_nodegroup(
self,
machine_types=None,
node_group_name=None,
min_nodes=None,
max_nodes=None,
node_count=None,
capacity_type=None,
):
"""
Create the EKS Managed Node Group (the nodes for the EKS cluster)
Add additional machine types with machine_types.
Add additional machine types with machine_types. You can provide a custom
node_group_name and min/max/count for "one off" creations. E.g., for
an experiment we are creating spot instances for the main groups,
but then have one persistent group for operators to be installed to.
The same VPC, subnets, etc. are used.
"""
# Allow to customize one off name for new group
node_group_name = node_group_name or self.node_group_name
min_nodes = min_nodes or self.min_nodes
max_nodes = max_nodes or self.max_nodes
node_count = node_count or self.node_count
capacity_type = capacity_type or self.capacity_type

# Allow a custom set of 'on the fly' machine types for spot experiments
machine_types = machine_types or []
if not machine_types:
machine_types = [self.machine_type]

node_group = self.eks.create_nodegroup(
clusterName=self.cluster_name,
nodegroupName=self.node_group_name,
nodegroupName=node_group_name,
scalingConfig={
"minSize": self.min_nodes,
"maxSize": self.max_nodes,
"desiredSize": self.node_count,
"minSize": min_nodes,
"maxSize": max_nodes,
"desiredSize": node_count,
},
subnets=[str(subnet) for subnet in self.vpc_subnet_ids],
instanceTypes=machine_types,
Expand All @@ -665,10 +694,10 @@ def create_nodegroup(self, machine_types=None):
"k8s.io/cluster-autoscaler/enabled": "true",
"k8s.io/cluster-autoscaler/" + self.cluster_name: "None",
},
capacityType=self.capacity_type,
capacityType=capacity_type,
)
print(f"The status of nodegroup {node_group['nodegroup']['status']}")
return self._create_nodegroup(node_group, self.node_group_name)
return self._create_nodegroup(node_group, node_group_name)

@timed
def new_cluster(self):
Expand Down Expand Up @@ -1038,30 +1067,28 @@ def node_group_name(self):
return self.cluster_name + "-worker-group"

@timed
def delete_nodegroup(self, nodegroup_name):
def delete_nodegroup(self, node_group_name):
"""
Delete a node group and wait for it to be deleted
"""
print(f"🥞️ Attempting delete of node group {nodegroup_name}...")
print(f"🥞️ Attempting delete of node group {node_group_name}...")

try:
self.eks.delete_nodegroup(
clusterName=self.cluster_name, nodegroupName=self.node_group_name
clusterName=self.cluster_name, nodegroupName=node_group_name
)
except Exception:
logger.warning(f"✖️ Node Group {nodegroup_name} does not exist.")
logger.warning(f"✖️ Node Group {node_group_name} does not exist.")
return

try:
logger.info(f"Waiting for {nodegroup_name} to be deleted..")
logger.info(f"Waiting for {node_group_name} to be deleted..")
waiter = self.eks.get_waiter("nodegroup_deleted")
waiter.wait(
clusterName=self.cluster_name, nodegroupName=self.node_group_name
)
waiter.wait(clusterName=self.cluster_name, nodegroupName=node_group_name)
except Exception:
raise ValueError("Waiting for nodegroup deletion exceeded wait time.")
else:
print(f"Node group {nodegroup_name} is deleted successfully")
print(f"Node group {node_group_name} is deleted successfully")

@timed
def _delete_cluster(self):
Expand Down Expand Up @@ -1090,6 +1117,9 @@ def delete_cluster(self):
And let's go backwards - deleting first what we created last.
"""
logger.info("🔨️ Deleting node workers...")
logger.info(
" If you have one-off created nodegroups, you'll need to delete them yourself."
)
if self.eks_nodegroup:
self.delete_nodegroup(self.node_group_name)
else:
Expand Down Expand Up @@ -1135,7 +1165,10 @@ def scale(self, count):
@retry
def _scale_using_cf(self, count):
"""
Make a request to scale the cluster
Make a request to scale the cluster.
Note that this currently only supports the node group associated directly
with the cluster (not one that you manually create).
"""
response = self.cf.update_stack(
StackName=self.workers_name,
Expand Down Expand Up @@ -1208,6 +1241,9 @@ def _scale_using_cf(self, count):
def _scale_using_eks_nodegroup(self, count):
"""
Make a request to scale the cluster
Note that this currently only supports the node group associated directly
with the cluster (not one that you manually create).
"""
response = self.eks.update_nodegroup_config(
clusterName=self.cluster_name,
Expand Down
2 changes: 1 addition & 1 deletion kubescaler/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#
# SPDX-License-Identifier: (MIT)

__version__ = "0.0.16"
__version__ = "0.0.17"
AUTHOR = "Vanessa Sochat"
EMAIL = "[email protected]"
NAME = "kubescaler"
Expand Down

0 comments on commit 9209cc9

Please sign in to comment.