PLAT-4562: Allow nodegroup selection per AZ (#91)
* Optional AZ selection for nodegroup

* Fix tests

* Check if nodegroup subnets wouldn't exist

* Update black again

* Run black

* Concretely check whether nodegroup AZs are in the VPC

* Add creds to config generation step
Secretions authored Apr 27, 2022
1 parent 0663e3e commit 8168a22
Showing 9 changed files with 99 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/tests.yml
@@ -61,6 +61,8 @@ jobs:
- name: Create/lint default config
env:
AWS_ACCOUNT_ID: ${{ secrets.DELTA_ACCOUNT_ID }}
AWS_ACCESS_KEY_ID: ${{ secrets.DELTA_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.DELTA_ACCESS_KEY }}
GITHUB_SHA: ${{ github.sha }}
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
22 changes: 21 additions & 1 deletion cdk/domino_cdk/config/base.py
@@ -2,6 +2,7 @@
from textwrap import dedent
from typing import Dict

import boto3
from field_properties import field_property, unwrap_property
from ruamel.yaml.comments import CommentedMap

@@ -98,7 +99,11 @@ def from_0_0_2(c: dict):
# NOTE: On next schema change, fill this out and remove v0.0.0 support
return DominoCDKConfig.from_0_0_1(c)

def __post_init__(self):
def get_vpc_azs(self):
ec2 = boto3.client("ec2", region_name=self.aws_region)
return [az["ZoneName"] for az in ec2.describe_availability_zones()["AvailabilityZones"]][: self.vpc.max_azs]

def __post_init__(self): # noqa: C901
errors = []

def val(path: str, obj):
@@ -127,6 +132,21 @@ def val(path: str, obj):
errors.append(f"{path}.{f.name} type ({f.type}) does not match value: [{value}] ({type(value)})")

val("config", self)

# Don't run these checks if we're just loading a template
if self.aws_region != "__FILL__":
vpc_azs = self.get_vpc_azs()

for ngs in [self.eks.managed_nodegroups, self.eks.unmanaged_nodegroups]:
for ng, cfg in ngs.items():
if not cfg.availability_zones:
continue
bad_azs = [az for az in cfg.availability_zones if az not in vpc_azs]
if bad_azs:
errors.append(
f"Nodegroup {ng} availability zones {bad_azs} don't exist in vpc.max_azs's resulting availability zones {vpc_azs}"
)

if errors:
raise ValueError("\n".join(errors))

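For context, a minimal sketch (not the project's actual API) of what the new validation does: list the region's availability zones with boto3, keep the first vpc.max_azs of them, and flag any nodegroup AZ outside that set. The standalone function names here are illustrative only.

from typing import List

import boto3


def vpc_azs(region: str, max_azs: int) -> List[str]:
    # Same boto3 call get_vpc_azs() makes: the region's zones, truncated to max_azs.
    ec2 = boto3.client("ec2", region_name=region)
    zones = ec2.describe_availability_zones()["AvailabilityZones"]
    return [z["ZoneName"] for z in zones][:max_azs]


def check_nodegroup_azs(name: str, ng_azs: List[str], usable_azs: List[str]) -> List[str]:
    # Mirrors the __post_init__ check: collect an error for any AZ the VPC won't cover.
    bad_azs = [az for az in ng_azs if az not in usable_azs]
    if bad_azs:
        return [f"Nodegroup {name} availability zones {bad_azs} don't exist in {usable_azs}"]
    return []


# Example (assumes AWS credentials are configured, hence the new secrets in the workflow above):
# errors = check_nodegroup_azs("platform-0", ["us-west-2d"], vpc_azs("us-west-2", 3))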
3 changes: 3 additions & 0 deletions cdk/domino_cdk/config/eks.py
@@ -30,6 +30,7 @@ class NodegroupBase:
key_name: some-key-pair - Pre-existing AWS key pair to configure for instances in the nodegroup
min_size: 1 - Minimum node count for nodegroup. Can't be 0 on managed nodegroups.
max_size: 10 - Maximum limit for node count in node group
availability_zones: Availability zones to create subnets in. Leave this null to autogenerate.
ami_id: ami-123abc - AMI to use for nodegroup, empty/null will default to the current EKS AMI.
When specifying an AMI, you MUST specify a custom user_data script to join
the node to the cluster, and this script must do any sort of node setup that
@@ -61,6 +62,7 @@ class NodegroupBase:
key_name: str
min_size: int
max_size: int
availability_zones: List[str]
ami_id: str
user_data: str
instance_types: List[str]
@@ -75,6 +77,7 @@ def base_load(ng):
"key_name": ng.pop("key_name", None),
"min_size": ng.pop("min_size"),
"max_size": ng.pop("max_size"),
"availability_zones": ng.pop("availability_zones", None),
"ami_id": ng.pop("ami_id", None),
"user_data": ng.pop("user_data", None),
"instance_types": ng.pop("instance_types"),
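Taken together with the schema change above, a nodegroup can now pin its subnets to specific zones. A hedged usage sketch, mirroring how the unit tests below exercise the field (the import paths and the platform-0 nodegroup key are assumptions taken from the test code, not verified public API):

from domino_cdk.config import config_loader
from domino_cdk.config.template import config_template

c = config_template().render()
c["aws_region"] = "us-west-2"
c["vpc"]["max_azs"] = 3
# Pin this nodegroup's subnets to two zones; leaving the field null keeps the autogenerated layout.
c["eks"]["unmanaged_nodegroups"]["platform-0"]["availability_zones"] = ["us-west-2a", "us-west-2b"]
cfg = config_loader(c)  # raises ValueError if any listed zone is outside the VPC's resulting AZs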
1 change: 1 addition & 0 deletions cdk/domino_cdk/config/template.py
@@ -62,6 +62,7 @@ def add_nodegroups(
key_name=keypair_name,
min_size=min_size,
max_size=max_size,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=instance_types,
7 changes: 5 additions & 2 deletions cdk/domino_cdk/provisioners/eks/eks_nodegroup.py
@@ -68,7 +68,9 @@ def provision_managed_nodegroup(
self.scope.untagged_resources["ec2"].append(lt.launch_template_id)
lts = eks.LaunchTemplateSpec(id=lt.launch_template_id, version=lt.version_number)

for i, az in enumerate(self.vpc.availability_zones[:max_nodegroup_azs]):
availability_zones = ng.availability_zones or self.vpc.availability_zones[:max_nodegroup_azs]

for i, az in enumerate(availability_zones):
self.cluster.add_nodegroup_capacity(
f"{self.stack_name}-{name}-{i}",
nodegroup_name=f"{self.stack_name}-{name}-{az}",
@@ -129,7 +131,8 @@ def provision_unmanaged_nodegroup(

scope = cdk.Construct(self.scope, f"UnmanagedNodeGroup{name}")
cfn_lt = None
for i, az in enumerate(self.vpc.availability_zones[:max_nodegroup_azs]):
availability_zones = ng.availability_zones or self.vpc.availability_zones[:max_nodegroup_azs]
for i, az in enumerate(availability_zones):
indexed_name = f"{self.stack_name}-{name}-{az}"
asg = aws_autoscaling.AutoScalingGroup(
scope,
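The provisioner change in both nodegroup paths boils down to a one-line fallback. As a sketch of the behavior (illustrative helper, not code from the repository):

from typing import List, Optional


def nodegroup_azs(ng_azs: Optional[List[str]], vpc_azs: List[str], max_nodegroup_azs: int) -> List[str]:
    # An explicit per-nodegroup list wins; otherwise use the VPC's first max_nodegroup_azs zones.
    return ng_azs or vpc_azs[:max_nodegroup_azs]


assert nodegroup_azs(None, ["us-east-1a", "us-east-1b", "us-east-1c"], 2) == ["us-east-1a", "us-east-1b"]
assert nodegroup_azs(["us-east-1c"], ["us-east-1a", "us-east-1b", "us-east-1c"], 2) == ["us-east-1c"]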
2 changes: 1 addition & 1 deletion cdk/requirements.txt
@@ -6,4 +6,4 @@ flake8~=3.9
isort~=5.8.0
types-requests~=2.25.0
types-PyYAML~=5.4.1
black~=22.1.0
black~=22.3.0
6 changes: 6 additions & 0 deletions cdk/tests/unit/config/__init__.py
@@ -70,6 +70,7 @@
key_name=None,
min_size=1,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=['m5.2xlarge'],
@@ -86,6 +87,7 @@
key_name=None,
min_size=0,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=['m5.2xlarge'],
@@ -102,6 +104,7 @@
key_name=None,
min_size=0,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=['p3.2xlarge'],
@@ -262,6 +265,7 @@
key_name=None,
min_size=1,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=['m5.2xlarge'],
@@ -278,6 +282,7 @@
key_name=None,
min_size=1,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=['m5.2xlarge'],
@@ -294,6 +299,7 @@
key_name=None,
min_size=0,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=['g4dn.xlarge'],
54 changes: 54 additions & 0 deletions cdk/tests/unit/config/test_config.py
@@ -57,6 +57,60 @@ def test_invalid_version_syntax(self):
with self.assertRaisesRegex(ValueError, f"Invalid version string: '{suffixed_schema}'"):
config_loader(c)

def test_eks_ng_az_mismatch(self):
with patch("domino_cdk.config.DominoCDKConfig.get_vpc_azs") as get_vpc_azs:
get_vpc_azs.return_value = ["us-west-2a", "us-west-2b", "us-west-2c"]

c = config_template().render()
c["aws_region"] = "us-west-2"
c["vpc"]["max_azs"] = 3
c["eks"]["unmanaged_nodegroups"]["platform-0"]["availability_zones"] = [
"us-west-2a",
"us-west-2b",
"us-west-2c",
]
config_loader(c)

c = config_template().render()
c["aws_region"] = "us-west-2"
c["vpc"]["max_azs"] = 3
c["eks"]["unmanaged_nodegroups"]["platform-0"]["availability_zones"] = [
"us-west-2a",
"us-west-2d",
"us-west-2c",
]
with self.assertRaises(
ValueError,
msg="Nodegroup platform-0 availability zones ['us-west-2d'] don't exist in vpc.max_azs's resulting availability zones ['us-west-2a', 'us-west-2b', 'us-west-2c']",
):
config_loader(c)

get_vpc_azs.return_value = ["ap-northeast-1a", "ap-northeast-1c", "ap-northeast-1d"]

c = config_template().render()
c["aws_region"] = "ap-northeast-1"
c["vpc"]["max_azs"] = 3
c["eks"]["unmanaged_nodegroups"]["platform-0"]["availability_zones"] = [
"ap-northeast-1a",
"ap-northeast-1c",
"ap-northeast-1d",
]
config_loader(c)

c = config_template().render()
c["aws_region"] = "ap-northeast-1"
c["vpc"]["max_azs"] = 3
c["eks"]["unmanaged_nodegroups"]["platform-0"]["availability_zones"] = [
"ap-northeast-1a",
"ap-northeast-1b",
"ap-northeast-1c",
]
with self.assertRaises(
ValueError,
msg="Nodegroup platform-0 availability zones ['ap-northeast-1b'] don't exist in vpc.max_azs's resulting availability zones ['ap-northeast-1a', 'ap-northeast-1c', 'ap-northeast-1d']",
):
config_loader(c)

def test_istio(self):
c = config_template(istio_compatible=True)
self.assertEqual(["m5.4xlarge"], c.eks.unmanaged_nodegroups["platform-0"].instance_types)
6 changes: 6 additions & 0 deletions cdk/tests/unit/config/test_eks.py
@@ -53,8 +53,11 @@
eks_0_0_1_cfg = deepcopy(eks_0_0_0_cfg)
eks_0_0_1_cfg["unmanaged_nodegroups"] = eks_0_0_1_cfg["nodegroups"]
eks_0_0_1_cfg["unmanaged_nodegroups"]["platform"]["imdsv2_required"] = False
eks_0_0_1_cfg["unmanaged_nodegroups"]["platform"]["availability_zones"] = None
eks_0_0_1_cfg["unmanaged_nodegroups"]["nvidia"]["imdsv2_required"] = False
eks_0_0_1_cfg["unmanaged_nodegroups"]["nvidia"]["availability_zones"] = None
del eks_0_0_1_cfg["nodegroups"]
eks_0_0_1_cfg["managed_nodegroups"]["compute"]["availability_zones"] = None

managed_ngs = {
"compute": EKS.ManagedNodegroup(
@@ -63,6 +66,7 @@
key_name=None,
min_size=1,
max_size=1,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=["t2.micro"],
@@ -78,6 +82,7 @@
key_name=None,
min_size=1,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=["m5.2xlarge"],
@@ -94,6 +99,7 @@
key_name=None,
min_size=0,
max_size=10,
availability_zones=None,
ami_id=None,
user_data=None,
instance_types=["p3.2xlarge"],
