From c0b0c87c3faaf67cbf4a0e390863797048c813f5 Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Wed, 28 Aug 2024 15:36:23 -0300 Subject: [PATCH 01/12] Add launch_template config options to aws cluster --- src/_nebari/stages/infrastructure/__init__.py | 14 ++++++ .../infrastructure/template/aws/main.tf | 1 + .../modules/kubernetes/files/user_data.tftpl | 22 ++++++++++ .../template/aws/modules/kubernetes/main.tf | 43 +++++++++++++++++++ .../aws/modules/kubernetes/variables.tf | 6 +++ .../infrastructure/template/aws/variables.tf | 6 +++ 6 files changed, 92 insertions(+) create mode 100644 src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index cda515b786..832b138455 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -128,6 +128,15 @@ class AzureInputVars(schema.Base): workload_identity_enabled: bool = False +class AWSNodeLaunchTemplate(schema.Base): + user_data: Optional[str] = None + pre_bootstrap_command: Optional[str] = None + ebs_device_name: Optional[str] = None + ebs_volume_size: Optional[int] = None + ebs_volume_type: Optional[str] = None + vars: Optional[Dict[str, str]] = None + + class AWSNodeGroupInputVars(schema.Base): name: str instance_type: str @@ -137,6 +146,7 @@ class AWSNodeGroupInputVars(schema.Base): max_size: int single_subnet: bool permissions_boundary: Optional[str] = None + node_launch_template: Optional[AWSNodeLaunchTemplate] = None class AWSInputVars(schema.Base): @@ -146,6 +156,7 @@ class AWSInputVars(schema.Base): existing_subnet_ids: Optional[List[str]] = None region: str kubernetes_version: str + node_launch_template: Optional[AWSNodeLaunchTemplate] = None node_groups: List[AWSNodeGroupInputVars] availability_zones: List[str] vpc_cidr_block: str @@ -465,6 +476,7 @@ class AmazonWebServicesProvider(schema.Base): kubernetes_version: str 
availability_zones: Optional[List[str]] node_groups: Dict[str, AWSNodeGroup] = DEFAULT_AWS_NODE_GROUPS + node_launch_template: Optional[AWSNodeLaunchTemplate] = None existing_subnet_ids: Optional[List[str]] = None existing_security_group_id: Optional[str] = None vpc_cidr_block: str = "10.10.0.0/16" @@ -808,6 +820,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): return AWSInputVars( name=self.config.escaped_project_name, environment=self.config.namespace, + node_launch_template=self.config.amazon_web_services.node_launch_template, existing_subnet_ids=self.config.amazon_web_services.existing_subnet_ids, existing_security_group_id=self.config.amazon_web_services.existing_security_group_id, region=self.config.amazon_web_services.region, @@ -822,6 +835,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): max_size=node_group.max_nodes, single_subnet=node_group.single_subnet, permissions_boundary=node_group.permissions_boundary, + node_launch_template=node_group.node_launch_template, ) for name, node_group in self.config.amazon_web_services.node_groups.items() ], diff --git a/src/_nebari/stages/infrastructure/template/aws/main.tf b/src/_nebari/stages/infrastructure/template/aws/main.tf index 356ce8f957..2edd8ac6ca 100644 --- a/src/_nebari/stages/infrastructure/template/aws/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/main.tf @@ -93,6 +93,7 @@ module "kubernetes" { node_groups = var.node_groups + node_launch_template = var.node_launch_template endpoint_private_access = var.eks_endpoint_private_access public_access_cidrs = var.eks_public_access_cidrs permissions_boundary = var.permissions_boundary diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl new file mode 100644 index 0000000000..3bd7c37689 --- /dev/null +++ 
b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl @@ -0,0 +1,22 @@ +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" + +--==MYBOUNDARY== +Content-Type: text/x-shellscript; charset="us-ascii" + +%{ if node_prebootstrap_command != null }${node_prebootstrap_command}%{ endif } + +%{ if split_user_data == true } +--==MYBOUNDARY== +Content-Type: text/x-shellscript; charset="us-ascii" +#!/bin/bash + +export CLUSTER_NAME="{{ cluster_name }}" +export CLUSTER_CERT_AUTHORITY="{{ cluster_cert_authority }}" +export CLUSTER_ENDPOINT="{{ cluster_endpoint }}" + +{{ user_data }} + +%{ endif } + +--==MYBOUNDARY==-- diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index 521096cae0..a9f0bb0b96 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -20,6 +20,49 @@ resource "aws_eks_cluster" "main" { tags = merge({ Name = var.name }, var.tags) } +## aws_launch_template user_data invocation +## If using a Custom AMI, then the /etc/eks/bootstrap cmds and args must be included/modified, +## otherwise, on default AWS EKS Node AMI, the bootstrap cmd is appended automatically +resource "aws_launch_template" "main" { + # Invoke launch_template only if var.node_prebootstrap_command is not null or custom_ami is not null + # count = var.node_prebootstrap_command != null ? length(var.node_groups) : length(local.cust_ami_node_index) + # name = var.node_prebootstrap_command != null ? var.node_groups[count.index].name : var.node_groups[local.cust_ami_node_index[count.index]].name + # image_id = var.node_prebootstrap_command != null ? var.node_groups[count.index].custom_ami : var.node_groups[local.cust_ami_node_index[count.index]].custom_ami + count = var.node_launch_template != null ? 
length(var.node_groups) : 0 + name = var.node_launch_template != null ? var.node_groups[count.index].name : null + + + vpc_security_group_ids = var.cluster_security_groups + + metadata_options { + http_tokens = "required" + http_endpoint = "enabled" + instance_metadata_tags = "enabled" + } + + block_device_mappings { + device_name = "/dev/xvda" + ebs { + volume_size = 50 + volume_type = "gp2" + } + } + + # https://docs.aws.amazon.com/eks/latest/userguide/launch-templates.html#launch-template-basics + user_data = base64encode( + templatefile( + "${path.module}/files/user_data.tftpl", + { + # node_prebootstrap_command = var.node_prebootstrap_command + # split_user_data = var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami != null ? true : false + # include_bootstrap_cmd = var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami == null ? false : true + cluster_name = aws_eks_cluster.main.name + cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data + cluster_endpoint = aws_eks_cluster.main.endpoint + } + ) + ) +} resource "aws_eks_node_group" "main" { count = length(var.node_groups) diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index e22c640929..099ed7bad6 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -60,6 +60,12 @@ variable "node_group_instance_type" { default = "m5.large" } +variable "node_launch_template" { + description = "Custom launch template for EKS nodes" + type = string + default = null +} + variable "endpoint_private_access" { type = bool default = false diff --git a/src/_nebari/stages/infrastructure/template/aws/variables.tf b/src/_nebari/stages/infrastructure/template/aws/variables.tf index b0455d42ed..aa8beb7508 100644 --- 
a/src/_nebari/stages/infrastructure/template/aws/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/variables.tf @@ -56,6 +56,12 @@ variable "kubeconfig_filename" { type = string } +variable "node_launch_template" { + description = "Custom launch template for EKS nodes" + type = string + default = null +} + variable "eks_endpoint_private_access" { type = bool default = false From 245db3b56bed538d13e1ec6c739cee57248ee8da Mon Sep 17 00:00:00 2001 From: vinicius douglas cerutti Date: Wed, 28 Aug 2024 16:36:17 -0300 Subject: [PATCH 02/12] add ami_type options to pydantic schema to reduce HCL conditionals --- src/_nebari/stages/infrastructure/__init__.py | 9 ++++++++- .../template/aws/modules/kubernetes/main.tf | 18 ++++++++++++++---- .../aws/modules/kubernetes/variables.tf | 16 +++++++++------- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 832b138455..c788ffdabb 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -6,7 +6,7 @@ import re import sys import tempfile -from typing import Annotated, Any, Dict, List, Optional, Tuple, Type, Union +from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union from pydantic import Field, field_validator, model_validator @@ -134,6 +134,7 @@ class AWSNodeLaunchTemplate(schema.Base): ebs_device_name: Optional[str] = None ebs_volume_size: Optional[int] = None ebs_volume_type: Optional[str] = None + ami_id: Optional[str] = None vars: Optional[Dict[str, str]] = None @@ -146,6 +147,7 @@ class AWSNodeGroupInputVars(schema.Base): max_size: int single_subnet: bool permissions_boundary: Optional[str] = None + ami_type: Optional[Literal["AL2_x86_64", "AL2_x86_64_GPU", "CUSTOM"]] = "AL2_x86_64" node_launch_template: Optional[AWSNodeLaunchTemplate] = None @@ -836,6 +838,11 @@ def input_vars(self, stage_outputs: Dict[str, 
Dict[str, Any]]): single_subnet=node_group.single_subnet, permissions_boundary=node_group.permissions_boundary, node_launch_template=node_group.node_launch_template, + ami_type=( + node_group.ami_type + if not node_group.gpu + else "AL2_x86_64_GPU" + ), ) for name, node_group in self.config.amazon_web_services.node_groups.items() ], diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index a9f0bb0b96..0d06b1f551 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -28,12 +28,20 @@ resource "aws_launch_template" "main" { # count = var.node_prebootstrap_command != null ? length(var.node_groups) : length(local.cust_ami_node_index) # name = var.node_prebootstrap_command != null ? var.node_groups[count.index].name : var.node_groups[local.cust_ami_node_index[count.index]].name # image_id = var.node_prebootstrap_command != null ? var.node_groups[count.index].custom_ami : var.node_groups[local.cust_ami_node_index[count.index]].custom_ami - count = var.node_launch_template != null ? length(var.node_groups) : 0 - name = var.node_launch_template != null ? var.node_groups[count.index].name : null + # count = var.node_launch_template != null ? length(var.node_groups) : 0 + # name = var.node_launch_template != null ? var.node_groups[count.index].name : null + for_each = { + for node_group in var.node_groups : + node_group.name => node_group + if node_group.launch_template != null + } + name = each.value.name + image_id = each.value.launch_template.ami_id vpc_security_group_ids = var.cluster_security_groups + metadata_options { http_tokens = "required" http_endpoint = "enabled" @@ -73,8 +81,10 @@ resource "aws_eks_node_group" "main" { subnet_ids = var.node_groups[count.index].single_subnet ? 
[element(var.cluster_subnets, 0)] : var.cluster_subnets instance_types = [var.node_groups[count.index].instance_type] - ami_type = var.node_groups[count.index].gpu == true ? "AL2_x86_64_GPU" : "AL2_x86_64" - disk_size = 50 + # ami_type = var.node_groups[count.index].gpu == true ? "AL2_x86_64_GPU" : + # "AL2_x86_64" + ami_type = var.node_groups[count.index].ami_type + disk_size = 50 scaling_config { min_size = var.node_groups[count.index].min_size diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index 099ed7bad6..ffdebc1c9e 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -44,13 +44,15 @@ variable "node_group_additional_policies" { variable "node_groups" { description = "Node groups to add to EKS Cluster" type = list(object({ - name = string - instance_type = string - gpu = bool - min_size = number - desired_size = number - max_size = number - single_subnet = bool + name = string + instance_type = string + gpu = bool + min_size = number + desired_size = number + max_size = number + single_subnet = bool + launch_template = string + ami_type = string })) } From 38b7d0d306a27c0354603c1837a4754e8c007ec8 Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Wed, 28 Aug 2024 17:48:46 -0300 Subject: [PATCH 03/12] add dynamic launch_template to eks_node_group --- src/_nebari/stages/infrastructure/__init__.py | 15 ++++++---- .../template/aws/modules/kubernetes/main.tf | 29 ++++++++++++------- .../infrastructure/template/aws/variables.tf | 21 +++++++++----- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index c788ffdabb..0675bbc1bc 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ 
b/src/_nebari/stages/infrastructure/__init__.py @@ -131,9 +131,9 @@ class AzureInputVars(schema.Base): class AWSNodeLaunchTemplate(schema.Base): user_data: Optional[str] = None pre_bootstrap_command: Optional[str] = None - ebs_device_name: Optional[str] = None - ebs_volume_size: Optional[int] = None - ebs_volume_type: Optional[str] = None + ebs_device_name: Optional[str] = "/dev/xvda" + ebs_volume_size: Optional[int] = 50 + ebs_volume_type: Optional[str] = "gp2" ami_id: Optional[str] = None vars: Optional[Dict[str, str]] = None @@ -148,7 +148,7 @@ class AWSNodeGroupInputVars(schema.Base): single_subnet: bool permissions_boundary: Optional[str] = None ami_type: Optional[Literal["AL2_x86_64", "AL2_x86_64_GPU", "CUSTOM"]] = "AL2_x86_64" - node_launch_template: Optional[AWSNodeLaunchTemplate] = None + launch_template: Optional[AWSNodeLaunchTemplate] = None class AWSInputVars(schema.Base): @@ -822,7 +822,6 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): return AWSInputVars( name=self.config.escaped_project_name, environment=self.config.namespace, - node_launch_template=self.config.amazon_web_services.node_launch_template, existing_subnet_ids=self.config.amazon_web_services.existing_subnet_ids, existing_security_group_id=self.config.amazon_web_services.existing_security_group_id, region=self.config.amazon_web_services.region, @@ -837,7 +836,11 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): max_size=node_group.max_nodes, single_subnet=node_group.single_subnet, permissions_boundary=node_group.permissions_boundary, - node_launch_template=node_group.node_launch_template, + launch_template=( + self.config.amazon_web_services.node_launch_template + if not node_group.node_launch_template + else node_group.node_launch_template + ), ami_type=( node_group.ami_type if not node_group.gpu diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf 
b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index 0d06b1f551..33e0e8b74c 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -49,10 +49,10 @@ resource "aws_launch_template" "main" { } block_device_mappings { - device_name = "/dev/xvda" + device_name = each.value.launch_template.ebs_device_name ebs { - volume_size = 50 - volume_type = "gp2" + volume_size = each.value.launch_template.ebs_volume_size + volume_type = each.value.launch_template.ebs_volume_type } } @@ -61,12 +61,12 @@ resource "aws_launch_template" "main" { templatefile( "${path.module}/files/user_data.tftpl", { - # node_prebootstrap_command = var.node_prebootstrap_command - # split_user_data = var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami != null ? true : false - # include_bootstrap_cmd = var.node_prebootstrap_command != null && var.node_groups[count.index].custom_ami == null ? false : true - cluster_name = aws_eks_cluster.main.name - cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data - cluster_endpoint = aws_eks_cluster.main.endpoint + node_prebootstrap_command = each.value.launch_template.pre_bootstrap_command + user_data = each.value.launch_template.user_data + split_user_data = each.value.launch_template.user_data != null ? true : false + cluster_name = aws_eks_cluster.main.name + cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data + cluster_endpoint = aws_eks_cluster.main.endpoint } ) ) @@ -84,7 +84,7 @@ resource "aws_eks_node_group" "main" { # ami_type = var.node_groups[count.index].gpu == true ? "AL2_x86_64_GPU" : # "AL2_x86_64" ami_type = var.node_groups[count.index].ami_type - disk_size = 50 + disk_size = var.node_groups[count.index].launch_template == null ? 
50 : null scaling_config { min_size = var.node_groups[count.index].min_size @@ -92,6 +92,15 @@ resource "aws_eks_node_group" "main" { max_size = var.node_groups[count.index].max_size } + # Only set launch_template if its node_group counterpart parameter is not null + dynamic "launch_template" { + for_each = var.node_groups[count.index].launch_template != null ? [var.node_groups[count.index].launch_template] : [] + content { + id = aws_launch_template.main[each.key].id + version = aws_launch_template.main[each.key].latest_version + } + } + labels = { "dedicated" = var.node_groups[count.index].name } diff --git a/src/_nebari/stages/infrastructure/template/aws/variables.tf b/src/_nebari/stages/infrastructure/template/aws/variables.tf index aa8beb7508..983180b957 100644 --- a/src/_nebari/stages/infrastructure/template/aws/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/variables.tf @@ -31,16 +31,23 @@ variable "kubernetes_version" { variable "node_groups" { description = "AWS node groups" type = list(object({ - name = string - instance_type = string - gpu = bool - min_size = number - desired_size = number - max_size = number - single_subnet = bool + name = string + instance_type = string + gpu = bool + min_size = number + desired_size = number + max_size = number + single_subnet = bool + launch_template = string + ami_type = string })) } +variable "node_launch_template" { + description = "Custom launch template for EKS nodes (placeholder)" + type = string +} + variable "availability_zones" { description = "AWS availability zones within AWS region" type = list(string) From b93361bcb65ae3c14009bfa093158cc894ff8a75 Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Fri, 6 Sep 2024 14:22:47 -0300 Subject: [PATCH 04/12] small cleanup refactoring of launch_template model rm unnecessary parameters & update template & set ami_type as private var --- src/_nebari/stages/infrastructure/__init__.py | 18 +++++++++++------- .../modules/kubernetes/files/user_data.tftpl 
| 16 +++++++++------- .../template/aws/modules/kubernetes/main.tf | 17 +++++------------ .../aws/modules/kubernetes/variables.tf | 6 +++--- .../infrastructure/template/aws/variables.tf | 6 +++--- 5 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 0675bbc1bc..d218bb0503 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -129,13 +129,15 @@ class AzureInputVars(schema.Base): class AWSNodeLaunchTemplate(schema.Base): - user_data: Optional[str] = None pre_bootstrap_command: Optional[str] = None - ebs_device_name: Optional[str] = "/dev/xvda" - ebs_volume_size: Optional[int] = 50 - ebs_volume_type: Optional[str] = "gp2" ami_id: Optional[str] = None - vars: Optional[Dict[str, str]] = None + + @field_validator("ami_id") + @classmethod + def _validate_ami_id(cls, value: Optional[str]) -> str: + if value is None: + raise ValueError("ami_id is required if pre_bootstrap_command is passed") + return value class AWSNodeGroupInputVars(schema.Base): @@ -147,8 +149,10 @@ class AWSNodeGroupInputVars(schema.Base): max_size: int single_subnet: bool permissions_boundary: Optional[str] = None - ami_type: Optional[Literal["AL2_x86_64", "AL2_x86_64_GPU", "CUSTOM"]] = "AL2_x86_64" launch_template: Optional[AWSNodeLaunchTemplate] = None + _ami_type: Optional[Literal["AL2_x86_64", "AL2_x86_64_GPU", "CUSTOM"]] = ( + "AL2_x86_64" + ) class AWSInputVars(schema.Base): @@ -841,7 +845,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): if not node_group.node_launch_template else node_group.node_launch_template ), - ami_type=( + _ami_type=( node_group.ami_type if not node_group.gpu else "AL2_x86_64_GPU" diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl index 
3bd7c37689..e7b3af9a21 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl @@ -4,19 +4,21 @@ Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" --==MYBOUNDARY== Content-Type: text/x-shellscript; charset="us-ascii" -%{ if node_prebootstrap_command != null }${node_prebootstrap_command}%{ endif } +%{ if node_prebootstrap_command != null } +${node_prebootstrap_command} +%{ endif } -%{ if split_user_data == true } +%{ if include_bootstrap_cmd == true } --==MYBOUNDARY== Content-Type: text/x-shellscript; charset="us-ascii" #!/bin/bash +set -ex -export CLUSTER_NAME="{{ cluster_name }}" -export CLUSTER_CERT_AUTHORITY="{{ cluster_cert_authority }}" -export CLUSTER_ENDPOINT="{{ cluster_endpoint }}" - -{{ user_data }} +CLUSTER_NAME="{{ cluster_name }}" +B64_CLUSTER_CA="{{ cluster_cert_authority }}" +API_SERVER_URL="{{ cluster_endpoint }}" +/etc/eks/bootstrap.sh $CLUSTER_NAME --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL %{ endif } --==MYBOUNDARY==-- diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index 33e0e8b74c..f5ce37f1c1 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -24,12 +24,6 @@ resource "aws_eks_cluster" "main" { ## If using a Custom AMI, then the /etc/eks/bootstrap cmds and args must be included/modified, ## otherwise, on default AWS EKS Node AMI, the bootstrap cmd is appended automatically resource "aws_launch_template" "main" { - # Invoke launch_template only if var.node_prebootstrap_command is not null or custom_ami is not null - # count = var.node_prebootstrap_command != null ? 
length(var.node_groups) : length(local.cust_ami_node_index) - # name = var.node_prebootstrap_command != null ? var.node_groups[count.index].name : var.node_groups[local.cust_ami_node_index[count.index]].name - # image_id = var.node_prebootstrap_command != null ? var.node_groups[count.index].custom_ami : var.node_groups[local.cust_ami_node_index[count.index]].custom_ami - # count = var.node_launch_template != null ? length(var.node_groups) : 0 - # name = var.node_launch_template != null ? var.node_groups[count.index].name : null for_each = { for node_group in var.node_groups : node_group.name => node_group @@ -49,10 +43,10 @@ resource "aws_launch_template" "main" { } block_device_mappings { - device_name = each.value.launch_template.ebs_device_name + device_name = "/dev/xvda" ebs { - volume_size = each.value.launch_template.ebs_volume_size - volume_type = each.value.launch_template.ebs_volume_type + volume_size = 50 + volume_type = "gp2" } } @@ -62,8 +56,7 @@ resource "aws_launch_template" "main" { "${path.module}/files/user_data.tftpl", { node_prebootstrap_command = each.value.launch_template.pre_bootstrap_command - user_data = each.value.launch_template.user_data - split_user_data = each.value.launch_template.user_data != null ? true : false + include_bootstrap_cmd = each.value._ami_type == "CUSTOM" ? true : false cluster_name = aws_eks_cluster.main.name cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data cluster_endpoint = aws_eks_cluster.main.endpoint @@ -83,7 +76,7 @@ resource "aws_eks_node_group" "main" { instance_types = [var.node_groups[count.index].instance_type] # ami_type = var.node_groups[count.index].gpu == true ? "AL2_x86_64_GPU" : # "AL2_x86_64" - ami_type = var.node_groups[count.index].ami_type + ami_type = var.node_groups[count.index]._ami_type disk_size = var.node_groups[count.index].launch_template == null ? 
50 : null scaling_config { diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index ffdebc1c9e..418730796d 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -51,8 +51,8 @@ variable "node_groups" { desired_size = number max_size = number single_subnet = bool - launch_template = string - ami_type = string + launch_template = map(any) + _ami_type = string })) } @@ -64,7 +64,7 @@ variable "node_group_instance_type" { variable "node_launch_template" { description = "Custom launch template for EKS nodes" - type = string + type = map(any) default = null } diff --git a/src/_nebari/stages/infrastructure/template/aws/variables.tf b/src/_nebari/stages/infrastructure/template/aws/variables.tf index 983180b957..cf4de783f4 100644 --- a/src/_nebari/stages/infrastructure/template/aws/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/variables.tf @@ -38,14 +38,14 @@ variable "node_groups" { desired_size = number max_size = number single_subnet = bool - launch_template = string - ami_type = string + launch_template = map(any) + _ami_type = string })) } variable "node_launch_template" { description = "Custom launch template for EKS nodes (placeholder)" - type = string + type = map(any) } variable "availability_zones" { From 4afb503d6c3af280a8610f0b34cf528077415ad1 Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Fri, 6 Sep 2024 14:46:09 -0300 Subject: [PATCH 05/12] use exclude for ami_type instead of private method --- src/_nebari/stages/infrastructure/__init__.py | 6 +++--- .../infrastructure/template/aws/modules/kubernetes/main.tf | 4 ++-- .../template/aws/modules/kubernetes/variables.tf | 2 +- src/_nebari/stages/infrastructure/template/aws/variables.tf | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index d218bb0503..a8dfeb3c12 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -150,8 +150,8 @@ class AWSNodeGroupInputVars(schema.Base): single_subnet: bool permissions_boundary: Optional[str] = None launch_template: Optional[AWSNodeLaunchTemplate] = None - _ami_type: Optional[Literal["AL2_x86_64", "AL2_x86_64_GPU", "CUSTOM"]] = ( - "AL2_x86_64" + ami_type: Optional[Literal["AL2_x86_64", "AL2_x86_64_GPU", "CUSTOM"]] = Field( + "AL2_x86_64", exclude=True ) @@ -845,7 +845,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): if not node_group.node_launch_template else node_group.node_launch_template ), - _ami_type=( + ami_type=( node_group.ami_type if not node_group.gpu else "AL2_x86_64_GPU" diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index f5ce37f1c1..ae3c365f2e 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -56,7 +56,7 @@ resource "aws_launch_template" "main" { "${path.module}/files/user_data.tftpl", { node_prebootstrap_command = each.value.launch_template.pre_bootstrap_command - include_bootstrap_cmd = each.value._ami_type == "CUSTOM" ? true : false + include_bootstrap_cmd = each.value.ami_type == "CUSTOM" ? true : false cluster_name = aws_eks_cluster.main.name cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data cluster_endpoint = aws_eks_cluster.main.endpoint @@ -76,7 +76,7 @@ resource "aws_eks_node_group" "main" { instance_types = [var.node_groups[count.index].instance_type] # ami_type = var.node_groups[count.index].gpu == true ? 
"AL2_x86_64_GPU" : # "AL2_x86_64" - ami_type = var.node_groups[count.index]._ami_type + ami_type = var.node_groups[count.index].ami_type disk_size = var.node_groups[count.index].launch_template == null ? 50 : null scaling_config { diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index 418730796d..d0a2160ae4 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -52,7 +52,7 @@ variable "node_groups" { max_size = number single_subnet = bool launch_template = map(any) - _ami_type = string + ami_type = string })) } diff --git a/src/_nebari/stages/infrastructure/template/aws/variables.tf b/src/_nebari/stages/infrastructure/template/aws/variables.tf index cf4de783f4..8f2706d9f1 100644 --- a/src/_nebari/stages/infrastructure/template/aws/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/variables.tf @@ -39,7 +39,7 @@ variable "node_groups" { max_size = number single_subnet = bool launch_template = map(any) - _ami_type = string + ami_type = string })) } From 7981ba9ce702cc8a8590205fcdf64cedc59df997 Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Wed, 11 Sep 2024 18:43:47 -0300 Subject: [PATCH 06/12] fix missing var name & fix deployment bug & rm validation restrictions --- src/_nebari/stages/infrastructure/__init__.py | 43 +++++++++++-------- .../infrastructure/template/aws/main.tf | 1 - .../aws/modules/kubernetes/variables.tf | 6 --- .../infrastructure/template/aws/variables.tf | 11 ----- .../stages/terraform_state/__init__.py | 8 +++- 5 files changed, 32 insertions(+), 37 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 701efad3ed..bd36d06190 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ 
b/src/_nebari/stages/infrastructure/__init__.py @@ -132,13 +132,6 @@ class AWSNodeLaunchTemplate(schema.Base): pre_bootstrap_command: Optional[str] = None ami_id: Optional[str] = None - @field_validator("ami_id") - @classmethod - def _validate_ami_id(cls, value: Optional[str]) -> str: - if value is None: - raise ValueError("ami_id is required if pre_bootstrap_command is passed") - return value - class AWSNodeGroupInputVars(schema.Base): name: str @@ -150,9 +143,28 @@ class AWSNodeGroupInputVars(schema.Base): single_subnet: bool permissions_boundary: Optional[str] = None launch_template: Optional[AWSNodeLaunchTemplate] = None - ami_type: Optional[Literal["AL2_x86_64", "AL2_x86_64_GPU", "CUSTOM"]] = Field( - "AL2_x86_64", exclude=True - ) + ami_type: Optional[str] = None + + @field_validator("ami_type", mode="before") + @classmethod + def _infer_and_validate_ami_type(cls, value, values) -> str: + gpu_enabled = values.get("gpu", False) + + # Auto-set ami_type if not provided + if not value: + if values.get("launch_template") and values["launch_template"].ami_id: + return "CUSTOM" + if gpu_enabled: + return "AL2_x86_64_GPU" + return "AL2_x86_64" + + # Explicit validation + if value == "AL2_x86_64" and gpu_enabled: + raise ValueError( + "ami_type 'AL2_x86_64' cannot be used with GPU enabled (gpu=True)." 
+ ) + + return value class AWSInputVars(schema.Base): @@ -162,7 +174,6 @@ class AWSInputVars(schema.Base): existing_subnet_ids: Optional[List[str]] = None region: str kubernetes_version: str - node_launch_template: Optional[AWSNodeLaunchTemplate] = None eks_endpoint_access: Optional[ Literal["private", "public", "public_and_private"] ] = "public" @@ -467,6 +478,7 @@ class AWSNodeGroup(schema.Base): gpu: bool = False single_subnet: bool = False permissions_boundary: Optional[str] = None + launch_template: Optional[AWSNodeLaunchTemplate] = None DEFAULT_AWS_NODE_GROUPS = { @@ -849,13 +861,8 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): permissions_boundary=node_group.permissions_boundary, launch_template=( self.config.amazon_web_services.node_launch_template - if not node_group.node_launch_template - else node_group.node_launch_template - ), - ami_type=( - node_group.ami_type - if not node_group.gpu - else "AL2_x86_64_GPU" + if not node_group.launch_template + else node_group.launch_template ), ) for name, node_group in self.config.amazon_web_services.node_groups.items() diff --git a/src/_nebari/stages/infrastructure/template/aws/main.tf b/src/_nebari/stages/infrastructure/template/aws/main.tf index 2b561ba049..feffd35291 100644 --- a/src/_nebari/stages/infrastructure/template/aws/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/main.tf @@ -97,7 +97,6 @@ module "kubernetes" { node_groups = var.node_groups - node_launch_template = var.node_launch_template endpoint_public_access = var.eks_endpoint_access == "private" ? false : true endpoint_private_access = var.eks_endpoint_access == "public" ? 
false : true public_access_cidrs = var.eks_public_access_cidrs diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf index 7cdedcdbac..4d38d10a19 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/variables.tf @@ -62,12 +62,6 @@ variable "node_group_instance_type" { default = "m5.large" } -variable "node_launch_template" { - description = "Custom launch template for EKS nodes" - type = map(any) - default = null -} - variable "endpoint_public_access" { type = bool default = true diff --git a/src/_nebari/stages/infrastructure/template/aws/variables.tf b/src/_nebari/stages/infrastructure/template/aws/variables.tf index 794d7eb318..a3f37b9eb9 100644 --- a/src/_nebari/stages/infrastructure/template/aws/variables.tf +++ b/src/_nebari/stages/infrastructure/template/aws/variables.tf @@ -43,11 +43,6 @@ variable "node_groups" { })) } -variable "node_launch_template" { - description = "Custom launch template for EKS nodes (placeholder)" - type = map(any) -} - variable "availability_zones" { description = "AWS availability zones within AWS region" type = list(string) @@ -63,12 +58,6 @@ variable "kubeconfig_filename" { type = string } -variable "node_launch_template" { - description = "Custom launch template for EKS nodes" - type = string - default = null -} - variable "eks_endpoint_access" { description = "EKS cluster api server endpoint access setting" type = string diff --git a/src/_nebari/stages/terraform_state/__init__.py b/src/_nebari/stages/terraform_state/__init__.py index d9afff36e4..97fb626524 100644 --- a/src/_nebari/stages/terraform_state/__init__.py +++ b/src/_nebari/stages/terraform_state/__init__.py @@ -2,6 +2,7 @@ import enum import functools import inspect +import json import os import pathlib import re @@ -261,11 +262,16 @@ def 
check_immutable_fields(self): nebari_config_diff = utils.JsonDiff( nebari_config_state.model_dump(), self.config.model_dump() ) - + # save both for testing: + with open("nebari_config_state.json", "w") as f: + f.write(json.dumps(nebari_config_state.model_dump(), indent=4)) + with open("nebari_config.json", "w") as f: + f.write(json.dumps(self.config.model_dump(), indent=4)) # check if any changed fields are immutable for keys, old, new in nebari_config_diff.modified(): bottom_level_schema = self.config if len(keys) > 1: + print(keys) bottom_level_schema = functools.reduce( lambda m, k: getattr(m, k), keys[:-1], self.config ) From 6aafcdca8056ac6d9eebcaeab0423810389cb8e3 Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Mon, 16 Sep 2024 17:50:15 -0300 Subject: [PATCH 07/12] fixes --- .../provider/cloud/amazon_web_services.py | 37 +++++++++++++- src/_nebari/provider/cloud/commons.py | 15 ++++++ src/_nebari/stages/infrastructure/__init__.py | 49 ++++++++++++++++--- .../template/aws/modules/kubernetes/main.tf | 28 +++++++---- .../stages/terraform_state/__init__.py | 22 ++++----- 5 files changed, 123 insertions(+), 28 deletions(-) diff --git a/src/_nebari/provider/cloud/amazon_web_services.py b/src/_nebari/provider/cloud/amazon_web_services.py index 1123c07fe0..cefbee276d 100644 --- a/src/_nebari/provider/cloud/amazon_web_services.py +++ b/src/_nebari/provider/cloud/amazon_web_services.py @@ -1,4 +1,5 @@ import functools +import json import os import re import time @@ -8,7 +9,10 @@ from botocore.exceptions import ClientError, EndpointConnectionError from _nebari.constants import AWS_ENV_DOCS -from _nebari.provider.cloud.commons import filter_by_highest_supported_k8s_version +from _nebari.provider.cloud.commons import ( + filter_amis_by_latest_version, + filter_by_highest_supported_k8s_version, +) from _nebari.utils import check_environment_variables from nebari import schema @@ -109,6 +113,37 @@ def kubernetes_versions(region: str) -> List[str]: return 
filter_by_highest_supported_k8s_version(supported_kubernetes_versions) +@functools.lru_cache() +def amis(region: str, k8s_version: str, ami_type: str) -> Dict[str, str]: + # do an ssm get-parameters-by-path to get the latest AMI for the k8s version + session = aws_session(region=region) + ami_ssm_format = { + "AL2_x86_64": "/aws/service/eks/optimized-ami/{}/amazon-linux-2", + "AL2_x86_64_GPU": "/aws/service/eks/optimized-ami/{}/amazon-linux-2-gpu", + } + ami_specifier = ami_ssm_format.get(ami_type).format(k8s_version) + if ami_specifier is None: + raise ValueError(f"Unsupported ami_type: {ami_type}") + + ssm_client = session.client("ssm") + paginator = ssm_client.get_paginator("get_parameters_by_path") + page_iterator = paginator.paginate( + Path=ami_specifier, + ) + ssm_param_name_list = [] + for page in page_iterator: + for parameter in page["Parameters"]: + values = json.loads(parameter["Value"]) + ssm_param_name_list.append( + { + "Name": values["image_name"], + "Value": values["image_id"], + "LastModifiedDate": parameter["LastModifiedDate"], + } + ) + return filter_amis_by_latest_version(ssm_param_name_list) + + @functools.lru_cache() def instances(region: str) -> Dict[str, str]: """Return dict of available instance types for the AWS region.""" diff --git a/src/_nebari/provider/cloud/commons.py b/src/_nebari/provider/cloud/commons.py index 566b2029a4..e7ac769c4d 100644 --- a/src/_nebari/provider/cloud/commons.py +++ b/src/_nebari/provider/cloud/commons.py @@ -12,3 +12,18 @@ def filter_by_highest_supported_k8s_version(k8s_versions_list): if version <= HIGHEST_SUPPORTED_K8S_VERSION: filtered_k8s_versions_list.append(k8s_version) return filtered_k8s_versions_list + + +def filter_amis_by_latest_version(amis_list): + print(amis_list) + latest_amis = {} + for ami in amis_list: + version = tuple( + filter(None, re.search(r"(\d+)\.(\d+)(?:\.(\d+))?", ami["Name"]).groups()) + ) + if version not in latest_amis: + latest_amis[version] = ami.pop("LastModifiedDate") + 
else: + if ami["LastModifiedDate"] > latest_amis[version]["LastModifiedDate"]: + latest_amis[version] = ami.pop("LastModifiedDate") + return list(latest_amis.values()) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index bd36d06190..d991dadb17 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -128,6 +128,12 @@ class AzureInputVars(schema.Base): workload_identity_enabled: bool = False +class AWSAmiTypes(enum.Enum): + AL2_x86_64 = "AL2_x86_64" + AL2_x86_64_GPU = "AL2_x86_64_GPU" + CUSTOM = "CUSTOM" + + class AWSNodeLaunchTemplate(schema.Base): pre_bootstrap_command: Optional[str] = None ami_id: Optional[str] = None @@ -142,8 +148,8 @@ class AWSNodeGroupInputVars(schema.Base): max_size: int single_subnet: bool permissions_boundary: Optional[str] = None + ami_type: Optional[AWSAmiTypes] = None launch_template: Optional[AWSNodeLaunchTemplate] = None - ami_type: Optional[str] = None @field_validator("ami_type", mode="before") @classmethod @@ -497,7 +503,6 @@ class AmazonWebServicesProvider(schema.Base): kubernetes_version: str availability_zones: Optional[List[str]] node_groups: Dict[str, AWSNodeGroup] = DEFAULT_AWS_NODE_GROUPS - node_launch_template: Optional[AWSNodeLaunchTemplate] = None eks_endpoint_access: Optional[ Literal["private", "public", "public_and_private"] ] = "public" @@ -546,6 +551,8 @@ def _check_input(cls, data: Any) -> Any: # check if instances are valid available_instances = amazon_web_services.instances(data["region"]) if "node_groups" in data: + # Cache for available AMIs per ami_type + available_amis_cache = {} for _, node_group in data["node_groups"].items(): instance = ( node_group["instance"] @@ -556,6 +563,38 @@ def _check_input(cls, data: Any) -> Any: raise ValueError( f"Amazon Web Services instance {node_group.instance} not one of available instance types={available_instances}" ) + + # Check if launch_template and ami_id 
are provided + print(available_amis_cache) + launch_template = getattr(node_group, "launch_template", None) + if ( + launch_template + and getattr(node_group, "ami_type", None) != "CUSTOM" + ): + if getattr(launch_template, "ami_id", None): + ami_id = launch_template.ami_id + ami_type = getattr(node_group, "ami_type", None) + + # Retrieve available AMIs from cache or API + if ami_type not in available_amis_cache: + available_amis_cache[ami_type] = amazon_web_services.amis( + region=data["region"], + k8s_version=data["kubernetes_version"], + ami_type=ami_type, + ) + + available_amis = available_amis_cache[ami_type] + + # Validate AMI ID + if ami_id not in available_amis: + raise ValueError( + f"Amazon Web Services AMI '{ami_id}' is not among the available AMIs: {available_amis} for AMI type '{ami_type}'" + ) + else: + raise ValueError( + "Launch template provided without AMI ID. Please provide an AMI ID." + ) + return data @@ -859,11 +898,7 @@ def input_vars(self, stage_outputs: Dict[str, Dict[str, Any]]): max_size=node_group.max_nodes, single_subnet=node_group.single_subnet, permissions_boundary=node_group.permissions_boundary, - launch_template=( - self.config.amazon_web_services.node_launch_template - if not node_group.launch_template - else node_group.launch_template - ), + launch_template=node_group.launch_template, ) for name, node_group in self.config.amazon_web_services.node_groups.items() ], diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index a4c7027981..f52002c209 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -77,8 +77,9 @@ resource "aws_eks_node_group" "main" { instance_types = [var.node_groups[count.index].instance_type] # ami_type = var.node_groups[count.index].gpu == true ? 
"AL2_x86_64_GPU" : # "AL2_x86_64" - ami_type = var.node_groups[count.index].ami_type - disk_size = var.node_groups[count.index].launch_template == null ? 50 : null + ami_type = var.node_groups[count.index].ami_type + # disk_size = var.node_groups[count.index].launch_template == null ? 50 : null + disk_size = 50 scaling_config { min_size = var.node_groups[count.index].min_size @@ -87,13 +88,22 @@ resource "aws_eks_node_group" "main" { } # Only set launch_template if its node_group counterpart parameter is not null - dynamic "launch_template" { - for_each = var.node_groups[count.index].launch_template != null ? [var.node_groups[count.index].launch_template] : [] - content { - id = aws_launch_template.main[each.key].id - version = aws_launch_template.main[each.key].latest_version - } - } + # dynamic "launch_template" { + # for_each = var.node_groups[count.index].launch_template != null ? [var.node_groups[count.index].launch_template] : [] + # content { + # id = aws_launch_template.main[each.key].id + # version = aws_launch_template.main[each.key].latest_version + # } + # } + # The "each" object can be used only in "module" or "resource" blocks, and only when + # the "for_each" argument is set. + # dynamic "launch_template" { + # for_each = var.node_groups[count.index].launch_template != null ? 
[var.node_groups[count.index].launch_template] : [] + # content { + # id = aws_launch_template.main + # version = launch_template.latest_version + # } + # } labels = { "dedicated" = var.node_groups[count.index].name diff --git a/src/_nebari/stages/terraform_state/__init__.py b/src/_nebari/stages/terraform_state/__init__.py index 97fb626524..d8895a9d99 100644 --- a/src/_nebari/stages/terraform_state/__init__.py +++ b/src/_nebari/stages/terraform_state/__init__.py @@ -1,8 +1,6 @@ import contextlib import enum -import functools import inspect -import json import os import pathlib import re @@ -262,19 +260,21 @@ def check_immutable_fields(self): nebari_config_diff = utils.JsonDiff( nebari_config_state.model_dump(), self.config.model_dump() ) - # save both for testing: - with open("nebari_config_state.json", "w") as f: - f.write(json.dumps(nebari_config_state.model_dump(), indent=4)) - with open("nebari_config.json", "w") as f: - f.write(json.dumps(self.config.model_dump(), indent=4)) # check if any changed fields are immutable for keys, old, new in nebari_config_diff.modified(): bottom_level_schema = self.config if len(keys) > 1: - print(keys) - bottom_level_schema = functools.reduce( - lambda m, k: getattr(m, k), keys[:-1], self.config - ) + for key in keys[:-1]: + try: + bottom_level_schema = getattr(bottom_level_schema, key) + except AttributeError as e: + if isinstance(bottom_level_schema, dict): + # handle case where value is a dict + bottom_level_schema = bottom_level_schema[key] + print(bottom_level_schema) + else: + raise e + extra_field_schema = schema.ExtraFieldSchema( **bottom_level_schema.model_fields[keys[-1]].json_schema_extra or {} ) From c211fa6949ebfd699c0d27df81f2a3dba1ede190 Mon Sep 17 00:00:00 2001 From: vinicius douglas cerutti Date: Tue, 17 Sep 2024 00:28:34 -0300 Subject: [PATCH 08/12] fixes on ami_id --- .../provider/cloud/amazon_web_services.py | 43 ++++++-------- src/_nebari/provider/cloud/commons.py | 15 ----- 
src/_nebari/stages/infrastructure/__init__.py | 59 +++++-------------- .../template/aws/modules/kubernetes/main.tf | 38 +++++------- 4 files changed, 49 insertions(+), 106 deletions(-) diff --git a/src/_nebari/provider/cloud/amazon_web_services.py b/src/_nebari/provider/cloud/amazon_web_services.py index cefbee276d..d0dc766089 100644 --- a/src/_nebari/provider/cloud/amazon_web_services.py +++ b/src/_nebari/provider/cloud/amazon_web_services.py @@ -9,10 +9,7 @@ from botocore.exceptions import ClientError, EndpointConnectionError from _nebari.constants import AWS_ENV_DOCS -from _nebari.provider.cloud.commons import ( - filter_amis_by_latest_version, - filter_by_highest_supported_k8s_version, -) +from _nebari.provider.cloud.commons import filter_by_highest_supported_k8s_version from _nebari.utils import check_environment_variables from nebari import schema @@ -114,34 +111,32 @@ def kubernetes_versions(region: str) -> List[str]: @functools.lru_cache() -def amis(region: str, k8s_version: str, ami_type: str) -> Dict[str, str]: +def amis(region: str, k8s_version: str, ami_type: str = None) -> Dict[str, str]: # do an ssm get-parameters-by-path to get the latest AMI for the k8s version session = aws_session(region=region) + ssm_client = session.client("ssm") ami_ssm_format = { "AL2_x86_64": "/aws/service/eks/optimized-ami/{}/amazon-linux-2", "AL2_x86_64_GPU": "/aws/service/eks/optimized-ami/{}/amazon-linux-2-gpu", } - ami_specifier = ami_ssm_format.get(ami_type).format(k8s_version) - if ami_specifier is None: + amis = {} + + if ami_type and ami_type not in ami_ssm_format: raise ValueError(f"Unsupported ami_type: {ami_type}") - ssm_client = session.client("ssm") - paginator = ssm_client.get_paginator("get_parameters_by_path") - page_iterator = paginator.paginate( - Path=ami_specifier, - ) - ssm_param_name_list = [] - for page in page_iterator: - for parameter in page["Parameters"]: - values = json.loads(parameter["Value"]) - ssm_param_name_list.append( - { - "Name": 
values["image_name"], - "Value": values["image_id"], - "LastModifiedDate": parameter["LastModifiedDate"], - } - ) - return filter_amis_by_latest_version(ssm_param_name_list) + for type, ssm_path_specifier in ami_ssm_format.items(): + if ami_type and ami_type != type: + continue + ami_specifier = ssm_path_specifier.format(k8s_version) + paginator = ssm_client.get_paginator("get_parameters_by_path") + page_iterator = paginator.paginate( + Path=ami_specifier, + ) + for page in page_iterator: + for parameter in page["Parameters"]: + values = json.loads(parameter["Value"]) + amis[values["image_id"]] = values["image_name"] + return amis @functools.lru_cache() diff --git a/src/_nebari/provider/cloud/commons.py b/src/_nebari/provider/cloud/commons.py index e7ac769c4d..566b2029a4 100644 --- a/src/_nebari/provider/cloud/commons.py +++ b/src/_nebari/provider/cloud/commons.py @@ -12,18 +12,3 @@ def filter_by_highest_supported_k8s_version(k8s_versions_list): if version <= HIGHEST_SUPPORTED_K8S_VERSION: filtered_k8s_versions_list.append(k8s_version) return filtered_k8s_versions_list - - -def filter_amis_by_latest_version(amis_list): - print(amis_list) - latest_amis = {} - for ami in amis_list: - version = tuple( - filter(None, re.search(r"(\d+)\.(\d+)(?:\.(\d+))?", ami["Name"]).groups()) - ) - if version not in latest_amis: - latest_amis[version] = ami.pop("LastModifiedDate") - else: - if ami["LastModifiedDate"] > latest_amis[version]["LastModifiedDate"]: - latest_amis[version] = ami.pop("LastModifiedDate") - return list(latest_amis.values()) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index d991dadb17..9d2f409782 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -547,53 +547,26 @@ def _check_input(cls, data: Any) -> Any: raise ValueError( f"Amazon Web Services availability zone={zone} is not one of {available_zones}" ) - - # check if instances are valid 
- available_instances = amazon_web_services.instances(data["region"]) + # check if instances and/or ami_ids are valid if "node_groups" in data: - # Cache for available AMIs per ami_type - available_amis_cache = {} + available_instances = set(amazon_web_services.instances(data["region"])) + # available_amis = set( + # amazon_web_services.amis(data["region"], data["kubernetes_version"]) + # ) + for _, node_group in data["node_groups"].items(): - instance = ( - node_group["instance"] - if hasattr(node_group, "__getitem__") - else node_group.instance - ) - if instance not in available_instances: + instance = node_group.get("instance") + if instance and instance not in available_instances: raise ValueError( - f"Amazon Web Services instance {node_group.instance} not one of available instance types={available_instances}" + f"Amazon Web Services instance '{instance}' is not among the available instance types for your region or account." ) - - # Check if launch_template and ami_id are provided - print(available_amis_cache) - launch_template = getattr(node_group, "launch_template", None) - if ( - launch_template - and getattr(node_group, "ami_type", None) != "CUSTOM" - ): - if getattr(launch_template, "ami_id", None): - ami_id = launch_template.ami_id - ami_type = getattr(node_group, "ami_type", None) - - # Retrieve available AMIs from cache or API - if ami_type not in available_amis_cache: - available_amis_cache[ami_type] = amazon_web_services.amis( - region=data["region"], - k8s_version=data["kubernetes_version"], - ami_type=ami_type, - ) - - available_amis = available_amis_cache[ami_type] - - # Validate AMI ID - if ami_id not in available_amis: - raise ValueError( - f"Amazon Web Services AMI '{ami_id}' is not among the available AMIs: {available_amis} for AMI type '{ami_type}'" - ) - else: - raise ValueError( - "Launch template provided without AMI ID. Please provide an AMI ID." 
- ) + # launch_template = node_group.get("launch_template") + # if launch_template: + # ami_id = launch_template.get("ami_id") + # if ami_id and ami_id not in available_amis: + # raise ValueError( + # f"Invalid AMI ID '{ami_id}' specified in launch_template." + # ) return data diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index f52002c209..ef5fd271fc 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -31,8 +31,9 @@ resource "aws_launch_template" "main" { if node_group.launch_template != null } - name = each.value.name - image_id = each.value.launch_template.ami_id + name = each.value.name + image_id = each.value.launch_template.ami_id + instance_type = each.value.instance_type vpc_security_group_ids = var.cluster_security_groups @@ -66,6 +67,7 @@ resource "aws_launch_template" "main" { ) } + resource "aws_eks_node_group" "main" { count = length(var.node_groups) @@ -74,12 +76,9 @@ resource "aws_eks_node_group" "main" { node_role_arn = aws_iam_role.node-group.arn subnet_ids = var.node_groups[count.index].single_subnet ? [element(var.cluster_subnets, 0)] : var.cluster_subnets - instance_types = [var.node_groups[count.index].instance_type] - # ami_type = var.node_groups[count.index].gpu == true ? "AL2_x86_64_GPU" : - # "AL2_x86_64" - ami_type = var.node_groups[count.index].ami_type - # disk_size = var.node_groups[count.index].launch_template == null ? 50 : null - disk_size = 50 + instance_types = var.node_groups[count.index].launch_template == null ? [var.node_groups[count.index].instance_type] : null + ami_type = var.node_groups[count.index].ami_type + disk_size = var.node_groups[count.index].launch_template == null ? 
50 : null scaling_config { min_size = var.node_groups[count.index].min_size @@ -88,22 +87,13 @@ resource "aws_eks_node_group" "main" { } # Only set launch_template if its node_group counterpart parameter is not null - # dynamic "launch_template" { - # for_each = var.node_groups[count.index].launch_template != null ? [var.node_groups[count.index].launch_template] : [] - # content { - # id = aws_launch_template.main[each.key].id - # version = aws_launch_template.main[each.key].latest_version - # } - # } - # The "each" object can be used only in "module" or "resource" blocks, and only when - # the "for_each" argument is set. - # dynamic "launch_template" { - # for_each = var.node_groups[count.index].launch_template != null ? [var.node_groups[count.index].launch_template] : [] - # content { - # id = aws_launch_template.main - # version = launch_template.latest_version - # } - # } + dynamic "launch_template" { + for_each = var.node_groups[count.index].launch_template != null ? [var.node_groups[count.index].launch_template] : [] + content { + id = aws_launch_template.main[var.node_groups[count.index].name].id + version = aws_launch_template.main[var.node_groups[count.index].name].latest_version + } + } labels = { "dedicated" = var.node_groups[count.index].name From 1f392e8caefbe26d22d9c10eb44d9c6750c91ed2 Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Tue, 17 Sep 2024 12:31:06 -0300 Subject: [PATCH 09/12] add try to assert block to inspect error --- tests/tests_unit/test_cli_validate.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/tests_unit/test_cli_validate.py b/tests/tests_unit/test_cli_validate.py index faf2efa8a1..07a931acd4 100644 --- a/tests/tests_unit/test_cli_validate.py +++ b/tests/tests_unit/test_cli_validate.py @@ -114,20 +114,26 @@ def test_cli_validate_from_env(): ["validate", "--config", tmp_file.resolve()], env={"NEBARI_SECRET__amazon_web_services__kubernetes_version": "1.20"}, ) - - assert 0 == 
valid_result.exit_code - assert not valid_result.exception - assert "Successfully validated configuration" in valid_result.stdout + try: + assert 0 == valid_result.exit_code + assert not valid_result.exception + assert "Successfully validated configuration" in valid_result.stdout + except AssertionError: + print(valid_result.stdout) + raise invalid_result = runner.invoke( app, ["validate", "--config", tmp_file.resolve()], env={"NEBARI_SECRET__amazon_web_services__kubernetes_version": "1.0"}, ) - - assert 1 == invalid_result.exit_code - assert invalid_result.exception - assert "Invalid `kubernetes-version`" in invalid_result.stdout + try: + assert 1 == invalid_result.exit_code + assert invalid_result.exception + assert "Invalid `kubernetes-version`" in invalid_result.stdout + except AssertionError: + print(invalid_result.stdout) + raise @pytest.mark.parametrize( From 50c6a5fccafae1468ecd7dbb09bfca705ed4e179 Mon Sep 17 00:00:00 2001 From: vinicius douglas cerutti Date: Wed, 18 Sep 2024 09:55:02 -0300 Subject: [PATCH 10/12] fix user_data and CUSTOM ami_type logic --- src/_nebari/stages/infrastructure/__init__.py | 1 - .../modules/kubernetes/files/user_data.tftpl | 20 ++++++++----------- .../template/aws/modules/kubernetes/main.tf | 20 +++++++++---------- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 81a83e3aab..5bc770ed65 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -169,7 +169,6 @@ def _infer_and_validate_ami_type(cls, value, values) -> str: raise ValueError( "ami_type 'AL2_x86_64' cannot be used with GPU enabled (gpu=True)." 
) - return value diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl index e7b3af9a21..278e9a6270 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/files/user_data.tftpl @@ -1,24 +1,20 @@ MIME-Version: 1.0 -Content-Type: multipart/mixed; boundary="==MYBOUNDARY==" +Content-Type: multipart/mixed; boundary="//" ---==MYBOUNDARY== +%{ if node_pre_bootstrap_command != null } +--// Content-Type: text/x-shellscript; charset="us-ascii" -%{ if node_prebootstrap_command != null } -${node_prebootstrap_command} +${node_pre_bootstrap_command} %{ endif } -%{ if include_bootstrap_cmd == true } ---==MYBOUNDARY== +%{ if include_bootstrap_cmd } +--// Content-Type: text/x-shellscript; charset="us-ascii" #!/bin/bash set -ex -CLUSTER_NAME="{{ cluster_name }}" -B64_CLUSTER_CA="{{ cluster_cert_authority }}" -API_SERVER_URL="{{ cluster_endpoint }}" - -/etc/eks/bootstrap.sh $CLUSTER_NAME --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL +/etc/eks/bootstrap.sh ${cluster_name} --b64-cluster-ca ${cluster_cert_authority} --apiserver-endpoint ${cluster_endpoint} %{ endif } ---==MYBOUNDARY==-- + --// diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index ef5fd271fc..0ee2143c79 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -31,9 +31,8 @@ resource "aws_launch_template" "main" { if node_group.launch_template != null } - name = each.value.name - image_id = each.value.launch_template.ami_id - instance_type = each.value.instance_type + name_prefix = "eks-${var.name}-${each.value.name}-" + 
image_id = each.value.launch_template.ami_id vpc_security_group_ids = var.cluster_security_groups @@ -57,11 +56,12 @@ resource "aws_launch_template" "main" { templatefile( "${path.module}/files/user_data.tftpl", { - node_prebootstrap_command = each.value.launch_template.pre_bootstrap_command - include_bootstrap_cmd = each.value.ami_type == "CUSTOM" ? true : false - cluster_name = aws_eks_cluster.main.name - cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data - cluster_endpoint = aws_eks_cluster.main.endpoint + node_pre_bootstrap_command = each.value.launch_template.pre_bootstrap_command + # This will ensure the boostrap user data is used to join the node + include_bootstrap_cmd = each.value.launch_template.ami_id != null ? true : false + cluster_name = aws_eks_cluster.main.name + cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data + cluster_endpoint = aws_eks_cluster.main.endpoint } ) ) @@ -76,7 +76,7 @@ resource "aws_eks_node_group" "main" { node_role_arn = aws_iam_role.node-group.arn subnet_ids = var.node_groups[count.index].single_subnet ? [element(var.cluster_subnets, 0)] : var.cluster_subnets - instance_types = var.node_groups[count.index].launch_template == null ? [var.node_groups[count.index].instance_type] : null + instance_types = [var.node_groups[count.index].instance_type] ami_type = var.node_groups[count.index].ami_type disk_size = var.node_groups[count.index].launch_template == null ? 50 : null @@ -88,7 +88,7 @@ resource "aws_eks_node_group" "main" { # Only set launch_template if its node_group counterpart parameter is not null dynamic "launch_template" { - for_each = var.node_groups[count.index].launch_template != null ? [var.node_groups[count.index].launch_template] : [] + for_each = var.node_groups[count.index].launch_template != null ? 
[0] : [] content { id = aws_launch_template.main[var.node_groups[count.index].name].id version = aws_launch_template.main[var.node_groups[count.index].name].latest_version From 749ae3844ec4b9c2e63c1ffa7d7132ce2d4bd59b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 12:55:20 +0000 Subject: [PATCH 11/12] [pre-commit.ci] Apply automatic pre-commit fixes --- .../infrastructure/template/aws/modules/kubernetes/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf index 0ee2143c79..5b66201f83 100644 --- a/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf +++ b/src/_nebari/stages/infrastructure/template/aws/modules/kubernetes/main.tf @@ -57,7 +57,7 @@ resource "aws_launch_template" "main" { "${path.module}/files/user_data.tftpl", { node_pre_bootstrap_command = each.value.launch_template.pre_bootstrap_command - # This will ensure the boostrap user data is used to join the node + # This will ensure the bootstrap user data is used to join the node include_bootstrap_cmd = each.value.launch_template.ami_id != null ? 
true : false cluster_name = aws_eks_cluster.main.name cluster_cert_authority = aws_eks_cluster.main.certificate_authority[0].data From 5a6bda35337d3ddbde8e8f411bd841f9c0ab625f Mon Sep 17 00:00:00 2001 From: viniciusdc Date: Wed, 18 Sep 2024 11:30:25 -0300 Subject: [PATCH 12/12] rm aux aws.amis method --- .../provider/cloud/amazon_web_services.py | 30 ------------------- src/_nebari/stages/infrastructure/__init__.py | 26 +++++++--------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/src/_nebari/provider/cloud/amazon_web_services.py b/src/_nebari/provider/cloud/amazon_web_services.py index d0dc766089..1123c07fe0 100644 --- a/src/_nebari/provider/cloud/amazon_web_services.py +++ b/src/_nebari/provider/cloud/amazon_web_services.py @@ -1,5 +1,4 @@ import functools -import json import os import re import time @@ -110,35 +109,6 @@ def kubernetes_versions(region: str) -> List[str]: return filter_by_highest_supported_k8s_version(supported_kubernetes_versions) -@functools.lru_cache() -def amis(region: str, k8s_version: str, ami_type: str = None) -> Dict[str, str]: - # do an ssm get-parameters-by-path to get the latest AMI for the k8s version - session = aws_session(region=region) - ssm_client = session.client("ssm") - ami_ssm_format = { - "AL2_x86_64": "/aws/service/eks/optimized-ami/{}/amazon-linux-2", - "AL2_x86_64_GPU": "/aws/service/eks/optimized-ami/{}/amazon-linux-2-gpu", - } - amis = {} - - if ami_type and ami_type not in ami_ssm_format: - raise ValueError(f"Unsupported ami_type: {ami_type}") - - for type, ssm_path_specifier in ami_ssm_format.items(): - if ami_type and ami_type != type: - continue - ami_specifier = ssm_path_specifier.format(k8s_version) - paginator = ssm_client.get_paginator("get_parameters_by_path") - page_iterator = paginator.paginate( - Path=ami_specifier, - ) - for page in page_iterator: - for parameter in page["Parameters"]: - values = json.loads(parameter["Value"]) - amis[values["image_id"]] = values["image_name"] - return amis 
- - @functools.lru_cache() def instances(region: str) -> Dict[str, str]: """Return dict of available instance types for the AWS region.""" diff --git a/src/_nebari/stages/infrastructure/__init__.py b/src/_nebari/stages/infrastructure/__init__.py index 5bc770ed65..682e9d50b6 100644 --- a/src/_nebari/stages/infrastructure/__init__.py +++ b/src/_nebari/stages/infrastructure/__init__.py @@ -545,26 +545,20 @@ def _check_input(cls, data: Any) -> Any: raise ValueError( f"Amazon Web Services availability zone={zone} is not one of {available_zones}" ) - # check if instances and/or ami_ids are valid - if "node_groups" in data: - available_instances = set(amazon_web_services.instances(data["region"])) - # available_amis = set( - # amazon_web_services.amis(data["region"], data["kubernetes_version"]) - # ) + # check if instances are valid + available_instances = amazon_web_services.instances(data["region"]) + if "node_groups" in data: for _, node_group in data["node_groups"].items(): - instance = node_group.get("instance") - if instance and instance not in available_instances: + instance = ( + node_group["instance"] + if hasattr(node_group, "__getitem__") + else node_group.instance + ) + if instance not in available_instances: raise ValueError( - f"Amazon Web Services instance '{instance}' is not among the available instance types for your region or account." + f"Amazon Web Services instance {node_group.instance} not one of available instance types={available_instances}" ) - # launch_template = node_group.get("launch_template") - # if launch_template: - # ami_id = launch_template.get("ami_id") - # if ami_id and ami_id not in available_amis: - # raise ValueError( - # f"Invalid AMI ID '{ami_id}' specified in launch_template." - # ) return data