From 2856477baea239f94e1f477b3110cc25325ec920 Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Fri, 21 Jun 2024 18:33:22 +0530 Subject: [PATCH 1/8] GKE ML Node group creation optional add --- terraform/modules/galileo-gke/README.md | 128 +++++++++--------- .../modules/galileo-gke/examples/main.tf | 2 + .../modules/galileo-gke/examples/variables.tf | 12 ++ terraform/modules/galileo-gke/main.tf | 36 +++-- terraform/modules/galileo-gke/variable.tf | 98 ++++++++------ 5 files changed, 160 insertions(+), 116 deletions(-) diff --git a/terraform/modules/galileo-gke/README.md b/terraform/modules/galileo-gke/README.md index 798c5f9..a099042 100644 --- a/terraform/modules/galileo-gke/README.md +++ b/terraform/modules/galileo-gke/README.md @@ -1,64 +1,66 @@ -# Galileo terraform GKE cluster - -Terraform module which creates GKE and IAM resources requred to deploy Galileo. - -## Prerequisites - -- Enabling services as referenced here https://cloud.google.com/migrate/containers/docs/config-dev-env#enabling_required_services" -- VPC network with secondary IP address range (`pods_subnet_name`, `service_subnet_name`) https://cloud.google.com/kubernetes-engine/docs/concepts/alias-ips - - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >=0.13 | -| [google](#requirement\_google) | >= 4.36.0, < 5.0 | -| [kubernetes](#requirement\_kubernetes) | ~> 2.10 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 4.36.0, < 5.0 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [galileo\_gke](#module\_galileo\_gke) | terraform-google-modules/kubernetes-engine/google | 23.3.0 | - -## Resources - -| Name | Type | -|------|------| -| [kubernetes_service_account.duplo_admin_user](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account) | resource | -| 
[kubernetes_cluster_role_binding.duplo_admin_user_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource | -| [kubernetes_secret_v1.duplo_admin_user_secret](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | -| [google_service_account_iam_binding.workloadidentity](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/service_account_iam_binding) | resource | -| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | -| [google_project.galileo](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [cluster\_name](#input\_cluster\_name) | The name of the cluster | `string` | `"galileo"` | no | -| [kubernetes\_version](#input\_kubernetes\_version) | The Kubernetes version of the masters | `string` | `"1.23"` | no | -| [network](#input\_network) | The VPC network to host the cluster in | `string` | n/a | yes | -| [pods\_subnet\_name](#input\_pods\_subnet\_name) | The name of the secondary subnet ip range to use for pods | `string` | n/a | yes | -| [region](#input\_region) | The region to host the cluster in | `string` | `"us-central1"` | no | -| [service\_subnet\_name](#input\_service\_subnet\_name) | The name of the secondary subnet range to use for services | `string` | n/a | yes | -| [subnetwork](#input\_subnetwork) | The subnetwork to host the cluster in | `string` | n/a | yes | -| [zones](#input\_zones) | The zones to host the cluster in | `list(string)` |
[| no | - -## Outputs - -| Name | Description | -|------|-------------| -| [ca\_certificate](#output\_ca\_certificate) | Cluster ca certificate (base64 encoded) | -| [admin\_token](#output\_admin\_token) | Cluster admin token | -| [cluster\_id](#output\_cluster\_id) | Cluster ID | -| [endpoint](#output\_endpoint) | Cluster endpoint | -| [node\_pools\_names](#output\_node\_pools\_names) | List of node pools names | +# Galileo terraform GKE cluster + +Terraform module which creates GKE and IAM resources required to deploy Galileo. + +## Prerequisites + +- Enabling services as referenced here https://cloud.google.com/migrate/containers/docs/config-dev-env#enabling_required_services +- VPC network with secondary IP address range (`pods_subnet_name`, `service_subnet_name`) https://cloud.google.com/kubernetes-engine/docs/concepts/alias-ips + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >=0.13 | +| [google](#requirement\_google) | >= 4.36.0, < 5.0 | +| [kubernetes](#requirement\_kubernetes) | ~> 2.10 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 4.36.0, < 5.0 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [galileo\_gke](#module\_galileo\_gke) | terraform-google-modules/kubernetes-engine/google | 23.3.0 | + +## Resources + +| Name | Type | +|------|------| +| [kubernetes_service_account.duplo_admin_user](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/service_account) | resource | +| [kubernetes_cluster_role_binding.duplo_admin_user_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource | +| [kubernetes_secret_v1.duplo_admin_user_secret](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/secret_v1) | resource | +| 
[google_service_account_iam_binding.workloadidentity](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/service_account_iam_binding) | resource | +| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | +| [google_project.galileo](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/project) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [cluster\_name](#input\_cluster\_name) | The name of the cluster | `string` | `"galileo"` | no | +| [kubernetes\_version](#input\_kubernetes\_version) | The Kubernetes version of the masters | `string` | `"1.23"` | no | +| [network](#input\_network) | The VPC network to host the cluster in | `string` | n/a | yes | +| [pods\_subnet\_name](#input\_pods\_subnet\_name) | The name of the secondary subnet ip range to use for pods | `string` | n/a | yes | +| [region](#input\_region) | The region to host the cluster in | `string` | `"us-central1"` | no | +| [service\_subnet\_name](#input\_service\_subnet\_name) | The name of the secondary subnet range to use for services | `string` | n/a | yes | +| [subnetwork](#input\_subnetwork) | The subnetwork to host the cluster in | `string` | n/a | yes | +| [zones](#input\_zones) | The zones to host the cluster in | `list(string)` |
"us-central1-c"
]
[| no | +| [create\_ml\_node\_group](#input\_create\_ml\_node\_group) | Controls if ML node group should be created or not | `bool` | `false` | no | +| [ml\_node\_size](#input\_ml\_node\_size) | ML node instance size to use | `string` | `g2-standard-8` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [ca\_certificate](#output\_ca\_certificate) | Cluster ca certificate (base64 encoded) | +| [admin\_token](#output\_admin\_token) | Cluster admin token | +| [cluster\_id](#output\_cluster\_id) | Cluster ID | +| [endpoint](#output\_endpoint) | Cluster endpoint | +| [node\_pools\_names](#output\_node\_pools\_names) | List of node pools names | \ No newline at end of file diff --git a/terraform/modules/galileo-gke/examples/main.tf b/terraform/modules/galileo-gke/examples/main.tf index db01bc5..4fe1090 100644 --- a/terraform/modules/galileo-gke/examples/main.tf +++ b/terraform/modules/galileo-gke/examples/main.tf @@ -20,4 +20,6 @@ module "galileo" { service_subnet_name = var.service_subnet_name kubernetes_version = var.kubernetes_version zones = var.zones + create_ml_node_group = var.create_ml_node_group + ml_node_size = var.ml_node_size } diff --git a/terraform/modules/galileo-gke/examples/variables.tf b/terraform/modules/galileo-gke/examples/variables.tf index 9f7f3e1..34e70f6 100644 --- a/terraform/modules/galileo-gke/examples/variables.tf +++ b/terraform/modules/galileo-gke/examples/variables.tf @@ -42,3 +42,15 @@ variable "kubernetes_version" { type = string description = "The Kubernetes version of the masters" } + +variable "create_ml_node_group" { + description = "Set to true to launch ML node group / workers instances" + type = bool + default = false +} + +variable "ml_node_size" { + description = "ML/GPU node size. 
Defaults to `g2-standard-8`" + type = string + default = "g2-standard-8" +} diff --git a/terraform/modules/galileo-gke/main.tf b/terraform/modules/galileo-gke/main.tf index 18d90c2..c7e40fe 100644 --- a/terraform/modules/galileo-gke/main.tf +++ b/terraform/modules/galileo-gke/main.tf @@ -46,7 +46,7 @@ module "galileo_gke" { gpu_resources = [] } - node_pools = [ + node_pools = concat([ { name = "galileo-core" machine_type = "e2-standard-4" @@ -71,18 +71,30 @@ module "galileo_gke" { auto_upgrade = true initial_node_count = 1 }, + var.create_ml_node_group ? + [{ + name = "galileo-ml" + machine_type = var.ml_node_size + image_type = "COS_CONTAINERD" + min_count = 1 + max_count = 5 + disk_size_gb = 100 + disk_type = "pd-standard" + auto_repair = true + auto_upgrade = true + initial_node_count = 1 + accelerator_count = 1 + accelerator_type = "nvidia-l4" + gpu_driver_version = "LATEST" + gpu_sharing_strategy = "TIME_SHARING" + max_shared_clients_per_gpu = 2 + }] + : [] ] + ) node_pools_oauth_scopes = { - galileo-core = [ - "https://www.googleapis.com/auth/devstorage.read_write", - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring", - "https://www.googleapis.com/auth/servicecontrol", - "https://www.googleapis.com/auth/service.management.readonly", - "https://www.googleapis.com/auth/trace.append", - ] - galileo-runner = [ + all = [ "https://www.googleapis.com/auth/devstorage.read_write", "https://www.googleapis.com/auth/logging.write", "https://www.googleapis.com/auth/monitoring", @@ -99,6 +111,10 @@ module "galileo_gke" { galileo-runners = { galileo-node-type = "galileo-runner" } + + galileo-ml = { + galileo-node-type = "galileo-ml" + } } } diff --git a/terraform/modules/galileo-gke/variable.tf b/terraform/modules/galileo-gke/variable.tf index 7fe372f..03c549c 100644 --- a/terraform/modules/galileo-gke/variable.tf +++ b/terraform/modules/galileo-gke/variable.tf @@ -1,43 +1,55 @@ -variable "cluster_name" { - type = string - 
description = "The name of the cluster" - default = "galileo" -} - -variable "region" { - type = string - description = "The region to host the cluster in" - default = "us-central1" -} - -variable "zones" { - type = list(string) - description = "The zones to host the cluster in" - default = ["us-central1-c"] -} - -variable "network" { - type = string - description = "The VPC network to host the cluster in" -} - -variable "subnetwork" { - type = string - description = "The subnetwork to host the cluster in" -} - -variable "kubernetes_version" { - type = string - description = "The Kubernetes version of the masters" - default = "1.23" -} - -variable "pod_subnet_name" { - type = string - description = "The name of the secondary subnet ip range to use for pods" -} - -variable "service_subnet_name" { - type = string - description = "The name of the secondary subnet range to use for services" -} +variable "cluster_name" { + type = string + description = "The name of the cluster" + default = "galileo" +} + +variable "region" { + type = string + description = "The region to host the cluster in" + default = "us-central1" +} + +variable "zones" { + type = list(string) + description = "The zones to host the cluster in" + default = ["us-central1-c"] +} + +variable "network" { + type = string + description = "The VPC network to host the cluster in" +} + +variable "subnetwork" { + type = string + description = "The subnetwork to host the cluster in" +} + +variable "kubernetes_version" { + type = string + description = "The Kubernetes version of the masters" + default = "1.23" +} + +variable "pod_subnet_name" { + type = string + description = "The name of the secondary subnet ip range to use for pods" +} + +variable "service_subnet_name" { + type = string + description = "The name of the secondary subnet range to use for services" +} + +variable "create_ml_node_group" { + description = "Set to true to launch ML node group / workers instances" + type = bool + default = false +} + 
+variable "ml_node_size" { + description = "ML/GPU node size. Defaults to `g2-standard-8`" + type = string + default = "g2-standard-8" +} From c06688f243157ea963f485a3879455cc0dbb1a69 Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Fri, 21 Jun 2024 19:06:30 +0530 Subject: [PATCH 2/8] Line ending changes --- terraform/modules/galileo-gke/variable.tf | 110 +++++++++++----------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/terraform/modules/galileo-gke/variable.tf b/terraform/modules/galileo-gke/variable.tf index 03c549c..e2611b2 100644 --- a/terraform/modules/galileo-gke/variable.tf +++ b/terraform/modules/galileo-gke/variable.tf @@ -1,55 +1,55 @@ -variable "cluster_name" { - type = string - description = "The name of the cluster" - default = "galileo" -} - -variable "region" { - type = string - description = "The region to host the cluster in" - default = "us-central1" -} - -variable "zones" { - type = list(string) - description = "The zones to host the cluster in" - default = ["us-central1-c"] -} - -variable "network" { - type = string - description = "The VPC network to host the cluster in" -} - -variable "subnetwork" { - type = string - description = "The subnetwork to host the cluster in" -} - -variable "kubernetes_version" { - type = string - description = "The Kubernetes version of the masters" - default = "1.23" -} - -variable "pod_subnet_name" { - type = string - description = "The name of the secondary subnet ip range to use for pods" -} - -variable "service_subnet_name" { - type = string - description = "The name of the secondary subnet range to use for services" -} - -variable "create_ml_node_group" { - description = "Set to true to launch ML node group / workers instances" - type = bool - default = false -} - -variable "ml_node_size" { - description = "ML/GPU node size. 
Defaults to `g2-standard-8`" - type = string - default = "g2-standard-8" -} +variable "cluster_name" { + type = string + description = "The name of the cluster" + default = "galileo" +} + +variable "region" { + type = string + description = "The region to host the cluster in" + default = "us-central1" +} + +variable "zones" { + type = list(string) + description = "The zones to host the cluster in" + default = ["us-central1-c"] +} + +variable "network" { + type = string + description = "The VPC network to host the cluster in" +} + +variable "subnetwork" { + type = string + description = "The subnetwork to host the cluster in" +} + +variable "kubernetes_version" { + type = string + description = "The Kubernetes version of the masters" + default = "1.23" +} + +variable "pod_subnet_name" { + type = string + description = "The name of the secondary subnet ip range to use for pods" +} + +variable "service_subnet_name" { + type = string + description = "The name of the secondary subnet range to use for services" +} + +variable "create_ml_node_group" { + description = "Set to true to launch ML node group / workers instances" + type = bool + default = false +} + +variable "ml_node_size" { + description = "ML/GPU node size. Defaults to `g2-standard-8`" + type = string + default = "g2-standard-8" +} From 24b9050101a628c0ac8dd4fb2593c1944ac703d9 Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Fri, 21 Jun 2024 22:50:05 +0530 Subject: [PATCH 3/8] Fixing syntax issue --- terraform/modules/galileo-gke/main.tf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/terraform/modules/galileo-gke/main.tf b/terraform/modules/galileo-gke/main.tf index c7e40fe..2832cba 100644 --- a/terraform/modules/galileo-gke/main.tf +++ b/terraform/modules/galileo-gke/main.tf @@ -70,7 +70,7 @@ module "galileo_gke" { auto_repair = true auto_upgrade = true initial_node_count = 1 - }, + },], var.create_ml_node_group ? 
[{ name = "galileo-ml" @@ -90,7 +90,6 @@ module "galileo_gke" { max_shared_clients_per_gpu = 2 }] : [] - ] ) node_pools_oauth_scopes = { From a3ffe8f0a153442613efdff096cae4b80e1acd5d Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Mon, 24 Jun 2024 20:00:14 +0530 Subject: [PATCH 4/8] Updating disktype to pd-balanced as pd-standard is not valid for ML instance type --- terraform/modules/galileo-gke/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/galileo-gke/main.tf b/terraform/modules/galileo-gke/main.tf index 2832cba..1dd6a6b 100644 --- a/terraform/modules/galileo-gke/main.tf +++ b/terraform/modules/galileo-gke/main.tf @@ -79,7 +79,7 @@ module "galileo_gke" { min_count = 1 max_count = 5 disk_size_gb = 100 - disk_type = "pd-standard" + disk_type = "pd-balanced" auto_repair = true auto_upgrade = true initial_node_count = 1 From 5fa18cfcca9a695c41a66bfb5fd1f1560125597c Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Tue, 25 Jun 2024 13:53:25 +0530 Subject: [PATCH 5/8] Moving to latest version of GKE module to auto install GPU drivers --- terraform/modules/galileo-gke/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/galileo-gke/main.tf b/terraform/modules/galileo-gke/main.tf index 1dd6a6b..807ba13 100644 --- a/terraform/modules/galileo-gke/main.tf +++ b/terraform/modules/galileo-gke/main.tf @@ -17,7 +17,7 @@ provider "kubernetes" { module "galileo_gke" { source = "terraform-google-modules/kubernetes-engine/google" - version = "23.3.0" + version = "31.0.0" project_id = data.google_project.galileo.project_id name = var.cluster_name region = var.region From 05ac350c4b68b53a04676d61c86b98878c872c96 Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Tue, 25 Jun 2024 18:00:25 +0530 Subject: [PATCH 6/8] Updating google versions --- 
terraform/modules/galileo-gke/versions.tf | 28 +++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/terraform/modules/galileo-gke/versions.tf b/terraform/modules/galileo-gke/versions.tf index e36b8ef..d72b838 100644 --- a/terraform/modules/galileo-gke/versions.tf +++ b/terraform/modules/galileo-gke/versions.tf @@ -1,14 +1,14 @@ -terraform { - required_version = ">=0.13" - - required_providers { - google = { - source = "hashicorp/google" - version = ">= 4.36.0, < 5.0" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = "~> 2.10" - } - } -} +terraform { + required_version = ">=0.13" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.25.0, < 6.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.10" + } + } +} From 8d8effac5b17109dcb68633a72b1009c3274c9bb Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Tue, 25 Jun 2024 18:06:07 +0530 Subject: [PATCH 7/8] setting new mandatory fields --- terraform/modules/galileo-gke/main.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/terraform/modules/galileo-gke/main.tf b/terraform/modules/galileo-gke/main.tf index 807ba13..f04776f 100644 --- a/terraform/modules/galileo-gke/main.tf +++ b/terraform/modules/galileo-gke/main.tf @@ -44,6 +44,9 @@ module "galileo_gke" { min_memory_gb = 0 max_memory_gb = 200 gpu_resources = [] + auto_repair = true + auto_upgrade = true + autoscaling_profile = "BALANCED" } node_pools = concat([ From 9d09fd32cf8d84d2ddeb26d579d4fa37d40dbd72 Mon Sep 17 00:00:00 2001 From: Jeeva Kumar <50436466+jeeva-duplo@users.noreply.github.com> Date: Tue, 25 Jun 2024 23:28:22 +0530 Subject: [PATCH 8/8] Updating kubernetes version to 1.29 --- terraform/modules/galileo-gke/variable.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/modules/galileo-gke/variable.tf b/terraform/modules/galileo-gke/variable.tf index e2611b2..2ead86f 100644 
--- a/terraform/modules/galileo-gke/variable.tf +++ b/terraform/modules/galileo-gke/variable.tf @@ -29,7 +29,7 @@ variable "subnetwork" { variable "kubernetes_version" { type = string description = "The Kubernetes version of the masters" - default = "1.23" + default = "1.29" } variable "pod_subnet_name" {
"us-central1-c"
]