Skip to content

Commit

Permalink
Release v1.1.0 (#295)
Browse files Browse the repository at this point in the history
  • Loading branch information
soumyapani authored Aug 24, 2023
2 parents 1cf9ecd + 4d511bb commit 697eacc
Show file tree
Hide file tree
Showing 17 changed files with 154 additions and 18 deletions.
2 changes: 2 additions & 0 deletions a3/terraform/modules/cluster/gke-beta/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ No requirements.
| [google_project_iam_member.node_service_account_monitoringViewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource |
| [null_resource.gke-cluster-command](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [null_resource.gke-node-pool-command](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [null_resource.gke-node-pool-resize-command](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [null_resource.kubernetes-setup-command](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [google_client_config.current](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
| [google_compute_default_service_account.account](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source |
Expand All @@ -59,6 +60,7 @@ No requirements.
| <a name="input_node_service_account"></a> [node\_service\_account](#input\_node\_service\_account) | The service account to be used by the Node VMs. If not specified, the "default" service account is used.<br><br>Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#nested_node_config), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--service-account). | `string` | `null` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | GCP Project ID to which the cluster will be deployed. | `string` | n/a | yes |
| <a name="input_region"></a> [region](#input\_region) | The region in which the cluster master will be created. The cluster will be a regional cluster with multiple masters spread across zones in the region, and with default node locations in those zones as well. | `string` | n/a | yes |
| <a name="input_resize_node_counts"></a> [resize\_node\_counts](#input\_resize\_node\_counts) | The list of resized node counts for node pools of the GKE cluster.<br>The resize node count is used in the same order as the node pool, i.e: the first resize node count will be applied for the first node pool and so on. | `list(number)` | `[]` | no |
| <a name="input_resource_prefix"></a> [resource\_prefix](#input\_resource\_prefix) | Arbitrary string with which all names of newly created resources will be prefixed. | `string` | n/a | yes |

## Outputs
Expand Down
36 changes: 35 additions & 1 deletion a3/terraform/modules/cluster/gke-beta/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ resource "null_resource" "gke-node-pool-command" {
node_count = each.value.node_count
disk_type = var.disk_type
disk_size = var.disk_size_gb
resource_policy = "${var.resource_prefix}-${each.key}"
resource_policy = module.resource_policy[tonumber(each.key)].resource_name
gke_endpoint = local.gke_endpoint_value
network_1 = "network=${module.network.network_names[1]},subnetwork=${module.network.subnetwork_names[1]}"
network_2 = "network=${module.network.network_names[2]},subnetwork=${module.network.subnetwork_names[2]}"
Expand Down Expand Up @@ -186,6 +186,40 @@ resource "null_resource" "gke-node-pool-command" {
depends_on = [null_resource.gke-cluster-command, module.network]
}

# Runs scripts/gke_node_pool_resize.sh once per entry of var.resize_node_counts.
# The entry at list index N is applied to the node pool named "np-N".
resource "null_resource" "gke-node-pool-resize-command" {
  # Keyed by list index so that each.key lines up with the "np-<index>" name.
  for_each = { for idx, rnc in var.resize_node_counts : idx => rnc }

  # Node pools must have been created before they can be resized.
  depends_on = [null_resource.gke-node-pool-command]

  # Everything the provisioner needs is stored in triggers so it can be read
  # back through self.triggers inside the local-exec block.
  triggers = {
    cluster_name   = var.resource_prefix
    gke_endpoint   = local.gke_endpoint_value
    node_count     = each.value
    node_pool_name = "np-${each.key}"
    project_id     = var.project_id
    region         = var.region
  }

  provisioner "local-exec" {
    when        = create
    on_failure  = fail
    interpreter = ["/bin/bash", "-c"]
    # Points gcloud's container API at the endpoint recorded in triggers.
    environment = {
      CLOUDSDK_API_ENDPOINT_OVERRIDES_CONTAINER = "${self.triggers.gke_endpoint}"
    }
    # Positional arguments: project id, cluster name, node pool name, region,
    # node count (the order expected by the script's main function).
    command = <<-EOT
      ${path.module}/scripts/gke_node_pool_resize.sh \
      ${self.triggers.project_id} \
      ${self.triggers.cluster_name} \
      ${self.triggers.node_pool_name} \
      ${self.triggers.region} \
      ${self.triggers.node_count}
    EOT
  }
}

# Exposes the cluster name stored in the triggers of the gke-cluster-command
# null_resource.
output "gke-cluster-name" {
value = null_resource.gke-cluster-command.triggers.cluster_name
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ gke_node_pool::create () {
--machine-type='a3-highgpu-8g' \
--node-locations="${zone}" \
--num-nodes="${node_count}" \
--node-labels="cloud.google.com/gke-kdump-enabled=true" \
--placement-policy="${resource_policy}" \
--project="${project_id}" \
--scopes "https://www.googleapis.com/auth/cloud-platform" \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Resizes an existing GKE node pool to the requested number of nodes.
#
# Reads these variables from the caller's scope (they are set by main):
#   project_id, cluster_name, node_pool_name, region, node_count
#
# Returns 1, after logging to stderr, when the node pool is not found in the
# cluster or when the gcloud resize command fails.
gke_node_pool_resize::resize () {
    echo "Checking if node pool '${node_pool_name}' already exists in cluster '${cluster_name}'..." >&2
    # Count the node pools whose name passes the filter; exactly one match is
    # required. NOTE(review): the paired <= / >= comparison presumably forces
    # an exact name match -- confirm against gcloud filter semantics.
    local -r matching_node_pools=$(
        gcloud container node-pools list \
            --cluster="${cluster_name}" \
            --filter="name<=${node_pool_name} AND name>=${node_pool_name}" \
            --format='value(name)' \
            --project="${project_id}" \
            --region="${region}" \
        | wc -l)
    [ "${matching_node_pools}" -eq 1 ] || {
        echo "Node pool '${node_pool_name}' doesn't exist in cluster '${cluster_name}'."
        return 1
    } >&2

    echo "Resizing node pool '${node_pool_name}' in cluster '${cluster_name}'..." >&2
    # --project is passed explicitly so the resize targets the same project
    # as the existence check above rather than gcloud's configured default.
    gcloud beta container clusters resize "${cluster_name}" \
        --project="${project_id}" \
        --region="${region}" \
        --num-nodes="${node_count}" \
        --node-pool="${node_pool_name}" \
        --quiet || {
        echo "Failed to resize node pool '${node_pool_name}' in cluster '${cluster_name}'."
        return 1
    } >&2
}

# Entry point. Expects five positional arguments, all required:
#   $1 project_id, $2 cluster_name, $3 node_pool_name, $4 region, $5 node_count
# The ${N:?} expansions abort with an error if an argument is missing or empty.
main () {
# These read-only locals are visible to gke_node_pool_resize::resize through
# bash dynamic scoping, so their names must match what that function reads.
local -r project_id="${1:?}"
local -r cluster_name="${2:?}"
local -r node_pool_name="${3:?}"
local -r region="${4:?}"
local -r node_count="${5:?}"

gke_node_pool_resize::resize
}

# Forward all script arguments to main.
main "${@}"
9 changes: 9 additions & 0 deletions a3/terraform/modules/cluster/gke-beta/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,15 @@ variable "node_pools" {
}))
}

# Optional per-node-pool target sizes, matched to node pools by list position.
# The default empty list means no resize values are supplied.
# NOTE(review): nothing in this declaration checks that the list is no longer
# than the node pool list -- confirm callers keep the two in step.
variable "resize_node_counts" {
description = <<-EOT
The list of resized node counts for node pools of the GKE cluster.
The resize node count is used in the same order as the node pool, i.e: the first resize node count will be applied for the first node pool and so on.
EOT
type = list(number)
default = []
}

variable "kubernetes_setup_config" {
description = <<-EOT
The configuration for setting up Kubernetes after GKE cluster is created.
Expand Down
3 changes: 2 additions & 1 deletion a3/terraform/modules/cluster/gke/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ No requirements.
| <a name="input_disk_type"></a> [disk\_type](#input\_disk\_type) | Type of the disk attached to each node. The default disk type is 'pd-standard'<br><br>Possible values: `["pd-ssd", "local-ssd", "pd-balanced", "pd-standard"]`<br><br>Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#disk_type), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--disk-type). | `string` | `"pd-ssd"` | no |
| <a name="input_enable_gke_dashboard"></a> [enable\_gke\_dashboard](#input\_enable\_gke\_dashboard) | Flag to enable GPU usage dashboards for the GKE cluster. | `bool` | `true` | no |
| <a name="input_gke_version"></a> [gke\_version](#input\_gke\_version) | The GKE version to be used as the minimum version of the master. The default value for that is latest master version.<br>More details can be found [here](https://cloud.google.com/kubernetes-engine/versioning#specifying_cluster_version)<br><br>Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#name), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--name). | `string` | `null` | no |
| <a name="input_host_maintenance_interval"></a> [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for host\_maintenance\_interval. This enables using stable fleet VM. | `string` | `"PERIODIC"` | no |
| <a name="input_kubernetes_setup_config"></a> [kubernetes\_setup\_config](#input\_kubernetes\_setup\_config) | The configuration for setting up Kubernetes after GKE cluster is created.<br><br>- `enable_kubernetes_setup`: Flag to enable kubernetes setup<br>- `kubernetes_service_account_name`: The KSA (kubernetes service account) name to be used for Pods<br>- `kubernetes_service_account_namespace`: The KSA (kubernetes service account) namespace to be used for Pods<br><br>Related Docs: [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) | <pre>object({<br> enable_kubernetes_setup = bool,<br> kubernetes_service_account_name = string,<br> kubernetes_service_account_namespace = string<br> })</pre> | <pre>{<br> "enable_kubernetes_setup": true,<br> "kubernetes_service_account_name": "aiinfra-gke-sa",<br> "kubernetes_service_account_namespace": "default"<br>}</pre> | no |
| <a name="input_network_existing"></a> [network\_existing](#input\_network\_existing) | Existing network to attach to nic0. Setting to null will create a new network for it. | <pre>object({<br> network_name = string<br> subnetwork_name = string<br> })</pre> | `null` | no |
| <a name="input_node_pools"></a> [node\_pools](#input\_node\_pools) | The list of node pools for the GKE cluster.<br>- `zone`: The zone in which the node pool's nodes should be located. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_locations)<br>- `node_count`: The number of nodes per node pool. This field can be used to update the number of nodes per node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_count)<br>- `enable_compact_placement`: (Optional)Flag to enable compact placement policy to use for the node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#policy_name) | <pre>list(object({<br> zone = string,<br> node_count = number,<br> enable_compact_placement = optional(bool, false)<br> }))</pre> | `[]` | no |
| <a name="input_node_pools"></a> [node\_pools](#input\_node\_pools) | The list of node pools for the GKE cluster.<br>- `zone`: The zone in which the node pool's nodes should be located. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_locations)<br>- `node_count`: The number of nodes per node pool. This field can be used to update the number of nodes per node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_count)<br>- `use_compact_placement_policy`: (Optional)The flag to create and use a superblock level compact placement policy for the instances. Currently only 1 resource policy is supported. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#policy_name) | <pre>list(object({<br> zone = string,<br> node_count = number,<br> use_compact_placement_policy = optional(bool, false)<br> }))</pre> | `[]` | no |
| <a name="input_node_service_account"></a> [node\_service\_account](#input\_node\_service\_account) | The service account to be used by the Node VMs. If not specified, the "default" service account is used.<br><br>Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#nested_node_config), [gcloud](https://cloud.google.com/sdk/gcloud/reference/container/clusters/create#--service-account). | `string` | `null` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | GCP Project ID to which the cluster will be deployed. | `string` | n/a | yes |
| <a name="input_region"></a> [region](#input\_region) | The region in which the cluster master will be created. The cluster will be a regional cluster with multiple masters spread across zones in the region, and with default node locations in those zones as well. | `string` | n/a | yes |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ resource "kubernetes_service_account" "gke-sa" {
}
}

// ref: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers
data "http" "installer_daemonsets" {
for_each = local.installer_daemonsets

Expand Down
17 changes: 14 additions & 3 deletions a3/terraform/modules/cluster/gke/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ module "resource_policy" {
source = "../../common/resource_policy"
for_each = {
for idx, node_pool in var.node_pools : "np-${idx}" => node_pool
if node_pool.enable_compact_placement
if node_pool.use_compact_placement_policy
}
project_id = var.project_id
resource_policy_name = "${var.resource_prefix}-${each.key}"
Expand Down Expand Up @@ -223,6 +223,17 @@ resource "google_container_node_pool" "node-pools" {
"disable-legacy-endpoints" = "true"
}

labels = {
"cloud.google.com/gke-kdump-enabled" = "true"
}

dynamic "host_maintenance_policy" {
for_each = var.host_maintenance_interval != null ? [1] : []
content {
maintenance_interval = var.host_maintenance_interval
}
}

oauth_scopes = local.oauth_scopes
}

Expand All @@ -238,10 +249,10 @@ resource "google_container_node_pool" "node-pools" {
}

dynamic "placement_policy" {
for_each = var.node_pools[count.index].enable_compact_placement ? [1] : []
for_each = var.node_pools[count.index].use_compact_placement_policy ? [1] : []
content {
type = "COMPACT"
policy_name = "${var.resource_prefix}-np-${count.index}"
policy_name = module.resource_policy["np-${count.index}"].resource_name
}
}
Expand Down
21 changes: 17 additions & 4 deletions a3/terraform/modules/cluster/gke/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,30 @@ variable "node_service_account" {
default = null
}

# Frequency of planned host maintenance events for the node VMs.
# Defaults to "PERIODIC", which is the only non-null value the validation
# below accepts.
variable "host_maintenance_interval" {
  description = "Specifies the frequency of planned maintenance events. 'PERIODIC' is the only supported value for host_maintenance_interval. This enables using stable fleet VM."
  type        = string
  default     = "PERIODIC"
  validation {
    # A null value skips the membership check and is accepted as-is.
    condition = var.host_maintenance_interval != null ? contains(
      ["PERIODIC"],
      var.host_maintenance_interval,
    ) : true
    error_message = "'PERIODIC' is the only supported value for host_maintenance_interval."
  }
}

variable "node_pools" {
description = <<-EOT
The list of node pools for the GKE cluster.
- `zone`: The zone in which the node pool's nodes should be located. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_locations)
- `node_count`: The number of nodes per node pool. This field can be used to update the number of nodes per node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#node_count)
- `enable_compact_placement`: (Optional)Flag to enable compact placement policy to use for the node pool. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#policy_name)
- `use_compact_placement_policy`: (Optional)The flag to create and use a superblock level compact placement policy for the instances. Currently only 1 resource policy is supported. Related docs: [terraform](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_node_pool.html#policy_name)
EOT
type = list(object({
zone = string,
node_count = number,
enable_compact_placement = optional(bool, false)
zone = string,
node_count = number,
use_compact_placement_policy = optional(bool, false)
}))
default = []
nullable = false
Expand Down
Loading

0 comments on commit 697eacc

Please sign in to comment.