Skip to content

Commit

Permalink
Merge pull request #216 from GoogleCloudPlatform/infra
Browse files Browse the repository at this point in the history
update infrastructure module
  • Loading branch information
umeshkumhar authored Feb 23, 2024
2 parents ea665e3 + 16de272 commit 96fe068
Show file tree
Hide file tree
Showing 20 changed files with 417 additions and 606 deletions.
2 changes: 1 addition & 1 deletion applications/jupyter/workloads.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ create_service_account = true
gcp_service_account = "jupyter-service-account"

# JupyterHub with IAP
add_auth = true
add_auth = false
brand = "projects/<prj-number>/brands/<prj-number>"
support_email = "<email>"
default_backend_service = "proxy-public"
Expand Down
2 changes: 2 additions & 0 deletions infrastructure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ module "private-gke-standard-cluster" {
gcs_fuse_csi_driver = var.gcs_fuse_csi_driver
deletion_protection = var.deletion_protection
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block

## pools config variables
cpu_pools = var.cpu_pools
Expand Down Expand Up @@ -163,6 +164,7 @@ module "private-gke-autopilot-cluster" {
ip_range_pods = var.ip_range_pods
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection

}
Expand Down
151 changes: 11 additions & 140 deletions infrastructure/platform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -18,144 +18,15 @@ project_id = "<your project ID>"
#### PLATFORM
#######################################################
## network values
create_network = true # set to false to use an existing network
network_name = "ml-network"
subnetwork_name = "ml-subnet"
## required only if creating a new network
subnetwork_cidr = "10.100.0.0/16"
subnetwork_region = "us-central1"
subnetwork_private_access = "true"
subnetwork_description = "GKE subnet"
network_secondary_ranges = {
"ml-subnet" = [
{
range_name = "us-central1-01-gke-01-pods-1"
ip_cidr_range = "192.168.0.0/20"
},
{
range_name = "us-central1-01-gke-01-services-1"
ip_cidr_range = "192.168.48.0/20"
}
]
}
## gke variables
create_cluster = true
private_cluster = true ## Default true. Use false for a public cluster
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "ml-cluster"
kubernetes_version = "1.28"
cluster_regional = true
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
ip_range_pods = "us-central1-01-gke-01-pods-1" # update if using your own network
ip_range_services = "us-central1-01-gke-01-services-1" # update if using your own network
monitoring_enable_managed_prometheus = true
gcs_fuse_csi_driver = true ## enabled by default for autopilot
deletion_protection = false
master_authorized_networks = [{
cidr_block = "0.0.0.0/0"
display_name = "VPC"
}]

## Node pool configurations are ignored for autopilot clusters
cpu_pools = [{
name = "cpu-pool"
machine_type = "n2-standard-8"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
create_service_account = true
preemptible = false
initial_node_count = 1
accelerator_count = 0
}]

enable_gpu = true
gpu_pools = [{
name = "gpu-pool"
machine_type = "n1-standard-16"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
create_service_account = true
preemptible = false
initial_node_count = 1
accelerator_count = 2
accelerator_type = "nvidia-tesla-t4"
gpu_driver_version = "DEFAULT"
}]

enable_tpu = false
tpu_pools = [{
name = "tpu-pool"
machine_type = "ct4p-hightpu-4t"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
max_count = 3
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
create_service_account = true
preemptible = false
initial_node_count = 1
accelerator_count = 2
accelerator_type = "nvidia-tesla-t4"
}]


## pools config variables
all_node_pools_oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/devstorage.read_only",
"https://www.googleapis.com/auth/trace.append",
"https://www.googleapis.com/auth/service.management.readonly",
"https://www.googleapis.com/auth/servicecontrol",
]


cluster_labels = {
"gke-profile" = "ai-on-gke"
}

all_node_pools_labels = {
"gke-profile" = "ai-on-gke"
}

all_node_pools_metadata = {
disable-legacy-endpoints = "true"
}

all_node_pools_tags = ["gke-node", "ai-on-gke"]

create_network = true
network_name = "ml-network"
subnetwork_name = "ml-subnet1"
subnetwork_cidr = "10.100.0.0/16"
subnetwork_region = "us-central1"

## gke variables
private_cluster = false ## true = private cluster, false = public cluster
autopilot_cluster = true ## true = autopilot cluster, false = standard cluster
cluster_name = "ml-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## common variables
# Placeholder: replace with your own GCP project ID before applying.
project_id = "<project-id>"

#######################################################
#### PLATFORM
#######################################################
## network values
# false = use an existing network; set to true to create a new one.
# NOTE(review): presumably network_name/subnetwork_name must then refer to
# resources that already exist in the project — confirm against the module.
create_network = false
network_name = "demo-network"
subnetwork_name = "demo-subnet"

## gke variables
private_cluster = true ## default true; true = private cluster, false = public cluster
autopilot_cluster = true ## true = autopilot cluster, false = standard cluster
cluster_name = "demo-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## common variables
# Placeholder: replace with your own GCP project ID before applying.
project_id = "<project-id>"

#######################################################
#### PLATFORM
#######################################################
## network values
# true = create the network named below; set to false to use an existing network.
create_network = true
network_name = "demo-network"
subnetwork_name = "demo-subnet"
# The subnetwork CIDR and region are required only when creating a new network.
subnetwork_cidr = "10.100.0.0/16"
subnetwork_region = "us-central1"


## gke variables
private_cluster = true ## default true; true = private cluster, false = public cluster
autopilot_cluster = true ## true = autopilot cluster, false = standard cluster
cluster_name = "demo-cluster"
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
Original file line number Diff line number Diff line change
Expand Up @@ -13,51 +13,57 @@
# limitations under the License.

##common variables
project_id = "ai-sandbox-5"
project_id = "<project-id>"

#######################################################
#### PLATFORM
#######################################################
## network values
create_network = true
network_name = "demo-network-4"
subnetwork_name = "demo-subnet-04"
create_network = true
network_name = "ml-network"
subnetwork_name = "ml-subnet"

## required only when provisioning a new network
subnetwork_cidr = "10.100.0.0/16"
subnetwork_region = "us-central1"
subnetwork_private_access = "true"
subnetwork_description = "GKE subnet"
network_secondary_ranges = {
"demo-subnet-04" = [
"ml-subnet" = [
{
range_name = "us-central1-01-gke-01-pods-3"
range_name = "us-central1-01-gke-01-pods-1"
ip_cidr_range = "192.168.0.0/20"
},
{
range_name = "us-central1-01-gke-01-services-3"
range_name = "us-central1-01-gke-01-services-1"
ip_cidr_range = "192.168.48.0/20"
}
]
}

## gke variables
create_cluster = true
private_cluster = false
cluster_name = "demo-cluster-4"
kubernetes_version = "1.27"
private_cluster = true ## Default true. Use false for a public cluster
autopilot_cluster = true # false = standard cluster, true = autopilot cluster
cluster_name = "ml-cluster"
kubernetes_version = "1.28"
cluster_regional = true
cluster_region = "us-central1"
cluster_zones = ["us-central1-a", "us-central1-b", "us-central1-f"]
ip_range_pods = "us-central1-01-gke-01-pods-3" ## name should match with secondary ranges names
ip_range_services = "us-central1-01-gke-01-services-3" ## name should match with secondary ranges names
ip_range_pods = "us-central1-01-gke-01-pods-1"
ip_range_services = "us-central1-01-gke-01-services-1"
monitoring_enable_managed_prometheus = true
gcs_fuse_csi_driver = true ## enabled by default for autopilot
deletion_protection = false
master_authorized_networks = [{
cidr_block = "122.169.15.5/32" ## public IPs should be configured, if authorized network is required
display_name = "Home"
cidr_block = "10.100.0.0/16"
display_name = "VPC"
}]

## Node pool configurations are ignored for autopilot clusters
cpu_pools = [{
name = "cpu-pool"
machine_type = "n1-standard-16"
machine_type = "n2-standard-8"
node_locations = "us-central1-b,us-central1-c"
autoscaling = true
min_count = 1
Expand All @@ -78,7 +84,6 @@ cpu_pools = [{
accelerator_count = 0
}]

## make sure sufficient GPU quota is available in the given region
enable_gpu = true
gpu_pools = [{
name = "gpu-pool"
Expand Down Expand Up @@ -141,6 +146,11 @@ all_node_pools_oauth_scopes = [
"https://www.googleapis.com/auth/servicecontrol",
]


cluster_labels = {
"gke-profile" = "ai-on-gke"
}

all_node_pools_labels = {
"gke-profile" = "ai-on-gke"
}
Expand All @@ -151,4 +161,3 @@ all_node_pools_metadata = {

all_node_pools_tags = ["gke-node", "ai-on-gke"]


Loading

0 comments on commit 96fe068

Please sign in to comment.