From f99f77e50e4328b123dee3bf88c59a3aa97ea387 Mon Sep 17 00:00:00 2001 From: soumyapani <112522451+soumyapani@users.noreply.github.com> Date: Tue, 24 Jan 2023 18:38:33 +0000 Subject: [PATCH 1/6] Release v0.1.0 (#52) (#54) --- cloudbuild-continuous.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cloudbuild-continuous.yaml b/cloudbuild-continuous.yaml index 7e38cd3ab..416468ed2 100644 --- a/cloudbuild-continuous.yaml +++ b/cloudbuild-continuous.yaml @@ -22,6 +22,7 @@ steps: - name: 'gcr.io/cloud-builders/docker' args: [ 'build', '--tag', 'us-docker.pkg.dev/${PROJECT_ID}/cluster-provision-dev/cluster-provision-image:latest', '--tag', 'us-docker.pkg.dev/${PROJECT_ID}/cluster-provision-dev/cluster-provision-image:${SHORT_SHA}', + '--tag', 'us-docker.pkg.dev/${PROJECT_ID}/cluster-provision-dev/cluster-provision-image:${_VERSION}', '--cache-from', 'us-docker.pkg.dev/${PROJECT_ID}/cluster-provision-dev/cluster-provision-image:latest', '-f', 'Dockerfile', '.' ] # Test for default VPC @@ -72,4 +73,7 @@ steps: images: - 'us-docker.pkg.dev/${PROJECT_ID}/cluster-provision-dev/cluster-provision-image' -timeout: 1800s \ No newline at end of file +timeout: 1800s + +substitutions: + _VERSION: 'v0.1.0' \ No newline at end of file From beddeea9268f52242a11c1463cd061986bebeb96 Mon Sep 17 00:00:00 2001 From: soumyapani <112522451+soumyapani@users.noreply.github.com> Date: Wed, 25 Jan 2023 00:50:42 +0000 Subject: [PATCH 2/6] Converging network creation to a single module. 
(#55) --- scripts/_env_var_util.sh | 2 +- tfconfig/main.tf | 56 +++++++------------ .../main.tf | 32 +++++++++-- .../outputs.tf | 36 ++++++------ .../variables.tf | 6 ++ .../versions.tf | 0 tfconfig/modules/vm-instance-group/outputs.tf | 20 ------- tfconfig/variables.tf | 2 +- 8 files changed, 71 insertions(+), 83 deletions(-) rename tfconfig/modules/{multi-nic-network => aiinfra-network}/main.tf (50%) rename tfconfig/modules/{multi-nic-network => aiinfra-network}/outputs.tf (59%) rename tfconfig/modules/{multi-nic-network => aiinfra-network}/variables.tf (90%) rename tfconfig/modules/{multi-nic-network => aiinfra-network}/versions.tf (100%) delete mode 100644 tfconfig/modules/vm-instance-group/outputs.tf diff --git a/scripts/_env_var_util.sh b/scripts/_env_var_util.sh index 609ca98a2..96b512832 100644 --- a/scripts/_env_var_util.sh +++ b/scripts/_env_var_util.sh @@ -215,6 +215,6 @@ _set_terraform_env_var() { fi if [[ -n "$NFS_FILESHARE_LIST" ]]; then - echo "nfs_fileshare_list= \"$NFS_FILESHARE_LIST\"" >> /usr/primary/tf.auto.tfvars + echo "nfs_filestore_list= \"$NFS_FILESHARE_LIST\"" >> /usr/primary/tf.auto.tfvars fi } \ No newline at end of file diff --git a/tfconfig/main.tf b/tfconfig/main.tf index 6fed17ff9..f402e5049 100644 --- a/tfconfig/main.tf +++ b/tfconfig/main.tf @@ -15,32 +15,16 @@ */ locals { - trimmed_net_config = lower(trimspace(var.network_config)) - primary_network = coalesce(one(module.new_primary_network), one(module.multi-nic-network), one(module.default_primary_network)) gcs_mount_arr = compact(split(",", trimspace(var.gcs_mount_list))) - nfs_fileshare_arr = compact(split(",", trimspace(var.nfs_fileshare_list))) -} -module "default_primary_network" { - count = local.trimmed_net_config != "new_network" ? 
1 : 0 - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/pre-existing-vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1" - project_id = var.project_id - region = var.region -} - -module "new_primary_network" { - count = local.trimmed_net_config == "new_network" ? 1 : 0 - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1" - project_id = var.project_id - region = var.region - deployment_name = var.deployment_name + nfs_filestore_arr = compact(split(",", trimspace(var.nfs_filestore_list))) } -module "multi-nic-network" { - count = local.trimmed_net_config == "multi_nic_network" ? 1 : 0 - source = "./modules/multi-nic-network" +module "aiinfra-network" { + source = "./modules/aiinfra-network" project_id = var.project_id region = var.region deployment_name = var.deployment_name + network_config = var.network_config } module "gcsfuse_mount" { @@ -52,22 +36,21 @@ module "gcsfuse_mount" { local_mount = split(":", trimspace(local.gcs_mount_arr[count.index]))[1] } -module "nfs_fileshare" { +module "nfs_filestore" { source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore//?ref=c1f4a44" - count = length(local.nfs_fileshare_arr) + count = length(local.nfs_filestore_arr) project_id = var.project_id region = var.region zone = var.zone deployment_name = var.deployment_name - network_name = local.primary_network.network_name + network_name = module.aiinfra-network.network_name filestore_share_name = "nfsshare_${count.index}" - labels = merge(var.labels, { ghpc_role = "aiinfra-fileshare",}) - local_mount = split(":", trimspace(local.nfs_fileshare_arr[count.index]))[0] - filestore_tier = split(":", trimspace(local.nfs_fileshare_arr[count.index]))[1] - size_gb = length(split(":", trimspace(local.nfs_fileshare_arr[count.index]))) > 2 ? 
split(":", trimspace(local.nfs_fileshare_arr[count.index]))[2] : 2560 + labels = merge(var.labels, { ghpc_role = "aiinfra-filestore",}) + local_mount = split(":", trimspace(local.nfs_filestore_arr[count.index]))[0] + filestore_tier = split(":", trimspace(local.nfs_filestore_arr[count.index]))[1] + size_gb = length(split(":", trimspace(local.nfs_filestore_arr[count.index]))) > 2 ? split(":", trimspace(local.nfs_filestore_arr[count.index]))[2] : 2560 depends_on = [ - local.primary_network, - module.multi-nic-network + module.aiinfra-network ] } @@ -83,17 +66,17 @@ __REPLACE_STARTUP_SCRIPT__ }] , module.gcsfuse_mount[*].client_install_runner , module.gcsfuse_mount[*].mount_runner - , module.nfs_fileshare[*].install_nfs_client_runner - , module.nfs_fileshare[*].mount_runner) + , module.nfs_filestore[*].install_nfs_client_runner + , module.nfs_filestore[*].mount_runner) labels = merge(var.labels, { ghpc_role = "scripts",}) deployment_name = var.deployment_name gcs_bucket_path = var.gcs_bucket_path region = var.region } -module "compute-vm-1" { +module "aiinfra-mig" { source = "./modules/vm-instance-group" - subnetwork_self_link = local.primary_network.subnetwork_self_link + subnetwork_self_link = module.aiinfra-network.subnetwork_self_link service_account = { email = var.service_account scopes = ["cloud-platform"] @@ -102,7 +85,7 @@ module "compute-vm-1" { project_id = var.project_id disk_size_gb = var.disk_size_gb disk_type = var.disk_type - network_self_link = local.primary_network.network_self_link + network_self_link = module.aiinfra-network.network_self_link placement_policy = { availability_domain_count = null collocation = "COLLOCATED" @@ -122,10 +105,9 @@ module "compute-vm-1" { type = var.accelerator_type }] deployment_name = var.deployment_name - network_interfaces = local.trimmed_net_config == "multi_nic_network" ? 
module.multi-nic-network[0].multi_network_interface : [] + network_interfaces = module.aiinfra-network.network_interfaces depends_on = [ - local.primary_network, - module.multi-nic-network + module.aiinfra-network ] } diff --git a/tfconfig/modules/multi-nic-network/main.tf b/tfconfig/modules/aiinfra-network/main.tf similarity index 50% rename from tfconfig/modules/multi-nic-network/main.tf rename to tfconfig/modules/aiinfra-network/main.tf index 81a830672..353c35544 100644 --- a/tfconfig/modules/multi-nic-network/main.tf +++ b/tfconfig/modules/aiinfra-network/main.tf @@ -14,17 +14,37 @@ * limitations under the License. */ -module "network1" { +locals { + trimmed_net_config = lower(trimspace(var.network_config)) + primary_network = coalesce(one(module.new_vpc), try(module.multinic_vpc[0], null), one(module.default_vpc)) +} + +module "default_vpc" { + count = local.trimmed_net_config != "new_network" ? 1 : 0 + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/pre-existing-vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1" + project_id = var.project_id + region = var.region +} + +module "new_vpc" { + count = local.trimmed_net_config == "new_network" ? 1 : 0 + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1" + project_id = var.project_id + region = var.region + deployment_name = var.deployment_name +} + +module "multinic_vpc" { source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc//?ref=866de32de9c3cf7ea8fa20f377d62aa80a07b8b3" - count = var.nic_count + count = local.trimmed_net_config == "multi_nic_network" ? 
var.nic_count : 0 network_address_range = "10.${count.index}.0.0/16" subnetworks = [{ new_bits = 8 subnet_name = "primary-subnet-${count.index}" subnet_region = var.region }] - region = var.region - deployment_name = var.deployment_name - project_id = var.project_id - network_name = "${var.deployment_name}-net-${count.index}" + region = var.region + deployment_name = var.deployment_name + project_id = var.project_id + network_name = "${var.deployment_name}-net-${count.index}" } diff --git a/tfconfig/modules/multi-nic-network/outputs.tf b/tfconfig/modules/aiinfra-network/outputs.tf similarity index 59% rename from tfconfig/modules/multi-nic-network/outputs.tf rename to tfconfig/modules/aiinfra-network/outputs.tf index 59e752e24..76daeab34 100644 --- a/tfconfig/modules/multi-nic-network/outputs.tf +++ b/tfconfig/modules/aiinfra-network/outputs.tf @@ -16,34 +16,34 @@ output "network_name" { description = "The name of the primary network of all the VPCs created." - value = module.network1[0].network_name + value = local.primary_network.network_name } output "subnetwork_self_link" { description = "The subnetwork_self_link of the primary network of all the VPCs created." - value = module.network1[0].subnetwork_self_link + value = local.primary_network.subnetwork_self_link } output "network_self_link" { description = "The network_self_link of the primary network of all the VPCs created." - value = module.network1[0].network_self_link + value = local.primary_network.network_self_link } -output "multi_network_interface" { +output "network_interfaces" { description = "The network interface that includes all VPC subnets." 
- value = [for idx in range(var.nic_count) : { - access_config = [] - alias_ip_range = [] - ipv6_access_config = [] - network = null - network_ip = null - queue_count = null - stack_type = null - nic_type = "GVNIC" - subnetwork = module.network1[idx].subnetwork_self_link - subnetwork_project = var.project_id + value = local.trimmed_net_config == "multi_nic_network" ? [for idx in range(var.nic_count) : { + access_config = [] + alias_ip_range = [] + ipv6_access_config = [] + network = null + network_ip = null + queue_count = null + stack_type = null + nic_type = "GVNIC" + subnetwork = module.multinic_vpc[idx].subnetwork_self_link + subnetwork_project = var.project_id } - ] + ] : [] - depends_on = [module.network1] -} \ No newline at end of file + depends_on = [module.multinic_vpc] +} diff --git a/tfconfig/modules/multi-nic-network/variables.tf b/tfconfig/modules/aiinfra-network/variables.tf similarity index 90% rename from tfconfig/modules/multi-nic-network/variables.tf rename to tfconfig/modules/aiinfra-network/variables.tf index 4590d8227..2c9178ee5 100644 --- a/tfconfig/modules/multi-nic-network/variables.tf +++ b/tfconfig/modules/aiinfra-network/variables.tf @@ -33,4 +33,10 @@ variable "nic_count" { description = "The NIC count" type = number default = 5 +} + +variable "network_config" { + description = "" + type = string + default = "default_network" } \ No newline at end of file diff --git a/tfconfig/modules/multi-nic-network/versions.tf b/tfconfig/modules/aiinfra-network/versions.tf similarity index 100% rename from tfconfig/modules/multi-nic-network/versions.tf rename to tfconfig/modules/aiinfra-network/versions.tf diff --git a/tfconfig/modules/vm-instance-group/outputs.tf b/tfconfig/modules/vm-instance-group/outputs.tf deleted file mode 100644 index d28d31c3b..000000000 --- a/tfconfig/modules/vm-instance-group/outputs.tf +++ /dev/null @@ -1,20 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - 
* you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "mig-id" { - description = "Id of the mig created" - value = google_compute_instance_group_manager.mig.self_link -} diff --git a/tfconfig/variables.tf b/tfconfig/variables.tf index 2d917660a..217fcf6b6 100644 --- a/tfconfig/variables.tf +++ b/tfconfig/variables.tf @@ -105,7 +105,7 @@ variable "gcs_mount_list" { default = "" } -variable "nfs_fileshare_list" { +variable "nfs_filestore_list" { description = "" type = string default = "" From 9e4bf4ef8ca3b25f4abd21b7b0e5e7e92d4dbf46 Mon Sep 17 00:00:00 2001 From: soumyapani <112522451+soumyapani@users.noreply.github.com> Date: Wed, 25 Jan 2023 23:50:36 +0000 Subject: [PATCH 3/6] Update README.md (#56) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 71548fdcf..32e8ba6db 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ Since the resource state is stored outside of the container, the GPU cluster lif ## Instructions 1. gcloud auth application-default login. -2. gcloud auth configure-docker us-central1-docker.pkg.dev +2. gcloud auth configure-docker us-docker.pkg.dev 3. ***[`OPTIONAL - if project not set already`]*** gcloud config set account supercomputer-testing 4. Create env.list file. The sample env.list can be found [here](#sample-config-file-that-the-user-provides). 5. 
***[`SIMPLE CREATE`]*** From d4cc38b7c4ccf445905ff6349bd7327e6726a9af Mon Sep 17 00:00:00 2001 From: soumyapani <112522451+soumyapani@users.noreply.github.com> Date: Fri, 3 Feb 2023 22:21:37 +0000 Subject: [PATCH 4/6] Merging main with develop (#57) From 30b0a8b5758ee36416ce8758572dd5a6b284cc0b Mon Sep 17 00:00:00 2001 From: soumyapani <112522451+soumyapani@users.noreply.github.com> Date: Wed, 8 Feb 2023 00:47:13 +0000 Subject: [PATCH 5/6] Supporting minimal terraform verbosity for running with LLM pipeline. (#61) Co-authored-by: Dmitry Kakurin --- README.md | 3 ++- scripts/_terraform_util.sh | 30 ++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 32e8ba6db..cce746981 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ The optional parameters are: - The `` cannot be empty. The supported values are `BASIC_HDD`,`BASIC_SSD`,`HIGH_SCALE_SSD` and `ENTERPRISE`. - The `` can be empty and the default value is 2560 GB (2.5 TB). 18. ***SHOW_PROXY_URL***. This controls if the Jupyter notebook proxy url is retrieved for the cluster or not. The default value is yes. If this is present and set to no, then connection information is not collected. The supported values are: yes, no. -19. ***NETWORK_CONFIG***. This controls the VPC type to be used for the MIG. The supported values are default_network, new_network and multi_nic_network. The dault value is default_network. The behaviour is +19. ***MINIMIZE_TERRAFORM_LOGGING***. This controls the verbosity of terraform logs. When any value is set for this parameter, the terraform output is redirected to a local file and not printed to stdout. The log file is then uploaded to the storage bucket. Any value can be set for this parameter, e.g.: yes, true. +20. ***NETWORK_CONFIG***. This controls the VPC type to be used for the MIG. The supported values are default_network, new_network and multi_nic_network. The default value is default_network.
The behaviour is - __default_network__: MIG uses the default VPC in the project. - __new_network__: A new VPC is created for the MIG. - __multi_nic_network__: New VPCs are created and used by all the VMs in the MIG. By default 5 new VPCs are created and 5 NICs are used for the MIG but that value is configurable. diff --git a/scripts/_terraform_util.sh b/scripts/_terraform_util.sh index 0825e01a6..62befaeac 100644 --- a/scripts/_terraform_util.sh +++ b/scripts/_terraform_util.sh @@ -19,7 +19,15 @@ # _terraform_setup() { apply_ret=0 - terraform -chdir=/usr/primary apply -input=false -auto-approve || apply_ret=$? + + # change terraform verbosity based on MINIMIZE_TERRAFORM_LOGGING environment variable. + if [[ -n "$MINIMIZE_TERRAFORM_LOGGING" ]]; then + echo "Redirecting 'terraform apply' output to $TERRAFORM_LOG_PATH." + terraform -chdir=/usr/primary apply -input=false -auto-approve > $TERRAFORM_LOG_PATH || apply_ret=$? + else + terraform -chdir=/usr/primary apply -input=false -auto-approve || apply_ret=$? + fi + if [ $apply_ret -eq 0 ]; then echo "Terraform apply finished successfully." _Display_connection_info @@ -44,7 +52,7 @@ _terraform_setup() { # method to display jupyter notebook connection endpoint. # _Display_connection_info() { - if [[ ! -z "$SHOW_PROXY_URL" && "${SHOW_PROXY_URL,,}" == "no" ]]; then + if [[ -n "$SHOW_PROXY_URL" && "${SHOW_PROXY_URL,,}" == "no" ]]; then echo "Not checking for proxy_url information." else for vm in $(gcloud compute instance-groups list-instances $NAME_PREFIX-mig --zone $ZONE --format="value(NAME)"); @@ -78,7 +86,15 @@ _terraform_cleanup() { export IS_CLEANUP_NEEDED="no" echo "Calling terraform destroy..." destroy_ret=0 - terraform -chdir=/usr/primary destroy -input=false -auto-approve || destroy_ret=$? + + # change terraform verbosity based on MINIMIZE_TERRAFORM_LOGGING environment variable. + if [[ -n "$MINIMIZE_TERRAFORM_LOGGING" ]]; then + echo "Redirecting 'terraform destroy' output to $TERRAFORM_LOG_PATH." 
+ terraform -chdir=/usr/primary destroy -input=false -auto-approve > $TERRAFORM_LOG_PATH || destroy_ret=$? + else + terraform -chdir=/usr/primary destroy -input=false -auto-approve || destroy_ret=$? + fi + del_state_ret=0 if [ $destroy_ret -eq 0 ]; then echo "Successfully destroyed resources. Cleaning up the terraform state." @@ -94,10 +110,11 @@ _terraform_cleanup() { # method to perform terraform action to create or destroy cluster # _perform_terraform_action() { + export TERRAFORM_LOG_PATH=/usr/terraformlog.txt if [[ "${ACTION,,}" == "create" ]]; then echo "Uploading environment variables to gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH" printenv >> /usr/$NAME_PREFIX-env.list - gsutil cp /usr/$NAME_PREFIX-env.list gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH + gsutil -m cp /usr/$NAME_PREFIX-env.list gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH/$NAME_PREFIX-env.list echo "Parameter file location: gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH/$NAME_PREFIX-env.list" >> /usr/info.txt echo "Creating cluster..." terraform --version @@ -134,6 +151,11 @@ _perform_terraform_action() { else echo "Action $ACTION is not supported..." fi + + if [ -f "$TERRAFORM_LOG_PATH" ]; then + echo -e "${GREEN}Copying terraform output file from $TERRAFORM_LOG_PATH ${NOC}" + gsutil -m cp $TERRAFORM_LOG_PATH gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH/$NAME_PREFIX-terraform.log + fi } # From df43ddeb19c90b93cef56c10d8cc2a2b2dd44684 Mon Sep 17 00:00:00 2001 From: soumyapani <112522451+soumyapani@users.noreply.github.com> Date: Wed, 8 Feb 2023 11:25:01 -0800 Subject: [PATCH 6/6] Release v0.2.0 --- cloudbuild-continuous.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild-continuous.yaml b/cloudbuild-continuous.yaml index 416468ed2..e81d9bb70 100644 --- a/cloudbuild-continuous.yaml +++ b/cloudbuild-continuous.yaml @@ -76,4 +76,4 @@ images: timeout: 1800s substitutions: - _VERSION: 'v0.1.0' \ No newline at end of file + _VERSION: 'v0.2.0' \ No newline at end of file