Release v0.2.0 (#62)
soumyapani committed Feb 8, 2023
2 parents 7e9f385 + df43dde commit 44dd63d
Showing 11 changed files with 101 additions and 90 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -34,7 +34,8 @@ The optional parameters are:
- The `<NFS fileshare type>` cannot be empty. The supported values are `BASIC_HDD`, `BASIC_SSD`, `HIGH_SCALE_SSD`, and `ENTERPRISE`.
- The `<NFS fileshare size in GB>` can be empty; the default value is 2560 GB (2.5 TB).
18. ***SHOW_PROXY_URL***. This controls whether the Jupyter notebook proxy URL is retrieved for the cluster. The default value is yes. If this is present and set to no, connection information is not collected. The supported values are: yes, no.
19. ***NETWORK_CONFIG***. This controls the VPC type to be used for the MIG. The supported values are default_network, new_network and multi_nic_network. The default value is default_network. The behaviour is
19. ***MINIMIZE_TERRAFORM_LOGGING***. This controls the verbosity of terraform logs. When any value is set for this parameter, the terraform output is redirected to a local file instead of being printed on stderr. The log file is then uploaded to the Cloud Storage bucket. Any value can be set for this parameter, e.g.: yes, true.
20. ***NETWORK_CONFIG***. This controls the VPC type to be used for the MIG. The supported values are default_network, new_network and multi_nic_network. The default value is default_network. The behaviour is
- __default_network__: MIG uses the default VPC in the project.
- __new_network__: A new VPC is created for the MIG.
- __multi_nic_network__: New VPCs are created and used by all the VMs in the MIG. By default, 5 new VPCs are created and 5 NICs are used for the MIG, but that value is configurable. A sample `env.list` snippet covering these parameters follows this list.
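For illustration, here is a minimal `env.list` fragment exercising these parameters. The values are hypothetical, and the NFS entry follows the `<local mount>:<fileshare type>[:<size in GB>]` layout that the Terraform configuration parses:

```sh
# Hypothetical env.list fragment -- illustrative values only.
# SHOW_PROXY_URL: set to no to skip the Jupyter notebook proxy URL lookup.
SHOW_PROXY_URL=no
# MINIMIZE_TERRAFORM_LOGGING: setting any value redirects terraform output to a log file.
MINIMIZE_TERRAFORM_LOGGING=yes
# NETWORK_CONFIG: one of default_network, new_network, multi_nic_network.
NETWORK_CONFIG=multi_nic_network
# NFS_FILESHARE_LIST: comma-separated <local mount>:<fileshare type>[:<size in GB>] entries;
# the size may be omitted and defaults to 2560 GB.
NFS_FILESHARE_LIST=/mnt/nfs0:BASIC_HDD:2560,/mnt/nfs1:BASIC_SSD
```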
@@ -122,7 +123,7 @@ Since the resource state is stored outside of the container, the GPU cluster lifecycle

## Instructions
1. gcloud auth application-default login.
2. gcloud auth configure-docker us-central1-docker.pkg.dev
2. gcloud auth configure-docker us-docker.pkg.dev
2. ***[`OPTIONAL - if project not set already`]*** gcloud config set project supercomputer-testing
4. Create env.list file. The sample env.list can be found [here](#sample-config-file-that-the-user-provides).
5. ***[`SIMPLE CREATE`]*** (a hypothetical invocation sketch is shown after this list)
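As a purely hypothetical sketch of a simple create run: the image path, the credential mount, and passing `ACTION` on the command line are assumptions for illustration, not the repository's documented command. `ACTION=create` is what `_perform_terraform_action()` checks for and may equally be set inside env.list; the credential mount is just one common way to hand the application-default credentials from step 1 into a container.

```sh
# Hypothetical invocation -- the image path below is a placeholder, not the published image.
docker run --env-file ./env.list -e ACTION=create \
  -v "$HOME/.config/gcloud:/root/.config/gcloud" \
  us-docker.pkg.dev/<project>/<repo>/<cluster-image>:v0.2.0
```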
2 changes: 1 addition & 1 deletion cloudbuild-continuous.yaml
@@ -76,4 +76,4 @@ images:
timeout: 1800s

substitutions:
_VERSION: 'v0.1.0'
_VERSION: 'v0.2.0'
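For context, `_VERSION` is a user-defined Cloud Build substitution, so the bumped default can still be overridden at submit time. A hedged example of a manual submission (the repository's actual triggers may pin the value differently):

```sh
# Hypothetical manual build -- triggers in this repo may set _VERSION themselves.
gcloud builds submit --config=cloudbuild-continuous.yaml --substitutions=_VERSION=v0.2.0 .
```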
2 changes: 1 addition & 1 deletion scripts/_env_var_util.sh
@@ -215,6 +215,6 @@ _set_terraform_env_var() {
fi

if [[ -n "$NFS_FILESHARE_LIST" ]]; then
echo "nfs_fileshare_list= \"$NFS_FILESHARE_LIST\"" >> /usr/primary/tf.auto.tfvars
echo "nfs_filestore_list= \"$NFS_FILESHARE_LIST\"" >> /usr/primary/tf.auto.tfvars
fi
}
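The rename keeps the user-facing `NFS_FILESHARE_LIST` environment variable but writes it into `tf.auto.tfvars` under the `nfs_filestore_list` name that the updated Terraform configuration expects. A hedged sketch of that flow, with made-up values:

```sh
#!/usr/bin/env bash
# Hypothetical end-to-end illustration (illustrative values, not repo code).
# The container receives the user-facing variable from env.list ...
export NFS_FILESHARE_LIST="/mnt/nfs0:BASIC_HDD:2560,/mnt/nfs1:BASIC_SSD"

# ... and _set_terraform_env_var() appends it under the renamed Terraform variable,
# producing a tfvars line equivalent to:
#   nfs_filestore_list= "/mnt/nfs0:BASIC_HDD:2560,/mnt/nfs1:BASIC_SSD"
echo "nfs_filestore_list= \"$NFS_FILESHARE_LIST\"" >> /usr/primary/tf.auto.tfvars

# tfconfig/main.tf then splits the comma-separated string into one filestore
# instance per <local mount>:<tier>[:<size GB>] entry.
```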
30 changes: 26 additions & 4 deletions scripts/_terraform_util.sh
@@ -19,7 +19,15 @@
#
_terraform_setup() {
apply_ret=0
terraform -chdir=/usr/primary apply -input=false -auto-approve || apply_ret=$?

# change terraform verbosity based on MINIMIZE_TERRAFORM_LOGGING environment variable.
if [[ -n "$MINIMIZE_TERRAFORM_LOGGING" ]]; then
echo "Redirecting 'terraform apply' output to $TERRAFORM_LOG_PATH."
terraform -chdir=/usr/primary apply -input=false -auto-approve > $TERRAFORM_LOG_PATH || apply_ret=$?
else
terraform -chdir=/usr/primary apply -input=false -auto-approve || apply_ret=$?
fi

if [ $apply_ret -eq 0 ]; then
echo "Terraform apply finished successfully."
_Display_connection_info
@@ -44,7 +52,7 @@ _terraform_setup() {
# method to display jupyter notebook connection endpoint.
#
_Display_connection_info() {
if [[ ! -z "$SHOW_PROXY_URL" && "${SHOW_PROXY_URL,,}" == "no" ]]; then
if [[ -n "$SHOW_PROXY_URL" && "${SHOW_PROXY_URL,,}" == "no" ]]; then
echo "Not checking for proxy_url information."
else
for vm in $(gcloud compute instance-groups list-instances $NAME_PREFIX-mig --zone $ZONE --format="value(NAME)");
@@ -78,7 +86,15 @@ _terraform_cleanup() {
export IS_CLEANUP_NEEDED="no"
echo "Calling terraform destroy..."
destroy_ret=0
terraform -chdir=/usr/primary destroy -input=false -auto-approve || destroy_ret=$?

# change terraform verbosity based on MINIMIZE_TERRAFORM_LOGGING environment variable.
if [[ -n "$MINIMIZE_TERRAFORM_LOGGING" ]]; then
echo "Redirecting 'terraform destroy' output to $TERRAFORM_LOG_PATH."
terraform -chdir=/usr/primary destroy -input=false -auto-approve > $TERRAFORM_LOG_PATH || destroy_ret=$?
else
terraform -chdir=/usr/primary destroy -input=false -auto-approve || destroy_ret=$?
fi

del_state_ret=0
if [ $destroy_ret -eq 0 ]; then
echo "Successfully destroyed resources. Cleaning up the terraform state."
@@ -94,10 +110,11 @@ _terraform_cleanup() {
# method to perform terraform action to create or destroy cluster
#
_perform_terraform_action() {
export TERRAFORM_LOG_PATH=/usr/terraformlog.txt
if [[ "${ACTION,,}" == "create" ]]; then
echo "Uploading environment variables to gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH"
printenv >> /usr/$NAME_PREFIX-env.list
gsutil cp /usr/$NAME_PREFIX-env.list gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH
gsutil -m cp /usr/$NAME_PREFIX-env.list gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH/$NAME_PREFIX-env.list
echo "Parameter file location: gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH/$NAME_PREFIX-env.list" >> /usr/info.txt
echo "Creating cluster..."
terraform --version
@@ -134,6 +151,11 @@ _perform_terraform_action() {
else
echo "Action $ACTION is not supported..."
fi

if [ -f "$TERRAFORM_LOG_PATH" ]; then
echo -e "${GREEN}Copying terraform output file from $TERRAFORM_LOG_PATH ${NOC}"
gsutil -m cp $TERRAFORM_LOG_PATH gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH/$NAME_PREFIX-terraform.log
fi
}

#
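The `MINIMIZE_TERRAFORM_LOGGING` branches added above follow the same pattern for both `apply` and `destroy`: redirect terraform's stdout to `$TERRAFORM_LOG_PATH` when the variable is set, then upload the log to the deployment's bucket path if it exists. A condensed, hedged sketch of that pattern (the `run_terraform` helper is illustrative and not part of the repository):

```sh
#!/usr/bin/env bash
# Hypothetical condensed form of the logging pattern used in _terraform_util.sh.
run_terraform() {
  local subcommand="$1"   # "apply" or "destroy"
  local ret=0
  if [[ -n "$MINIMIZE_TERRAFORM_LOGGING" ]]; then
    echo "Redirecting 'terraform $subcommand' output to $TERRAFORM_LOG_PATH."
    terraform -chdir=/usr/primary "$subcommand" -input=false -auto-approve \
      > "$TERRAFORM_LOG_PATH" || ret=$?
  else
    terraform -chdir=/usr/primary "$subcommand" -input=false -auto-approve || ret=$?
  fi

  # The log file, if produced, is copied next to the uploaded env.list.
  if [[ -f "$TERRAFORM_LOG_PATH" ]]; then
    gsutil -m cp "$TERRAFORM_LOG_PATH" \
      "gs://$TF_BUCKET_NAME/$TF_DEPLOYMENT_PATH/$NAME_PREFIX-terraform.log"
  fi
  return "$ret"
}
```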
56 changes: 19 additions & 37 deletions tfconfig/main.tf
@@ -15,32 +15,16 @@
*/

locals {
trimmed_net_config = lower(trimspace(var.network_config))
primary_network = coalesce(one(module.new_primary_network), one(module.multi-nic-network), one(module.default_primary_network))
gcs_mount_arr = compact(split(",", trimspace(var.gcs_mount_list)))
nfs_fileshare_arr = compact(split(",", trimspace(var.nfs_fileshare_list)))
}
module "default_primary_network" {
count = local.trimmed_net_config != "new_network" ? 1 : 0
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/pre-existing-vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1"
project_id = var.project_id
region = var.region
}

module "new_primary_network" {
count = local.trimmed_net_config == "new_network" ? 1 : 0
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1"
project_id = var.project_id
region = var.region
deployment_name = var.deployment_name
nfs_filestore_arr = compact(split(",", trimspace(var.nfs_filestore_list)))
}

module "multi-nic-network" {
count = local.trimmed_net_config == "multi_nic_network" ? 1 : 0
source = "./modules/multi-nic-network"
module "aiinfra-network" {
source = "./modules/aiinfra-network"
project_id = var.project_id
region = var.region
deployment_name = var.deployment_name
network_config = var.network_config
}

module "gcsfuse_mount" {
@@ -52,22 +36,21 @@ module "gcsfuse_mount" {
local_mount = split(":", trimspace(local.gcs_mount_arr[count.index]))[1]
}

module "nfs_fileshare" {
module "nfs_filestore" {
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore//?ref=c1f4a44"
count = length(local.nfs_fileshare_arr)
count = length(local.nfs_filestore_arr)
project_id = var.project_id
region = var.region
zone = var.zone
deployment_name = var.deployment_name
network_name = local.primary_network.network_name
network_name = module.aiinfra-network.network_name
filestore_share_name = "nfsshare_${count.index}"
labels = merge(var.labels, { ghpc_role = "aiinfra-fileshare",})
local_mount = split(":", trimspace(local.nfs_fileshare_arr[count.index]))[0]
filestore_tier = split(":", trimspace(local.nfs_fileshare_arr[count.index]))[1]
size_gb = length(split(":", trimspace(local.nfs_fileshare_arr[count.index]))) > 2 ? split(":", trimspace(local.nfs_fileshare_arr[count.index]))[2] : 2560
labels = merge(var.labels, { ghpc_role = "aiinfra-filestore",})
local_mount = split(":", trimspace(local.nfs_filestore_arr[count.index]))[0]
filestore_tier = split(":", trimspace(local.nfs_filestore_arr[count.index]))[1]
size_gb = length(split(":", trimspace(local.nfs_filestore_arr[count.index]))) > 2 ? split(":", trimspace(local.nfs_filestore_arr[count.index]))[2] : 2560
depends_on = [
local.primary_network,
module.multi-nic-network
module.aiinfra-network
]
}

@@ -83,17 +66,17 @@ __REPLACE_STARTUP_SCRIPT__
}]
, module.gcsfuse_mount[*].client_install_runner
, module.gcsfuse_mount[*].mount_runner
, module.nfs_fileshare[*].install_nfs_client_runner
, module.nfs_fileshare[*].mount_runner)
, module.nfs_filestore[*].install_nfs_client_runner
, module.nfs_filestore[*].mount_runner)
labels = merge(var.labels, { ghpc_role = "scripts",})
deployment_name = var.deployment_name
gcs_bucket_path = var.gcs_bucket_path
region = var.region
}

module "compute-vm-1" {
module "aiinfra-mig" {
source = "./modules/vm-instance-group"
subnetwork_self_link = local.primary_network.subnetwork_self_link
subnetwork_self_link = module.aiinfra-network.subnetwork_self_link
service_account = {
email = var.service_account
scopes = ["cloud-platform"]
@@ -102,7 +85,7 @@
project_id = var.project_id
disk_size_gb = var.disk_size_gb
disk_type = var.disk_type
network_self_link = local.primary_network.network_self_link
network_self_link = module.aiinfra-network.network_self_link
placement_policy = {
availability_domain_count = null
collocation = "COLLOCATED"
@@ -122,10 +105,9 @@
type = var.accelerator_type
}]
deployment_name = var.deployment_name
network_interfaces = local.trimmed_net_config == "multi_nic_network" ? module.multi-nic-network[0].multi_network_interface : []
network_interfaces = module.aiinfra-network.network_interfaces
depends_on = [
local.primary_network,
module.multi-nic-network
module.aiinfra-network
]
}

@@ -14,17 +14,37 @@
* limitations under the License.
*/

module "network1" {
locals {
trimmed_net_config = lower(trimspace(var.network_config))
primary_network = coalesce(one(module.new_vpc), try(module.multinic_vpc[0], null), one(module.default_vpc))
}

module "default_vpc" {
count = local.trimmed_net_config != "new_network" ? 1 : 0
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/pre-existing-vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1"
project_id = var.project_id
region = var.region
}

module "new_vpc" {
count = local.trimmed_net_config == "new_network" ? 1 : 0
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc//?ref=c1f4a44d92e775baa8c48aab6ae28cf9aee932a1"
project_id = var.project_id
region = var.region
deployment_name = var.deployment_name
}

module "multinic_vpc" {
source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc//?ref=866de32de9c3cf7ea8fa20f377d62aa80a07b8b3"
count = var.nic_count
count = local.trimmed_net_config == "multi_nic_network" ? var.nic_count : 0
network_address_range = "10.${count.index}.0.0/16"
subnetworks = [{
new_bits = 8
subnet_name = "primary-subnet-${count.index}"
subnet_region = var.region
}]
region = var.region
deployment_name = var.deployment_name
project_id = var.project_id
network_name = "${var.deployment_name}-net-${count.index}"
region = var.region
deployment_name = var.deployment_name
project_id = var.project_id
network_name = "${var.deployment_name}-net-${count.index}"
}
@@ -16,34 +16,34 @@

output "network_name" {
description = "The name of the primary network of all the VPCs created."
value = module.network1[0].network_name
value = local.primary_network.network_name
}

output "subnetwork_self_link" {
description = "The subnetwork_self_link of the primary network of all the VPCs created."
value = module.network1[0].subnetwork_self_link
value = local.primary_network.subnetwork_self_link
}

output "network_self_link" {
description = "The network_self_link of the primary network of all the VPCs created."
value = module.network1[0].network_self_link
value = local.primary_network.network_self_link
}

output "multi_network_interface" {
output "network_interfaces" {
description = "The network interface that includes all VPC subnets."
value = [for idx in range(var.nic_count) : {
access_config = []
alias_ip_range = []
ipv6_access_config = []
network = null
network_ip = null
queue_count = null
stack_type = null
nic_type = "GVNIC"
subnetwork = module.network1[idx].subnetwork_self_link
subnetwork_project = var.project_id
value = local.trimmed_net_config == "multi_nic_network" ? [for idx in range(var.nic_count) : {
access_config = []
alias_ip_range = []
ipv6_access_config = []
network = null
network_ip = null
queue_count = null
stack_type = null
nic_type = "GVNIC"
subnetwork = module.multinic_vpc[idx].subnetwork_self_link
subnetwork_project = var.project_id
}
]
] : []

depends_on = [module.network1]
}
depends_on = [module.multinic_vpc]
}
@@ -33,4 +33,10 @@ variable "nic_count" {
description = "The NIC count"
type = number
default = 5
}

variable "network_config" {
description = ""
type = string
default = "default_network"
}
20 changes: 0 additions & 20 deletions tfconfig/modules/vm-instance-group/outputs.tf

This file was deleted.

2 changes: 1 addition & 1 deletion tfconfig/variables.tf
@@ -105,7 +105,7 @@ variable "gcs_mount_list" {
default = ""
}

variable "nfs_fileshare_list" {
variable "nfs_filestore_list" {
description = ""
type = string
default = ""