Merge pull request GoogleCloudPlatform#3475 from GoogleCloudPlatform/…

…a3ultra-preview Release v1.44.1: A3 Ultra (a3-ultragpu-8g) blueprints
tpdownes · Dec 30, 2024 · 346d015 · 346d015
2 parents 6a19416 + 7cd0a0e
commit 346d015
Show file tree

Hide file tree

Showing 26 changed files with 2,978 additions and 4 deletions.
diff --git a/examples/gke-a3-ultragpu/README.md b/examples/gke-a3-ultragpu/README.md
@@ -0,0 +1,4 @@
+Refer to [AI Hypercomputer Documentation](https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute#create-cluster) for instructions.
+
+If you are unable to access these documents, please contact your
+[Technical Account Manager (TAM)](https://cloud.google.com/tam).
diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml
@@ -0,0 +1,30 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+# Default Terraform backend for this deployment: type "gcs".
+# Replace the BUCKET_NAME placeholder with the bucket to use.
+terraform_backend_defaults:
+  type: gcs
+  configuration:
+    bucket: BUCKET_NAME
+
+# Deployment-specific values. Each upper-case token (PROJECT_ID, COMPUTE_REGION,
+# ...) is a placeholder to replace with a real value.
+vars:
+  deployment_name: gke-a3-ultra
+  project_id: PROJECT_ID
+  region: COMPUTE_REGION
+  zone: COMPUTE_ZONE
+  # CIDR block, written as <IP_ADDRESS>/<SUFFIX>.
+  authorized_cidr: <IP_ADDRESS>/<SUFFIX>
+  # To use the reservation without targeting a specific block, omit the
+  # /reservationBlocks/BLOCK_NAME suffix:
+  # extended_reservation: RESERVATION_NAME
+  extended_reservation: RESERVATION_NAME/reservationBlocks/BLOCK_NAME
+  static_node_count: NODE_COUNT
diff --git a/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml b/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml
@@ -0,0 +1,197 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+blueprint_name: gke-a3-ultra
+
+# Blueprint-wide variables, referenced by the modules below as $(vars.<name>).
+# Entries marked "add this" carry no value here and have to be supplied.
+vars:
+  project_id: # add this
+  deployment_name: # add this
+  region: # add this
+  zone: # add this
+  # CIDR block containing the IP of the machine calling terraform.
+  # The following line must be updated for this example to work.
+  authorized_cidr: # add this
+  # Reservation consumed by the a3-ultragpu-pool module.
+  extended_reservation: # add this
+  # Installs NCCL library and Google NCCL plugin
+  # Runs an init container on all H200 GPU nodes with the NCCL plugin image
+  nccl_installer_path: $(ghpc_stage("./nccl-installer.yaml"))
+  # Temporary fix for COS issue, will be fixed in next release
+  mglru_disable_path: $(ghpc_stage("./mglru-disable.yaml"))
+  # MTU used by the gke-a3-ultra-net-1 and gke-a3-ultra-rdma-net networks.
+  mtu_size: 8896
+  # Node count of the a3-ultragpu-pool module.
+  static_node_count:  # add this
+  # Disk sizes (GB) for the system node pool and the A3 Ultra node pool.
+  system_node_pool_disk_size_gb: 200
+  a3ultra_node_pool_disk_size_gb: 100
+
+deployment_groups:
+- group: primary
+  modules:
+  # Primary VPC; the a3-ultragpu-cluster module attaches to it through `use`.
+  - id: gke-a3-ultra-net-0
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e17bb15
+    settings:
+      network_name: $(vars.deployment_name)-net-0
+      subnetworks:
+      - subnet_name: $(vars.deployment_name)-sub-0
+        subnet_ip: 192.168.0.0/18
+        subnet_region: $(vars.region)
+      # Secondary ranges on sub-0, named "pods" and "services".
+      secondary_ranges_list:
+      - subnetwork_name: $(vars.deployment_name)-sub-0
+        ranges:
+        - ip_cidr_range: 10.4.0.0/14
+          range_name: pods
+        - ip_cidr_range: 10.0.32.0/20
+          range_name: services
+      # All TCP and UDP ports plus ICMP for 192.168.0.0/16, which contains
+      # every subnet range defined in this blueprint.
+      firewall_rules:
+      - name: $(vars.deployment_name)-internal-0
+        ranges:
+        - 192.168.0.0/16
+        allow:
+        - protocol: tcp
+          ports:
+          - '0-65535'
+        - protocol: udp
+          ports:
+          - '0-65535'
+        - protocol: icmp
+
+  # Second VPC, using vars.mtu_size as its MTU. Its network and subnetwork
+  # names feed the GVNIC entry in additional_networks of the cluster and
+  # node pool modules.
+  - id: gke-a3-ultra-net-1
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e17bb15
+    settings:
+      network_name: $(vars.deployment_name)-net-1
+      mtu: $(vars.mtu_size)
+      subnetworks:
+      - subnet_name: $(vars.deployment_name)-sub-1
+        subnet_ip: 192.168.64.0/18
+        subnet_region: $(vars.region)
+      # All TCP and UDP ports plus ICMP for 192.168.0.0/16.
+      firewall_rules:
+      - name: $(vars.deployment_name)-internal-1
+        ranges:
+        - 192.168.0.0/16
+        allow:
+        - protocol: tcp
+          ports:
+          - '0-65535'
+        - protocol: udp
+          ports:
+          - '0-65535'
+        - protocol: icmp
+
+  # RDMA VPC created with the gpu-rdma-vpc module. Its
+  # subnetwork_interfaces_gke output is appended to additional_networks in
+  # the cluster and node pool modules.
+  - id: gke-a3-ultra-rdma-net
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/gpu-rdma-vpc?ref=e17bb15
+    settings:
+      network_name: $(vars.deployment_name)-rdma-net
+      mtu: $(vars.mtu_size)
+      # Network profile URL built from the project ID and the zone
+      # ("<zone>-vpc-roce").
+      network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce
+      network_routing_mode: REGIONAL
+      subnetworks_template:
+        name_prefix: $(vars.deployment_name)-rdma-sub
+        # NOTE(review): 8 presumably matches the 8 GPUs of a3-ultragpu-8g
+        # (one RDMA interface each) - confirm against the module docs.
+        count: 8
+        ip_range: 192.168.128.0/18
+        region: $(vars.region)
+
+  # GKE cluster on gke-a3-ultra-net-0, with extra interfaces on the second VPC
+  # and the RDMA VPC.
+  - id: a3-ultragpu-cluster
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e17bb15
+    use: [gke-a3-ultra-net-0]
+    settings:
+      release_channel: RAPID
+      system_node_pool_machine_type: "e2-standard-16"
+      system_node_pool_disk_size_gb: $(vars.system_node_pool_disk_size_gb)
+      system_node_pool_taints: []
+      enable_dcgm_monitoring: true
+      enable_gcsfuse_csi: true
+      enable_private_endpoint: false # Allows access from authorized public IPs
+      master_authorized_networks:
+      - cidr_block: $(vars.authorized_cidr) # Allows your machine to run the kubectl command. Required for multi network setup.
+        display_name: "kubectl-access-network"
+      # Excludes minor and node upgrades between start_time and end_time.
+      # NOTE(review): the name says "indefinite" but the window ends on
+      # 2025-12-22 - extend end_time if the exclusion must last longer.
+      maintenance_exclusions:
+      - name: no-minor-or-node-upgrades-indefinite
+        start_time: "2024-12-01T00:00:00Z"
+        end_time: "2025-12-22T00:00:00Z"
+        exclusion_scope: NO_MINOR_OR_NODE_UPGRADES
+      # One GVNIC interface on gke-a3-ultra-net-1, concatenated with the
+      # interface list output by gke-a3-ultra-rdma-net. The expression below is
+      # a single multi-line scalar; do not insert comments inside it.
+      additional_networks:
+        $(concat(
+          [{
+            network=gke-a3-ultra-net-1.network_name,
+            subnetwork=gke-a3-ultra-net-1.subnetwork_name,
+            subnetwork_project=vars.project_id,
+            nic_type="GVNIC",
+            queue_count=null,
+            network_ip=null,
+            stack_type=null,
+            access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}],
+            ipv6_access_config=[],
+            alias_ip_range=[]
+          }],
+         gke-a3-ultra-rdma-net.subnetwork_interfaces_gke
+        ))
+    outputs: [instructions]
+
+  # A3 Ultra node pool in a3-ultragpu-cluster: vars.static_node_count nodes of
+  # a3-ultragpu-8g in vars.zone, each with 8 nvidia-h200-141gb accelerators.
+  - id: a3-ultragpu-pool
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e17bb15
+    use: [a3-ultragpu-cluster]
+    settings:
+      machine_type: a3-ultragpu-8g
+      auto_upgrade: true
+      zones: [$(vars.zone)]
+      disk_type: hyperdisk-balanced
+      disk_size_gb: $(vars.a3ultra_node_pool_disk_size_gb)
+      static_node_count: $(vars.static_node_count)
+      guest_accelerator:
+      - type: nvidia-h200-141gb
+        count: 8
+        gpu_driver_installation_config:
+          gpu_driver_version: "LATEST"
+      # Nodes consume the specific reservation named by
+      # vars.extended_reservation.
+      reservation_affinity:
+        consume_reservation_type: SPECIFIC_RESERVATION
+        specific_reservations:
+        - name: $(vars.extended_reservation)
+      # Same interface list as the cluster module: one GVNIC interface on
+      # gke-a3-ultra-net-1 plus the interfaces output by gke-a3-ultra-rdma-net.
+      # The expression below is a single multi-line scalar; do not insert
+      # comments inside it.
+      additional_networks:
+        $(concat(
+          [{
+            network=gke-a3-ultra-net-1.network_name,
+            subnetwork=gke-a3-ultra-net-1.subnetwork_name,
+            subnetwork_project=vars.project_id,
+            nic_type="GVNIC",
+            queue_count=null,
+            network_ip=null,
+            stack_type=null,
+            access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}],
+            ipv6_access_config=[],
+            alias_ip_range=[]
+          }],
+         gke-a3-ultra-rdma-net.subnetwork_interfaces_gke
+        ))
+    outputs: [instructions]
+
+  # Applies the community gke-topology-scheduler module to a3-ultragpu-cluster;
+  # no settings are overridden here.
+  - id: topology-aware-scheduler-install
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e17bb15
+    use: [a3-ultragpu-cluster]
+
+  # Uses the kubectl-apply module against a3-ultragpu-cluster to install
+  # Kueue and JobSet at the versions below and to apply the two staged
+  # manifests.
+  - id: workload-manager-install
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e17bb15
+    use: [a3-ultragpu-cluster]
+    settings:
+      kueue:
+        install: true
+        version: v0.10.0
+      jobset:
+        install: true
+        version: v0.7.1
+      # Paths come from vars: nccl-installer.yaml and mglru-disable.yaml,
+      # both staged with ghpc_stage.
+      apply_manifests:
+      - source: $(vars.nccl_installer_path)
+      - source: $(vars.mglru_disable_path)
+
+  # Sample job template targeting a3-ultragpu-pool: runs `nvidia-smi` in a
+  # CUDA runtime image with node_count set to 2.
+  - id: job-template
+    source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-job-template?ref=e17bb15
+    use:
+    - a3-ultragpu-pool
+    settings:
+      name: run-nvidia-smi
+      image: nvidia/cuda:11.0.3-runtime-ubuntu20.04
+      command: [nvidia-smi]
+      node_count: 2
+    outputs:
+    - instructions
diff --git a/examples/gke-a3-ultragpu/mglru-disable.yaml b/examples/gke-a3-ultragpu/mglru-disable.yaml
@@ -0,0 +1,59 @@
+# Copyright 2024 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DaemonSet that writes "n" to the host's /sys/kernel/mm/lru_gen/enabled and
+# sets log_martians=0 on the gpu0rdma0..gpu7rdma0 interfaces, then sleeps so
+# the container keeps running.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: disable-mglru
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      app: disable-mglru
+  template:
+    metadata:
+      labels:
+        app: disable-mglru
+    spec:
+      # Host network namespace, so the net.ipv4.conf.* sysctls below address
+      # the node's own interfaces.
+      hostNetwork: true
+      tolerations:
+      - operator: "Exists"
+        key: nvidia.com/gpu
+      containers:
+      - name: disable-mglru
+        # Pinned to a release tag instead of the floating "latest": this
+        # container is privileged and shares the host network, so the image
+        # contents should be reproducible. With a non-latest tag the default
+        # imagePullPolicy is IfNotPresent.
+        image: alpine:3.21
+        command: ["/bin/sh"]
+        securityContext:
+          privileged: true
+        # The script does not use `set -e`: a failing command (for example a
+        # sysctl for an interface that does not exist on the node) does not
+        # stop the commands after it.
+        args:
+        - -c
+        - |
+          echo n | tee /sys/kernel/mm/lru_gen/enabled
+          sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
+          sleep infinity
+        volumeMounts:
+        # Mount the host's lru_gen sysfs directory into the container so that
+        # it will be writable.
+        - name: sys-kernel-mm-lru-gen
+          mountPath: /sys/kernel/mm/lru_gen
+      volumes:
+      - name: sys-kernel-mm-lru-gen
+        hostPath:
+          path: /sys/kernel/mm/lru_gen
diff --git a/examples/gke-a3-ultragpu/nccl-installer.yaml b/examples/gke-a3-ultragpu/nccl-installer.yaml
@@ -0,0 +1,95 @@
+# Copyright 2024 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DaemonSet for nodes labelled with the nvidia-h200-141gb accelerator. Two
+# init containers run in order: the first sets log_martians=0 on the
+# gpu0rdma0..gpu7rdma0 interfaces, the second runs the NCCL plugin installer
+# and copies its files into hostPath directories. A pause container then keeps
+# the pod running.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nccl-rdma-installer
+  namespace: kube-system
+  labels:
+    k8s-app: nccl-rdma-installer
+spec:
+  selector:
+    matchLabels:
+      k8s-app: nccl-rdma-installer
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nccl-rdma-installer
+        k8s-app: nccl-rdma-installer
+    spec:
+      priorityClassName: system-node-critical
+      # Schedule only on nodes whose gke-accelerator label is
+      # nvidia-h200-141gb.
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: cloud.google.com/gke-accelerator
+                operator: In
+                values:
+                - nvidia-h200-141gb
+      # No key given: tolerates every taint.
+      tolerations:
+      - operator: "Exists"
+      hostNetwork: true
+      hostPID: true
+      # Host directories that receive the installed files.
+      volumes:
+      - name: library-dir-host
+        hostPath:
+          path: /home/kubernetes/bin/nvidia/lib64
+          type: DirectoryOrCreate
+      - name: gib
+        hostPath:
+          path: /home/kubernetes/bin/gib
+      initContainers:
+      - name: disable-log-martian
+        # Pinned to a release tag instead of the floating "latest", in line
+        # with the other two images in this manifest, which are pinned by
+        # version and by digest. This container is privileged, so the image
+        # contents should be reproducible. With a non-latest tag the default
+        # imagePullPolicy is IfNotPresent.
+        image: alpine:3.21
+        command: ["/bin/sh"]
+        securityContext:
+          privileged: true
+        # The script does not use `set -e`, so its exit status is that of the
+        # last sysctl command.
+        args:
+        - -c
+        - |
+          sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
+          sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
+      - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3
+        name: nccl-rdma-installer
+        resources:
+          requests:
+            cpu: 150m
+        securityContext:
+          privileged: true
+        # The two host directories are mounted under /usr/local/home/... in
+        # the container; the script below copies into those mount points.
+        volumeMounts:
+        - name: library-dir-host
+          mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
+        - name: gib
+          mountPath: /usr/local/home/kubernetes/bin/gib
+        command: ["/bin/sh", "-c"]
+        # `set -ex`: trace each command and abort on the first failure.
+        args:
+        - |
+          set -ex
+          /scripts/container_entry.sh install --install-nccl
+          cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
+          cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
+          echo "installation finishes"
+      # Long-running container that keeps the pod alive once the init
+      # containers have completed.
+      containers:
+      - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
+        name: pause