Skip to content

Commit

Permalink
Merge pull request GoogleCloudPlatform#3475 from GoogleCloudPlatform/…
Browse files Browse the repository at this point in the history
…a3ultra-preview

Release v1.44.1: A3 Ultra (a3-ultragpu-8g) blueprints
  • Loading branch information
tpdownes authored Dec 30, 2024
2 parents 6a19416 + 7cd0a0e commit 346d015
Show file tree
Hide file tree
Showing 26 changed files with 2,978 additions and 4 deletions.
4 changes: 4 additions & 0 deletions examples/gke-a3-ultragpu/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Refer to [AI Hypercomputer Documentation](https://cloud.google.com/ai-hypercomputer/docs/create/gke-ai-hypercompute#create-cluster) for instructions.

If you are unable to access this documentation, please contact your
[Technical Account Manager (TAM)](https://cloud.google.com/tam).
30 changes: 30 additions & 0 deletions examples/gke-a3-ultragpu/gke-a3-ultragpu-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Deployment settings for the gke-a3-ultra blueprint. Replace every
# placeholder value (BUCKET_NAME, PROJECT_ID, ...) before deploying.

# Terraform backend of type "gcs", using the named bucket.
terraform_backend_defaults:
  type: gcs
  configuration:
    bucket: BUCKET_NAME

vars:
  deployment_name: gke-a3-ultra
  project_id: PROJECT_ID
  region: COMPUTE_REGION
  zone: COMPUTE_ZONE
  authorized_cidr: <IP_ADDRESS>/<SUFFIX>
  # To use the reservation without targeting a specific BLOCK_NAME, set
  #   extended_reservation: RESERVATION_NAME
  extended_reservation: RESERVATION_NAME/reservationBlocks/BLOCK_NAME
  static_node_count: NODE_COUNT
197 changes: 197 additions & 0 deletions examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
# Copyright 2024 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

blueprint_name: gke-a3-ultra

# Entries marked "add this" are intentionally left empty in the blueprint and
# must be given a value before deploying.
vars:
  project_id:  # add this
  deployment_name:  # add this
  region:  # add this
  zone:  # add this
  # Cidr block containing the IP of the machine calling terraform.
  # The following line must be updated for this example to work.
  authorized_cidr:  # add this
  extended_reservation:  # add this
  # Installs NCCL library and Google NCCL plugin
  # Runs an init container on all H200 GPU nodes with the NCCL plugin image
  nccl_installer_path: $(ghpc_stage("./nccl-installer.yaml"))
  # Temporary fix for COS issue, will be fixed in next release
  mglru_disable_path: $(ghpc_stage("./mglru-disable.yaml"))
  # MTU applied to the gke-a3-ultra-net-1 and gke-a3-ultra-rdma-net networks.
  mtu_size: 8896
  static_node_count:  # add this
  system_node_pool_disk_size_gb: 200
  a3ultra_node_pool_disk_size_gb: 100

deployment_groups:
  - group: primary
    modules:
      # Primary VPC for the GKE cluster (consumed by a3-ultragpu-cluster via
      # `use`). Its subnetwork carries secondary ranges named "pods" and
      # "services".
      - id: gke-a3-ultra-net-0
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e17bb15
        settings:
          network_name: $(vars.deployment_name)-net-0
          subnetworks:
            - subnet_name: $(vars.deployment_name)-sub-0
              subnet_region: $(vars.region)
              subnet_ip: 192.168.0.0/18
          secondary_ranges_list:
            - subnetwork_name: $(vars.deployment_name)-sub-0
              ranges:
                - range_name: pods
                  ip_cidr_range: 10.4.0.0/14
                - range_name: services
                  ip_cidr_range: 10.0.32.0/20
          # All TCP/UDP ports and ICMP for 192.168.0.0/16, which contains the
          # three /18 ranges used by the networks in this blueprint.
          firewall_rules:
            - name: $(vars.deployment_name)-internal-0
              ranges: [192.168.0.0/16]
              allow:
                - protocol: tcp
                  ports: ["0-65535"]
                - protocol: udp
                  ports: ["0-65535"]
                - protocol: icmp

      # Second VPC, attached to the cluster and node pool as an additional
      # GVNIC interface (see additional_networks below). Uses vars.mtu_size.
      - id: gke-a3-ultra-net-1
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e17bb15
        settings:
          network_name: $(vars.deployment_name)-net-1
          mtu: $(vars.mtu_size)
          subnetworks:
            - subnet_name: $(vars.deployment_name)-sub-1
              subnet_region: $(vars.region)
              subnet_ip: 192.168.64.0/18
          firewall_rules:
            - name: $(vars.deployment_name)-internal-1
              ranges: [192.168.0.0/16]
              allow:
                - protocol: tcp
                  ports: ["0-65535"]
                - protocol: udp
                  ports: ["0-65535"]
                - protocol: icmp

      # RDMA VPC built from the zone's "vpc-roce" network profile. Eight
      # subnetworks are generated from the template inside 192.168.128.0/18.
      - id: gke-a3-ultra-rdma-net
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/gpu-rdma-vpc?ref=e17bb15
        settings:
          network_name: $(vars.deployment_name)-rdma-net
          mtu: $(vars.mtu_size)
          network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce
          network_routing_mode: REGIONAL
          subnetworks_template:
            name_prefix: $(vars.deployment_name)-rdma-sub
            count: 8
            ip_range: 192.168.128.0/18
            region: $(vars.region)

      # GKE cluster on gke-a3-ultra-net-0, with net-1 and the RDMA
      # subnetworks attached as additional networks.
      - id: a3-ultragpu-cluster
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e17bb15
        use: [gke-a3-ultra-net-0]
        settings:
          release_channel: RAPID
          system_node_pool_machine_type: "e2-standard-16"
          system_node_pool_disk_size_gb: $(vars.system_node_pool_disk_size_gb)
          system_node_pool_taints: []
          enable_dcgm_monitoring: true
          enable_gcsfuse_csi: true
          enable_private_endpoint: false  # Allows access from authorized public IPs
          master_authorized_networks:
            # Allows your machine to run the kubectl command.
            # Required for multi network setup.
            - cidr_block: $(vars.authorized_cidr)
              display_name: "kubectl-access-network"
          # Excludes minor and node upgrades between start_time and end_time.
          maintenance_exclusions:
            - name: no-minor-or-node-upgrades-indefinite
              start_time: "2024-12-01T00:00:00Z"
              end_time: "2025-12-22T00:00:00Z"
              exclusion_scope: NO_MINOR_OR_NODE_UPGRADES
          # One GVNIC interface on net-1 followed by the interfaces exported
          # by the RDMA network module. This is a single multi-line plain
          # scalar; the same expression is repeated on the node pool.
          additional_networks:
            $(concat(
              [{
                network=gke-a3-ultra-net-1.network_name,
                subnetwork=gke-a3-ultra-net-1.subnetwork_name,
                subnetwork_project=vars.project_id,
                nic_type="GVNIC",
                queue_count=null,
                network_ip=null,
                stack_type=null,
                access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}],
                ipv6_access_config=[],
                alias_ip_range=[]
              }],
              gke-a3-ultra-rdma-net.subnetwork_interfaces_gke
            ))
        outputs: [instructions]

      # Statically sized a3-ultragpu-8g node pool (8 x nvidia-h200-141gb per
      # node) that consumes the specific reservation in
      # vars.extended_reservation.
      - id: a3-ultragpu-pool
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e17bb15
        use: [a3-ultragpu-cluster]
        settings:
          machine_type: a3-ultragpu-8g
          auto_upgrade: true
          zones: [$(vars.zone)]
          disk_type: hyperdisk-balanced
          disk_size_gb: $(vars.a3ultra_node_pool_disk_size_gb)
          static_node_count: $(vars.static_node_count)
          guest_accelerator:
            - type: nvidia-h200-141gb
              count: 8
              gpu_driver_installation_config:
                gpu_driver_version: "LATEST"
          reservation_affinity:
            consume_reservation_type: SPECIFIC_RESERVATION
            specific_reservations:
              - name: $(vars.extended_reservation)
          # Same additional network interfaces as on the cluster module.
          additional_networks:
            $(concat(
              [{
                network=gke-a3-ultra-net-1.network_name,
                subnetwork=gke-a3-ultra-net-1.subnetwork_name,
                subnetwork_project=vars.project_id,
                nic_type="GVNIC",
                queue_count=null,
                network_ip=null,
                stack_type=null,
                access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}],
                ipv6_access_config=[],
                alias_ip_range=[]
              }],
              gke-a3-ultra-rdma-net.subnetwork_interfaces_gke
            ))
        outputs: [instructions]

      # Installs the topology scheduler module on the cluster.
      - id: topology-aware-scheduler-install
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e17bb15
        use: [a3-ultragpu-cluster]

      # Installs Kueue and JobSet at the pinned versions, then applies the
      # two staged manifests referenced by vars.nccl_installer_path and
      # vars.mglru_disable_path.
      - id: workload-manager-install
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e17bb15
        use: [a3-ultragpu-cluster]
        settings:
          kueue:
            install: true
            version: v0.10.0
          jobset:
            install: true
            version: v0.7.1
          apply_manifests:
            - source: $(vars.nccl_installer_path)
            - source: $(vars.mglru_disable_path)

      # Sample job template: runs `nvidia-smi` with node_count 2 on the A3
      # Ultra node pool.
      - id: job-template
        source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-job-template?ref=e17bb15
        use: [a3-ultragpu-pool]
        settings:
          image: nvidia/cuda:11.0.3-runtime-ubuntu20.04
          command:
            - nvidia-smi
          node_count: 2
          name: run-nvidia-smi
        outputs: [instructions]
59 changes: 59 additions & 0 deletions examples/gke-a3-ultragpu/mglru-disable.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright 2024 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Privileged DaemonSet that writes "n" to the host's
# /sys/kernel/mm/lru_gen/enabled (turning off multi-gen LRU) and sets
# log_martians=0 on the gpu0rdma0..gpu7rdma0 interfaces, then sleeps forever.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: disable-mglru
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: disable-mglru
  template:
    metadata:
      labels:
        app: disable-mglru
    spec:
      # NOTE(review): presumably required so the net.ipv4.conf.* sysctls act
      # on the host's interfaces rather than a pod network namespace - confirm.
      hostNetwork: true
      # Tolerate the nvidia.com/gpu taint whatever its value.
      tolerations:
        - operator: "Exists"
          key: nvidia.com/gpu
      containers:
        - name: disable-mglru
          # NOTE(review): `latest` is a floating tag; consider pinning a
          # specific alpine version for reproducible rollouts.
          image: alpine:latest
          command: ["/bin/sh"]
          securityContext:
            privileged: true
          args:
            - -c
            # No `set -e`: a failing command does not stop the ones after it.
            # `sleep infinity` keeps the container running once the settings
            # have been applied.
            - |
              echo n | tee /sys/kernel/mm/lru_gen/enabled
              sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
              sleep infinity
          volumeMounts:
            - name: sys-kernel-mm-lru-gen
              mountPath: /sys/kernel/mm/lru_gen
      # Expose the host's lru_gen sysfs directory to the container at the same
      # path so that the write to `enabled` above reaches the host and the
      # file is writable.
      volumes:
        - name: sys-kernel-mm-lru-gen
          hostPath:
            path: /sys/kernel/mm/lru_gen
95 changes: 95 additions & 0 deletions examples/gke-a3-ultragpu/nccl-installer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Copyright 2024 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DaemonSet scheduled only on nodes labelled
# cloud.google.com/gke-accelerator=nvidia-h200-141gb. Two init containers run
# in order (disable martian-packet logging, then install the NCCL plugin
# files onto the host), after which a pause container is the pod's only
# regular container.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nccl-rdma-installer
  namespace: kube-system
  labels:
    k8s-app: nccl-rdma-installer
spec:
  selector:
    matchLabels:
      k8s-app: nccl-rdma-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nccl-rdma-installer
        k8s-app: nccl-rdma-installer
    spec:
      priorityClassName: system-node-critical
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h200-141gb
      # No key given: tolerates every taint.
      tolerations:
        - operator: "Exists"
      hostNetwork: true
      hostPID: true
      # Host directories that receive the installed files.
      volumes:
        - name: library-dir-host
          hostPath:
            path: /home/kubernetes/bin/nvidia/lib64
            type: DirectoryOrCreate
        - name: gib
          hostPath:
            path: /home/kubernetes/bin/gib
      initContainers:
        # Step 1: set log_martians=0 on gpu0rdma0..gpu7rdma0.
        - name: disable-log-martian
          image: alpine:latest
          command: ["/bin/sh"]
          securityContext:
            privileged: true
          args:
            - -c
            - |
              sysctl -w net.ipv4.conf.gpu0rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu1rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu2rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu3rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu4rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu5rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu6rdma0.log_martians=0
              sysctl -w net.ipv4.conf.gpu7rdma0.log_martians=0
        # Step 2: run the plugin image's install script, then copy
        # /var/lib/gib/lib64 and /var/lib/gib into the two host directories
        # mounted below. `set -ex` aborts on the first failing command.
        - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.0.3
          name: nccl-rdma-installer
          resources:
            requests:
              cpu: 150m
          securityContext:
            privileged: true
          volumeMounts:
            - name: library-dir-host
              mountPath: /usr/local/home/kubernetes/bin/nvidia/lib64
            - name: gib
              mountPath: /usr/local/home/kubernetes/bin/gib
          command: ["/bin/sh", "-c"]
          args:
            - |
              set -ex
              /scripts/container_entry.sh install --install-nccl
              cp -r /var/lib/gib/lib64/. /usr/local/home/kubernetes/bin/nvidia/lib64
              cp -r /var/lib/gib/. /usr/local/home/kubernetes/bin/gib
              echo "installation finishes"
      # Digest-pinned pause image; it is the only regular container in the pod.
      containers:
        - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
          name: pause
Loading

0 comments on commit 346d015

Please sign in to comment.