Skip to content

Commit

Permalink
Merge pull request #220 from klueska/update-gke-demo
Browse files (browse the repository at this point in the history)
Update GKE deployment script for kubernetes 1.32
  • Loading branch information
klueska authored Dec 20, 2024
2 parents 6c34f5f + 737b187 commit b1fe289
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 19 deletions.
33 changes: 18 additions & 15 deletions demo/clusters/gke/create-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,35 +35,38 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"
NODE_VERSION="1.31.1"
NODE_VERSION="1.32"
ROUTER_REGION="us-central1"
REGION="us-central1-c"

## Create the Network for the cluster
gcloud compute networks create "${NETWORK_NAME}" \
--quiet \
--project="${PROJECT_NAME}" \
--description=Manually\ created\ network\ for\ TMS\ DRA\ Alpha\ cluster \
--description="Manually created network for DRA beta test cluster" \
--subnet-mode=auto \
--mtu=1460 \
--bgp-routing-mode=regional

## Create the cluster
gcloud container clusters create "${CLUSTER_NAME}" \
--quiet \
--enable-kubernetes-alpha \
--enable-kubernetes-unstable-apis="resource.k8s.io/v1beta1/deviceclasses,resource.k8s.io/v1beta1/resourceclaims,resource.k8s.io/v1beta1/resourceclaimtemplates,resource.k8s.io/v1beta1/resourceslices" \
--release-channel=rapid \
--no-enable-autorepair \
--no-enable-autoupgrade \
--region us-west1 \
--enable-autoupgrade \
--region "${REGION}" \
--num-nodes "1" \
--network "${NETWORK_NAME}" \
--cluster-version "${NODE_VERSION}" \
--node-version "${NODE_VERSION}"
--node-version "${NODE_VERSION}" \

# Create t4 node pool
gcloud beta container node-pools create "pool-1" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--region "${REGION}" \
--node-version "${NODE_VERSION}" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-t4,count=1" \
Expand All @@ -77,19 +80,19 @@ gcloud beta container node-pools create "pool-1" \
--min-nodes "2" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-locations "${REGION}" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true

# Create v100 node pool
gcloud beta container node-pools create "pool-2" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--region "${REGION}" \
--node-version "${NODE_VERSION}" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-v100,count=1" \
Expand All @@ -103,30 +106,30 @@ gcloud beta container node-pools create "pool-2" \
--min-nodes "1" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-locations "${REGION}" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true

## Allow the GPU nodes access to the internet
gcloud compute routers create ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--network "${NETWORK_NAME}" \
--region "us-west1"
--region "${ROUTER_REGION}" \

gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--nat-all-subnet-ip-ranges \
--auto-allocate-nat-external-ips \
--router-region "us-west1"
--router-region "${ROUTER_REGION}" \

## Start using this cluster for kubectl
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1"
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="${REGION}"

## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online:
kubectl label node --overwrite -l nvidia.com/gpu.present=true cloud.google.com/gke-gpu-driver-version-
Expand Down
8 changes: 5 additions & 3 deletions demo/clusters/gke/delete-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,27 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"
ROUTER_REGION="us-central1"
REGION="us-central1-c"

## Delete the cluster
gcloud container clusters delete "${CLUSTER_NAME}" \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"
--region "${REGION}"

## Delete the nat config
gcloud compute routers nats delete "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--router-region "us-west1"
--router-region "${ROUTER_REGION}"

## Delete the nat router
gcloud compute routers delete ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"
--region "${ROUTER_REGION}"

## Delete the network
gcloud compute networks delete "${NETWORK_NAME}" \
Expand Down
2 changes: 1 addition & 1 deletion demo/clusters/gke/install-dra-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

: ${IMAGE_REGISTRY:=ghcr.io/nvidia}
: ${IMAGE_NAME:=${DRIVER_NAME}}
: ${IMAGE_TAG:=32805fec-ubi8}
: ${IMAGE_TAG:=6c34f5fb-ubi8}

helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
--set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \
Expand Down

0 comments on commit b1fe289

Please sign in to comment.