Merge pull request #23 from NVIDIA/dra-on-gke
Address issues with running DRA driver on GKE
elezar authored Nov 21, 2023
2 parents 1bdce27 + af5302d commit 1c4bf38
Showing 35 changed files with 510 additions and 88 deletions.
2 changes: 2 additions & 0 deletions cmd/nvidia-dra-plugin/cdi.go
@@ -50,6 +50,7 @@ type CDIHandler struct {
nvcdi nvcdi.Interface
registry cdiapi.Registry
driverRoot string
devRoot string
targetDriverRoot string
nvidiaCTKPath string

@@ -84,6 +85,7 @@ func NewCDIHandler(opts ...cdiOption) (*CDIHandler, error) {
nvcdilib, err := nvcdi.New(
nvcdi.WithDeviceLib(h.nvdevice),
nvcdi.WithDriverRoot(h.driverRoot),
nvcdi.WithDevRoot(h.devRoot),
nvcdi.WithLogger(h.logger),
nvcdi.WithNvmlLib(h.nvml),
nvcdi.WithMode("nvml"),
11 changes: 8 additions & 3 deletions cmd/nvidia-dra-plugin/device_state.go
@@ -22,6 +22,7 @@ import (
"sync"

"github.com/NVIDIA/go-nvlib/pkg/nvml"
"k8s.io/klog/v2"

nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1"
)
@@ -126,7 +127,8 @@ type DeviceState struct {
}

func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
-nvdevlib, err := newDeviceLib(root(config.flags.containerDriverRoot))
+containerDriverRoot := root(config.flags.containerDriverRoot)
+nvdevlib, err := newDeviceLib(containerDriverRoot)
if err != nil {
return nil, fmt.Errorf("failed to create device library: %w", err)
}
@@ -136,12 +138,15 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
return nil, fmt.Errorf("error enumerating all possible devices: %w", err)
}

+devRoot := containerDriverRoot.getDevRoot()
+klog.Infof("using devRoot=%v", devRoot)

hostDriverRoot := config.flags.hostDriverRoot
-containerDriverRoot := config.flags.containerDriverRoot
cdi, err := NewCDIHandler(
WithNvml(nvdevlib.nvmllib),
WithDeviceLib(nvdevlib),
-WithDriverRoot(containerDriverRoot),
+WithDriverRoot(string(containerDriverRoot)),
+WithDevRoot(devRoot),
WithTargetDriverRoot(hostDriverRoot),
WithNvidiaCTKPath(config.flags.nvidiaCTKPath),
WithCDIRoot(config.flags.cdiRoot),
20 changes: 20 additions & 0 deletions cmd/nvidia-dra-plugin/find.go
@@ -18,6 +18,7 @@ package main

import (
"fmt"
"os"
"path/filepath"
)

@@ -60,6 +61,25 @@ func (r root) getNvidiaSMIPath() (string, error) {
return binaryPath, nil
}

// isDevRoot checks whether the specified root is a dev root.
// A dev root is defined as a root containing a /dev folder.
func (r root) isDevRoot() bool {
stat, err := os.Stat(filepath.Join(string(r), "dev"))
if err != nil {
return false
}
return stat.IsDir()
}

// getDevRoot returns the dev root associated with the root.
// If the root is not a dev root, this defaults to "/".
func (r root) getDevRoot() string {
if r.isDevRoot() {
return string(r)
}
return "/"
}

// findFile searches the root for a specified file.
// A number of folders can be specified to search in addition to the root itself.
// If the file represents a symlink, this is resolved and the final path is returned.
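The dev root is derived from the container driver root: the driver root is only used as the location of device nodes when it actually contains a `dev/` directory; otherwise the driver falls back to the host's `/`. A minimal shell sketch of the same check, where the `/opt/nvidia` path is only illustrative (it matches the `nvidiaDriverRoot` set by the GKE install script later in this change):

```bash
#!/bin/bash
# Sketch of the getDevRoot() fallback above; the driver root path is an example.
driver_root="/opt/nvidia"
if [ -d "${driver_root}/dev" ]; then
  dev_root="${driver_root}"   # the driver root also carries device nodes
else
  dev_root="/"                # fall back to the host /dev
fi
echo "using devRoot=${dev_root}"
```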
7 changes: 7 additions & 0 deletions cmd/nvidia-dra-plugin/options.go
@@ -31,6 +31,13 @@ func WithDriverRoot(root string) cdiOption {
}
}

// WithDevRoot provides a cdiOption to set the device root used by the 'cdi' interface.
func WithDevRoot(root string) cdiOption {
return func(c *CDIHandler) {
c.devRoot = root
}
}

// WithTargetDriverRoot provides a cdiOption to set the target driver root used by the 'cdi' interface.
func WithTargetDriverRoot(root string) cdiOption {
return func(c *CDIHandler) {
135 changes: 135 additions & 0 deletions demo/clusters/gke/create-cluster.sh
@@ -0,0 +1,135 @@
#!/bin/bash

# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

: ${PROJECT_NAME:=$(gcloud config list --format 'value(core.project)' 2>/dev/null)}

if [[ -z ${PROJECT_NAME} ]]; then
echo "Project name could not be determined"
echo "Please run 'gcloud config set project'"
exit 1
fi

CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
PROJECT_DIR="$(cd -- "$( dirname -- "${CURRENT_DIR}/../../../.." )" &> /dev/null && pwd)"

# We extract information from versions.mk
function from_versions_mk() {
local makevar=$1
local value=$(grep -E "^\s*${makevar}\s+[\?:]= " ${PROJECT_DIR}/versions.mk)
echo ${value##*= }
}
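# For example, a versions.mk entry such as 'DRIVER_NAME ?= k8s-dra-driver' (the
# exact value is defined in versions.mk, not here) would make
# from_versions_mk "DRIVER_NAME" echo "k8s-dra-driver".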
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"

## Create the Network for the cluster
gcloud compute networks create "${NETWORK_NAME}" \
--quiet \
--project="${PROJECT_NAME}" \
--description=Manually\ created\ network\ for\ TMS\ DRA\ Alpha\ cluster \
--subnet-mode=auto \
--mtu=1460 \
--bgp-routing-mode=regional

## Create the cluster
gcloud container clusters create "${CLUSTER_NAME}" \
--quiet \
--enable-kubernetes-alpha \
--no-enable-autorepair \
--no-enable-autoupgrade \
--region us-west1 \
--network "${NETWORK_NAME}" \
--node-labels=nvidia.com/dra.controller=true

# Create t4 node pool
gcloud beta container node-pools create "pool-1" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--node-version "1.27.3-gke.100" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-t4,count=1" \
--image-type "UBUNTU_CONTAINERD" \
--disk-type "pd-standard" \
--disk-size "100" \
--metadata disable-legacy-endpoints=true \
--scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \
--num-nodes "2" \
--enable-autoscaling \
--min-nodes "2" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true

# Create v100 node pool
gcloud beta container node-pools create "pool-2" \
--quiet \
--project "${PROJECT_NAME}" \
--cluster "${CLUSTER_NAME}" \
--region "us-west1" \
--node-version "1.27.3-gke.100" \
--machine-type "n1-standard-8" \
--accelerator "type=nvidia-tesla-v100,count=1" \
--image-type "UBUNTU_CONTAINERD" \
--disk-type "pd-standard" \
--disk-size "100" \
--metadata disable-legacy-endpoints=true \
--scopes "https://www.googleapis.com/auth/devstorage.read_only","https://www.googleapis.com/auth/logging.write","https://www.googleapis.com/auth/monitoring","https://www.googleapis.com/auth/servicecontrol","https://www.googleapis.com/auth/service.management.readonly","https://www.googleapis.com/auth/trace.append" \
--num-nodes "1" \
--enable-autoscaling \
--min-nodes "1" \
--max-nodes "6" \
--location-policy "ANY" \
--no-enable-autoupgrade \
--no-enable-autorepair \
--max-surge-upgrade 1 \
--max-unavailable-upgrade 0 \
--node-locations "us-west1-a" \
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true

## Allow the GPU nodes access to the internet
gcloud compute routers create ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--network "${NETWORK_NAME}" \
--region "us-west1"

gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--nat-all-subnet-ip-ranges \
--auto-allocate-nat-external-ips \
--router-region "us-west1"

## Start using this cluster for kubectl
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1"

## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online:
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml

## Create the nvidia namespace
kubectl create namespace nvidia

## Deploy a custom daemonset that prepares a node for use with DRA
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/456d097feb452cca1351817bab2ccd0782e96c9f/demo/prepare-gke-nodes-for-dra.yaml
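
Once the script completes, one possible sanity check, using only the labels applied above, is to confirm that the nodes carry the expected DRA labels:

```console
# GPU nodes from pool-1 and pool-2, labelled for the kubelet plugin
kubectl get nodes -l nvidia.com/dra.kubelet-plugin=true

# Default-pool nodes labelled to run the DRA controller
kubectl get nodes -l nvidia.com/dra.controller=true
```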
55 changes: 55 additions & 0 deletions demo/clusters/gke/delete-cluster.sh
@@ -0,0 +1,55 @@
#!/bin/bash

# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

: ${PROJECT_NAME:=$(gcloud config list --format 'value(core.project)' 2>/dev/null)}

CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
PROJECT_DIR="$(cd -- "$( dirname -- "${CURRENT_DIR}/../../../.." )" &> /dev/null && pwd)"

# We extract information from versions.mk
function from_versions_mk() {
local makevar=$1
local value=$(grep -E "^\s*${makevar}\s+[\?:]= " ${PROJECT_DIR}/versions.mk)
echo ${value##*= }
}
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

NETWORK_NAME="${DRIVER_NAME}-net"
CLUSTER_NAME="${DRIVER_NAME}-cluster"

## Delete the cluster
gcloud container clusters delete "${CLUSTER_NAME}" \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"

## Delete the nat config
gcloud compute routers nats delete "${NETWORK_NAME}-nat-config" \
--quiet \
--project "${PROJECT_NAME}" \
--router "${NETWORK_NAME}-nat-router" \
--router-region "us-west1"

## Delete the nat router
gcloud compute routers delete ${NETWORK_NAME}-nat-router \
--quiet \
--project "${PROJECT_NAME}" \
--region "us-west1"

## Delete the network
gcloud compute networks delete "${NETWORK_NAME}" \
--quiet \
--project "${PROJECT_NAME}"
41 changes: 41 additions & 0 deletions demo/clusters/gke/install-dra-driver.sh
@@ -0,0 +1,41 @@
#!/bin/bash

# Copyright 2023 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
PROJECT_DIR="$(cd -- "$( dirname -- "${CURRENT_DIR}/../../../.." )" &> /dev/null && pwd)"

# We extract information from versions.mk
function from_versions_mk() {
local makevar=$1
local value=$(grep -E "^\s*${makevar}\s+[\?:]= " ${PROJECT_DIR}/versions.mk)
echo ${value##*= }
}
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

: ${IMAGE_REGISTRY:=registry.gitlab.com/nvidia/cloud-native/k8s-dra-driver/staging}
: ${IMAGE_NAME:=${DRIVER_NAME}}
: ${IMAGE_TAG:=530b16c-ubuntu20.04}

helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
--set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \
--set image.tag=${IMAGE_TAG} \
--set image.pullPolicy=Always \
--set controller.priorityClassName="" \
--set kubeletPlugin.priorityClassName="" \
--set nvidiaDriverRoot="/opt/nvidia" \
--set kubeletPlugin.tolerations[0].key=nvidia.com/gpu \
--set kubeletPlugin.tolerations[0].operator=Exists \
--set kubeletPlugin.tolerations[0].effect=NoSchedule
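
After the chart is installed, the release and its pods can be inspected in the `nvidia` namespace; the exact pod names depend on the chart, so this is only a quick sanity check:

```console
helm list -n nvidia
kubectl get pods -n nvidia
```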
48 changes: 48 additions & 0 deletions demo/specs/selectors/README.md
@@ -0,0 +1,48 @@
#### List the set of nodes in the cluster
```console
kubectl get nodes -A
```

#### Show the set of nodes which have GPUs available
```console
kubectl get nodeallocationstates.nas.gpu.resource.nvidia.com -A
```

#### Show the set of allocatable GPUs from each node
```console
kubectl get nodeallocationstates.nas.gpu.resource.nvidia.com -A -o=json \
| jq -r '.items[]
| "\(.metadata.name):",
(.spec.allocatableDevices[])'
```

#### Open the yaml files with the specs for the demo
```console
vi -O parameters.yaml claims.yaml pods.yaml
```

#### Create a namespace for the demo and deploy the demo pods
```console
kubectl create namespace kubecon-demo
kubectl apply -f parameters.yaml -f claims.yaml -f pods.yaml
```

#### Show the pods running
```console
kubectl get pod -n kubecon-demo
```

#### Show the set of GPUs allocated to each claim
```console
kubectl get nodeallocationstates.nas.gpu.resource.nvidia.com -A -o=json \
| jq -r '.items[]
| select(.spec.allocatedClaims)
| "\(.metadata.name):",
(.spec.allocatedClaims[])'
```

#### Show the logs of the inference and training pods
```console
kubectl logs -n kubecon-demo inference-pod
kubectl logs -n kubecon-demo training-pod
```
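
#### Clean up the demo
Assuming all of the demo objects were created in the `kubecon-demo` namespace, deleting that namespace removes them:
```console
kubectl delete namespace kubecon-demo
```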