Merge pull request #218 from klueska/update-1.32
Update to work with kubernetes 1.32
klueska authored Dec 12, 2024
2 parents c0b728c + 1258b53 commit 6c34f5f
Showing 1,237 changed files with 54,928 additions and 38,469 deletions.
README.md: 9 changes (5 additions, 4 deletions)

@@ -79,13 +79,14 @@ We now install the NVIDIA GPU DRA driver:
 ./demo/clusters/kind/install-dra-driver.sh
 ```
 
-This should show two pods running in the `nvidia-dra-driver` namespace:
+This should show two pods running in the `nvidia` namespace:
 ```console
-kubectl get pods -n nvidia-dra-driver
+kubectl get pods -n nvidia
 ```
 ```
-NAME                                         READY   STATUS    RESTARTS   AGE
-nvidia-k8s-dra-driver-kubelet-plugin-t5qgz   1/1     Running   0          44s
+NAME                                                          READY   STATUS    RESTARTS   AGE
+nvidia-dra-driver-k8s-dra-driver-controller-844fcb94b-ktbkc   1/1     Running   0          69s
+nvidia-dra-driver-k8s-dra-driver-kubelet-plugin-5vfp9         1/1     Running   0          69s
 ```
 
 ### Run the examples by following the steps in the demo script
cmd/nvidia-dra-controller/imex.go: 36 changes (15 additions, 21 deletions)

@@ -25,7 +25,7 @@ import (
 	"time"
 
 	v1 "k8s.io/api/core/v1"
-	resourceapi "k8s.io/api/resource/v1alpha3"
+	resourceapi "k8s.io/api/resource/v1beta1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/selection"
@@ -60,7 +60,6 @@ type ImexManager struct
 	waitGroup         sync.WaitGroup
 	clientset         kubernetes.Interface
 	imexDomainOffsets imexDomainOffsets
-	owner             resourceslice.Owner
 	driverResources   *resourceslice.DriverResources
 }
 
@@ -77,20 +76,6 @@ func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error)
 		return nil, fmt.Errorf("error creating dynamic client: %w", err)
 	}
 
-	// Fetch the current Pod object
-	pod, err := clientset.CoreV1().Pods(config.flags.namespace).Get(ctx, config.flags.podName, metav1.GetOptions{})
-	if err != nil {
-		return nil, fmt.Errorf("error fetching pod: %w", err)
-	}
-
-	// Set the owner of the ResourceSlices we will create
-	owner := resourceslice.Owner{
-		APIVersion: "v1",
-		Kind:       "Pod",
-		Name:       pod.Name,
-		UID:        pod.UID,
-	}
-
 	// Create a new set of DriverResources
 	driverResources := &resourceslice.DriverResources{
 		Pools: make(map[string]resourceslice.Pool),
@@ -103,7 +88,6 @@ func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error)
 		driverImexChannelLimit: DriverImexChannelLimit,
 		retryTimeout:           RetryTimeout,
 		clientset:              clientset,
-		owner:                  owner,
 		driverResources:        driverResources,
 		imexDomainOffsets:      make(imexDomainOffsets),
 	}
@@ -125,8 +109,14 @@ func (m *ImexManager) manageResourceSlices(ctx context.Context) error
 		return fmt.Errorf("error streaming IMEX domains: %w", err)
 	}
 
+	options := resourceslice.Options{
+		DriverName: m.driverName,
+		KubeClient: m.clientset,
+		Resources:  m.driverResources,
+	}
+
 	klog.Info("Start publishing IMEX channels to ResourceSlices...")
-	controller, err := resourceslice.StartController(ctx, m.clientset, m.driverName, m.owner, m.driverResources)
+	controller, err := resourceslice.StartController(ctx, options)
 	if err != nil {
 		return fmt.Errorf("error starting resource slice controller: %w", err)
 	}
@@ -310,13 +300,13 @@ func (m *ImexManager) cleanupResourceSlices() error
 	ops := metav1.ListOptions{
 		FieldSelector: fmt.Sprintf("%s=%s", resourceapi.ResourceSliceSelectorDriver, DriverName),
 	}
-	l, err := m.clientset.ResourceV1alpha3().ResourceSlices().List(context.Background(), ops)
+	l, err := m.clientset.ResourceV1beta1().ResourceSlices().List(context.Background(), ops)
 	if err != nil {
 		return fmt.Errorf("error listing resource slices: %w", err)
 	}
 
 	for _, rs := range l.Items {
-		err := m.clientset.ResourceV1alpha3().ResourceSlices().Delete(context.Background(), rs.Name, metav1.DeleteOptions{})
+		err := m.clientset.ResourceV1beta1().ResourceSlices().Delete(context.Background(), rs.Name, metav1.DeleteOptions{})
 		if err != nil {
 			return fmt.Errorf("error deleting resource slice %s: %w", rs.Name, err)
 		}
@@ -415,7 +405,11 @@ func generateImexChannelPool(imexDomain string, startChannel int, numChannels in
 				},
 			},
 		},
-		Devices: devices,
+		Slices: []resourceslice.Slice{
+			{
+				Devices: devices,
+			},
+		},
 	}
 
 	return pool
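Note on the API change above: in the 1.32 release of `k8s.io/dynamic-resource-allocation`, `resourceslice.StartController` takes a single `resourceslice.Options` struct instead of positional `clientset`, `driverName`, `owner`, and `driverResources` arguments, which is why the Pod-derived `Owner` can simply be dropped. A minimal sketch of the new wiring, assuming that module at v0.32; the driver name is a hypothetical placeholder:

```go
// Minimal sketch of the v1.32-style startup, assuming
// k8s.io/dynamic-resource-allocation at v0.32. Error handling is trimmed
// to the essentials.
package main

import (
	"context"

	"k8s.io/client-go/kubernetes"
	"k8s.io/dynamic-resource-allocation/resourceslice"
)

func startSlicePublisher(ctx context.Context, clientset kubernetes.Interface) error {
	// DriverResources holds the pools the controller publishes and
	// reconciles as ResourceSlice objects.
	driverResources := &resourceslice.DriverResources{
		Pools: make(map[string]resourceslice.Pool),
	}

	// 1.32 bundles the former positional arguments into one Options struct;
	// the Owner field that previously had to be derived from the running Pod
	// is optional and omitted here, as in the commit above.
	options := resourceslice.Options{
		DriverName: "gpu.example.com", // hypothetical driver name
		KubeClient: clientset,
		Resources:  driverResources,
	}

	controller, err := resourceslice.StartController(ctx, options)
	if err != nil {
		return err
	}
	defer controller.Stop()

	<-ctx.Done() // publish until the context is cancelled
	return nil
}
```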
cmd/nvidia-dra-plugin/allocatable.go: 2 changes (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ package main
 import (
 	"slices"
 
-	resourceapi "k8s.io/api/resource/v1alpha3"
+	resourceapi "k8s.io/api/resource/v1beta1"
 )
 
 type AllocatableDevices map[string]*AllocatableDevice
cmd/nvidia-dra-plugin/device_state.go: 4 changes (2 additions, 2 deletions)

@@ -22,10 +22,10 @@ import (
 	"slices"
 	"sync"
 
-	resourceapi "k8s.io/api/resource/v1alpha3"
+	resourceapi "k8s.io/api/resource/v1beta1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/klog/v2"
-	drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
+	drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
 	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
 	cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
 
cmd/nvidia-dra-plugin/deviceinfo.go: 30 changes (18 additions, 12 deletions)

@@ -22,7 +22,7 @@ import (
 	"github.com/Masterminds/semver"
 	nvdev "github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
-	resourceapi "k8s.io/api/resource/v1alpha3"
+	resourceapi "k8s.io/api/resource/v1beta1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/utils/ptr"
 )
@@ -131,8 +131,10 @@ func (d *GpuInfo) GetDevice() resourceapi.Device {
 					VersionValue: ptr.To(semver.MustParse(d.cudaDriverVersion).String()),
 				},
 			},
-			Capacity: map[resourceapi.QualifiedName]resource.Quantity{
-				"memory": *resource.NewQuantity(int64(d.memoryBytes), resource.BinarySI),
+			Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
+				"memory": {
+					Value: *resource.NewQuantity(int64(d.memoryBytes), resource.BinarySI),
+				},
 			},
 		},
 	}
@@ -181,20 +183,24 @@ func (d *MigDeviceInfo) GetDevice() resourceapi.Device {
 					VersionValue: ptr.To(semver.MustParse(d.parent.cudaDriverVersion).String()),
 				},
 			},
-			Capacity: map[resourceapi.QualifiedName]resource.Quantity{
-				"multiprocessors": *resource.NewQuantity(int64(d.giProfileInfo.MultiprocessorCount), resource.BinarySI),
-				"copyEngines":     *resource.NewQuantity(int64(d.giProfileInfo.CopyEngineCount), resource.BinarySI),
-				"decoders":        *resource.NewQuantity(int64(d.giProfileInfo.DecoderCount), resource.BinarySI),
-				"encoders":        *resource.NewQuantity(int64(d.giProfileInfo.EncoderCount), resource.BinarySI),
-				"jpegEngines":     *resource.NewQuantity(int64(d.giProfileInfo.JpegCount), resource.BinarySI),
-				"ofaEngines":      *resource.NewQuantity(int64(d.giProfileInfo.OfaCount), resource.BinarySI),
-				"memory":          *resource.NewQuantity(int64(d.giProfileInfo.MemorySizeMB*1024*1024), resource.BinarySI),
+			Capacity: map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
+				"multiprocessors": {
+					Value: *resource.NewQuantity(int64(d.giProfileInfo.MultiprocessorCount), resource.BinarySI),
+				},
+				"copyEngines": {Value: *resource.NewQuantity(int64(d.giProfileInfo.CopyEngineCount), resource.BinarySI)},
+				"decoders":    {Value: *resource.NewQuantity(int64(d.giProfileInfo.DecoderCount), resource.BinarySI)},
+				"encoders":    {Value: *resource.NewQuantity(int64(d.giProfileInfo.EncoderCount), resource.BinarySI)},
+				"jpegEngines": {Value: *resource.NewQuantity(int64(d.giProfileInfo.JpegCount), resource.BinarySI)},
+				"ofaEngines":  {Value: *resource.NewQuantity(int64(d.giProfileInfo.OfaCount), resource.BinarySI)},
+				"memory":      {Value: *resource.NewQuantity(int64(d.giProfileInfo.MemorySizeMB*1024*1024), resource.BinarySI)},
 			},
 		},
 	}
 	for i := d.placement.Start; i < d.placement.Start+d.placement.Size; i++ {
 		capacity := resourceapi.QualifiedName(fmt.Sprintf("memorySlice%d", i))
-		device.Basic.Capacity[capacity] = *resource.NewQuantity(1, resource.BinarySI)
+		device.Basic.Capacity[capacity] = resourceapi.DeviceCapacity{
+			Value: *resource.NewQuantity(1, resource.BinarySI),
+		}
 	}
 	return device
 }
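The type change above is the heart of the `v1alpha3` to `v1beta1` migration in this file: capacity maps now hold `resourceapi.DeviceCapacity` values rather than bare `resource.Quantity` values. A minimal before/after sketch, assuming `k8s.io/api` at v0.32; the 80 GiB figure is a made-up example:

```go
// Sketch of the v1alpha3 -> v1beta1 capacity change, assuming k8s.io/api
// at v0.32. The memory value is illustrative only.
package main

import (
	resourceapi "k8s.io/api/resource/v1beta1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func exampleCapacity() map[resourceapi.QualifiedName]resourceapi.DeviceCapacity {
	memoryBytes := int64(80) * 1024 * 1024 * 1024 // 80 GiB, made up

	// v1alpha3 mapped names directly to resource.Quantity:
	//   "memory": *resource.NewQuantity(memoryBytes, resource.BinarySI)
	// v1beta1 wraps the same Quantity in DeviceCapacity.Value:
	return map[resourceapi.QualifiedName]resourceapi.DeviceCapacity{
		"memory": {Value: *resource.NewQuantity(memoryBytes, resource.BinarySI)},
	}
}
```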
cmd/nvidia-dra-plugin/driver.go: 8 changes (5 additions, 3 deletions)

@@ -25,9 +25,11 @@ import (
 	coreclientset "k8s.io/client-go/kubernetes"
 	"k8s.io/dynamic-resource-allocation/kubeletplugin"
 	"k8s.io/klog/v2"
-	drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
+	drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
 )
 
+var _ drapbv1.DRAPluginServer = &driver{}
+
 type driver struct {
 	sync.Mutex
 	client coreclientset.Interface
@@ -48,7 +50,7 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) {
 
 	plugin, err := kubeletplugin.Start(
 		ctx,
-		driver,
+		[]any{driver},
 		kubeletplugin.KubeClient(driver.client),
 		kubeletplugin.NodeName(config.flags.nodeName),
 		kubeletplugin.DriverName(DriverName),
@@ -117,7 +119,7 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *drapbv1.Claim)
 	d.Lock()
 	defer d.Unlock()
 
-	resourceClaim, err := d.client.ResourceV1alpha3().ResourceClaims(claim.Namespace).Get(
+	resourceClaim, err := d.client.ResourceV1beta1().ResourceClaims(claim.Namespace).Get(
 		ctx,
 		claim.Name,
 		metav1.GetOptions{})
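Two related changes are visible here: the kubelet gRPC API moves from `v1alpha4` to `v1beta1` (with a new compile-time assertion that the driver satisfies `drapbv1.DRAPluginServer`), and `kubeletplugin.Start` now takes a `[]any` of gRPC service implementations rather than a single server. A stub sketch under those assumptions; the method bodies and driver name are placeholders, not code from this repository:

```go
// Stub sketch of the v1beta1 kubelet-plugin wiring.
package main

import (
	"context"

	"k8s.io/dynamic-resource-allocation/kubeletplugin"
	drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
)

type stubDriver struct{}

// Compile-time check mirroring the one added in driver.go.
var _ drapbv1.DRAPluginServer = &stubDriver{}

func (d *stubDriver) NodePrepareResources(ctx context.Context, req *drapbv1.NodePrepareResourcesRequest) (*drapbv1.NodePrepareResourcesResponse, error) {
	// A real driver prepares devices per claim; the stub reports no claims.
	return &drapbv1.NodePrepareResourcesResponse{Claims: map[string]*drapbv1.NodePrepareResourceResponse{}}, nil
}

func (d *stubDriver) NodeUnprepareResources(ctx context.Context, req *drapbv1.NodeUnprepareResourcesRequest) (*drapbv1.NodeUnprepareResourcesResponse, error) {
	return &drapbv1.NodeUnprepareResourcesResponse{Claims: map[string]*drapbv1.NodeUnprepareResourceResponse{}}, nil
}

func run(ctx context.Context) error {
	// v1.32: the second argument is a list of service implementations,
	// which is why the diff wraps the driver as []any{driver}.
	plugin, err := kubeletplugin.Start(
		ctx,
		[]any{&stubDriver{}},
		kubeletplugin.DriverName("gpu.example.com"), // hypothetical
	)
	if err != nil {
		return err
	}
	defer plugin.Stop()

	<-ctx.Done() // serve until cancelled
	return nil
}
```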
cmd/nvidia-dra-plugin/prepared.go: 2 changes (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ package main
 import (
 	"slices"
 
-	drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
+	drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
 )
 
 type PreparedDeviceList []PreparedDevice
demo/clusters/kind/scripts/common.sh: 2 changes (1 addition, 1 deletion)

@@ -39,7 +39,7 @@ DRIVER_IMAGE_VERSION=$(from_versions_mk "VERSION")
 # From https://github.com/kubernetes/kubernetes/tags
 # See also https://hub.docker.com/r/kindest/node/tags
 : ${KIND_K8S_REPO:="https://github.com/kubernetes/kubernetes.git"}
-: ${KIND_K8S_TAG:="v1.31.0"}
+: ${KIND_K8S_TAG:="v1.32.0"}
 
 # The name of the kind cluster to create
 : ${KIND_CLUSTER_NAME:="${DRIVER_NAME}-cluster"}
demo/clusters/kind/scripts/kind-cluster-config.yaml: 6 changes (5 additions, 1 deletion)

@@ -32,7 +32,7 @@ nodes:
     kind: ClusterConfiguration
     apiServer:
       extraArgs:
-        runtime-config: "resource.k8s.io/v1alpha3=true"
+        runtime-config: "resource.k8s.io/v1beta1=true"
   scheduler:
     extraArgs:
       v: "1"
@@ -66,3 +66,7 @@ nodes:
   # on the kind nodes.
   - hostPath: /usr/bin/nvidia-ctk
     containerPath: /usr/bin/nvidia-ctk
+  # We need to inject the fabricmanager socket to support MIG with toolkit 1.16.2
+  # TODO: Remove this once we have a version of the toolkit where this is not required
+  - hostPath: /run/nvidia-fabricmanager/socket
+    containerPath: /run/nvidia-fabricmanager/socket
demo/clusters/nvkind/scripts/kind-cluster-config.yaml: 15 changes (12 additions, 3 deletions)

@@ -32,7 +32,7 @@ nodes:
     kind: ClusterConfiguration
     apiServer:
       extraArgs:
-        runtime-config: "resource.k8s.io/v1alpha3=true"
+        runtime-config: "resource.k8s.io/v1beta1=true"
   scheduler:
     extraArgs:
       v: "1"
@@ -48,8 +48,6 @@ nodes:
 - role: worker
   labels:
     node-role.x-k8s.io/worker: ""
-    nvidia.com/gpu.clusteruuid: "0f884867-ba2f-4294-9155-b495ff367eea"
-    nvidia.com/gpu.cliqueid: "{{ add 1 (mod $gpu 2) }}"
   kubeadmConfigPatches:
   - |
     kind: JoinConfiguration
@@ -62,4 +60,15 @@ nodes:
       # in `/etc/nvidia-container-runtime/config.toml`
       - hostPath: /dev/null
         containerPath: /var/run/nvidia-container-devices/cdi/runtime.nvidia.com/gpu/{{ $gpu }}
+      # The generated CDI specification assumes that `nvidia-ctk` is available on a
+      # node -- specifically for the `nvidia-ctk hook` subcommand. As a workaround,
+      # we mount it from the host.
+      # TODO: Remove this once we have a more stable solution to make `nvidia-ctk`
+      # on the kind nodes.
+      - hostPath: /usr/bin/nvidia-ctk
+        containerPath: /usr/bin/nvidia-ctk
+      # We need to inject the fabricmanager socket to support MIG with toolkit 1.16.2
+      # TODO: Remove this once we have a version of the toolkit where this is not required
+      - hostPath: /run/nvidia-fabricmanager/socket
+        containerPath: /run/nvidia-fabricmanager/socket
 {{- end }}
demo/specs/quickstart/gpu-test-mps.yaml: 2 changes (1 addition, 1 deletion)

@@ -5,7 +5,7 @@ kind: Namespace
 metadata:
   name: gpu-test-mps
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaimTemplate
 metadata:
   namespace: gpu-test-mps
demo/specs/quickstart/gpu-test1.yaml: 2 changes (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ metadata:
   name: gpu-test1
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaimTemplate
 metadata:
   namespace: gpu-test1
demo/specs/quickstart/gpu-test2.yaml: 2 changes (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ metadata:
   name: gpu-test2
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaimTemplate
 metadata:
   namespace: gpu-test2
demo/specs/quickstart/gpu-test3.yaml: 2 changes (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ metadata:
   name: gpu-test3
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaim
 metadata:
   namespace: gpu-test3
demo/specs/quickstart/gpu-test4.yaml: 2 changes (1 addition, 1 deletion)

@@ -9,7 +9,7 @@ metadata:
   name: gpu-test4
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaimTemplate
 metadata:
   namespace: gpu-test4
demo/specs/quickstart/gpu-test5.yaml: 2 changes (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ metadata:
   name: gpu-test5
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaimTemplate
 metadata:
   namespace: gpu-test5
demo/specs/quickstart/gpu-test6.yaml: 2 changes (1 addition, 1 deletion)

@@ -9,7 +9,7 @@ metadata:
   name: gpu-test6
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaimTemplate
 metadata:
   namespace: gpu-test6
demo/specs/quickstart/imex-test1.yaml: 10 changes (5 additions, 5 deletions)

@@ -8,7 +8,7 @@ metadata:
   name: imex-test1
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaim
 metadata:
   namespace: imex-test1
@@ -20,7 +20,7 @@ spec:
   deviceClassName: imex.nvidia.com
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaim
 metadata:
   namespace: imex-test1
@@ -32,7 +32,7 @@ spec:
   deviceClassName: imex.nvidia.com
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaim
 metadata:
   namespace: imex-test1
@@ -44,7 +44,7 @@ spec:
   deviceClassName: imex.nvidia.com
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaim
 metadata:
   namespace: imex-test1
@@ -56,7 +56,7 @@ spec:
   deviceClassName: imex.nvidia.com
 
 ---
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: ResourceClaimTemplate
 metadata:
   namespace: imex-test1
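All the quickstart specs now request the `resource.k8s.io/v1beta1` API group, matching the typed `ResourceV1beta1()` client calls in the driver changes above. As a purely hypothetical smoke test (not part of this commit), one could confirm the upgraded cluster serves the new group by listing claims in a demo namespace:

```go
// Hypothetical smoke test: list ResourceClaims in the imex-test1 namespace
// through the v1beta1 typed client, assuming a kubeconfig at the default
// location. Fails if resource.k8s.io/v1beta1 is not enabled.
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	clientset := kubernetes.NewForConfigOrDie(config)

	claims, err := clientset.ResourceV1beta1().ResourceClaims("imex-test1").List(context.Background(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	for _, claim := range claims.Items {
		fmt.Println(claim.Name)
	}
}
```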