From 66bdc4f55332e2173f86d85aa94f9362eefbd9a8 Mon Sep 17 00:00:00 2001 From: Kevin Hannon Date: Mon, 16 Sep 2024 16:29:34 -0400 Subject: [PATCH] Implement a KEP for DRA and Kueue --- keps/2941-DRA-Structured-Parameters/README.md | 257 ++++++++++++++---- .../examples/gpu-test1/gpu-test1.yaml | 75 ++--- .../gpu-test1/single-clusterqueue-setup.yaml | 7 +- 3 files changed, 254 insertions(+), 85 deletions(-) diff --git a/keps/2941-DRA-Structured-Parameters/README.md b/keps/2941-DRA-Structured-Parameters/README.md index 0b204c806a..3057e648c2 100644 --- a/keps/2941-DRA-Structured-Parameters/README.md +++ b/keps/2941-DRA-Structured-Parameters/README.md @@ -41,21 +41,105 @@ tags, and then generate with `hack/update-toc.sh`. ## Summary Dynamic Resource Allocation (DRA) is a major effort to improve device support in Kubernetes. -It changes how one can request resources in a myriad of ways. Kueue should be able to integrate with DRA. +It changes how one can request resources in a myriad of ways. ## Motivation -DRA allows for more elaborate scheduling of devices. It puts control in how devices are scheduled into the device driver. +Dynamic Resource Allocation (DRA) provides the groundwork for more sophisticated device allocations to Pods. +Quota management is about enforcing rules around the use of resources. +For example, GPUs are resource constrained and a popular request is the ability to enforce fair sharing of GPU resources. +With these devices, many users want access and sometimes some users want the ability to preempt other users if their workloads have a higher priority. Kueue provides support for this. -### DRA Background +DRA provides a future where users could schedule partitionable GPU devices (MIG) or time slicing. As devices gain a more robust way to schedule, it is important to walk through how support of DRA will work with Kueue. -To be able to dive into details for Kueue, I first want to summarize the different usecases for DRA from a workload perspective. 
+### Background -#### Examples +DRA has three APIs that are relevant for Kueue: +- Resource Claims +- DeviceClasses +- ResourceSlices + +#### DRA Example + +I found the easiest way to test DRA was to use the [dra-example-driver repository](https://github.com/kubernetes-sigs/dra-example-driver) + +You can clone that repo and run `make setup-e2e` and that will create a Kind cluster with the DRA feature gate and install a mock dra driver. + +This does not use actual GPUs so it is perfect for a test environment for exploring Kueue and DRA integration. + +#### Workload Example + +An example workload that uses DRA: + +```yaml +--- + +apiVersion: resource.k8s.io/v1alpha3 +kind: ResourceClaimTemplate +metadata: + namespace: gpu-test1 + name: single-gpu +spec: + spec: + devices: + requests: + - name: gpu + deviceClassName: gpu.example.com + +--- + +apiVersion: batch/v1 +kind: Job +metadata: + namespace: gpu-test1 + name: job0 + labels: + app: job + kueue.x-k8s.io/queue-name: user-queue +spec: + template: + spec: + restartPolicy: Never + containers: + - name: ctr0 + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["export; sleep 9999"] + resources: + claims: + - name: gpu + requests: + cpu: 1 + memory: "200Mi" + resourceClaims: + - name: gpu + resourceClaimTemplateName: single-gpu +``` + +#### Example Driver Cluster Resources + +The dra-example-driver creates a resource slice and a device class for the entire cluster. + +##### Resource slices + +Resource slices are meant for communication between drivers and the control plane. These are not expected to be used for workloads. + +Kueue does not need to be aware of these resources. + +##### Device classes + +Each driver creates a device class and every resource claim will reference the device class. + +The dra-example-driver has a simple device class named `gpu.example.com`. + +This can be a way to enforce quota limits. ### Goals +- Users can submit workloads using resource claims and Kueue can monitor the usage. 
+- Admins can enforce the number of requests to a given device class. + - -[ ] I/we understand the owners of the involved components may require updates to +[x] I/we understand the owners of the involved components may require updates to existing tests to make this code solid enough prior to committing the changes necessary to implement this enhancement. @@ -167,10 +337,19 @@ This can inform certain test coverage improvements that we want to do before extending the production code to implement this enhancement. --> +TBD - ``: `` - `` #### Integration tests +I am not sure if we can test DRA functionality (requiring alpha features enabled) at the integration level. + +DRA requires a kubelet plugin so this may not be a good candidate for an integration test. + +#### E2E Test + +It may be worth installing the dra-example-driver and testing this e2e. + +The goal will be to limit changes so that they apply only if this feature gate is enabled in combination with the DRA feature. ## Implementation History - +- Draft on September 16th 2024. ## Drawbacks - +NA. Kueue should be able to schedule devices following what upstream is proposing. +The only drawback is that workloads will have to fetch the resource claim if they are specifying resource claims. ## Alternatives - +### Resource Claim By Count + +Originally I was thinking one could keep a tally of the resource claims for a given workload. +The issue with this is that resource claims are namespace-scoped. +To enforce quota usage across namespaces we need to use cluster-scoped resources. 
\ No newline at end of file diff --git a/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/gpu-test1.yaml b/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/gpu-test1.yaml index a62bd795b3..a2b072f70a 100644 --- a/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/gpu-test1.yaml +++ b/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/gpu-test1.yaml @@ -1,41 +1,46 @@ # Two pods, one container each # Each container asking for 1 distinct GPU - --- - apiVersion: resource.k8s.io/v1alpha3 - kind: ResourceClaimTemplate - metadata: - namespace: gpu-test1 - name: gpu.example.com - spec: - spec: - devices: - requests: - - name: gpu - deviceClassName: gpu.example.com +--- - --- - apiVersion: batch/v1 - kind: Job - metadata: - namespace: gpu-test1 - name: job0 - labels: - app: job - kueue.x-k8s.io/queue-name: user-queue +apiVersion: resource.k8s.io/v1alpha3 +kind: ResourceClaimTemplate +metadata: + namespace: gpu-test1 + name: single-gpu +spec: spec: - template: - spec: - restartPolicy: Never - containers: - - name: ctr0 - image: ubuntu:22.04 - command: ["bash", "-c"] - args: ["export; sleep 9999"] - resources: - claims: - - name: gpu - resourceClaims: - - name: gpu - resourceClaimTemplateName: gpu.example.com + devices: + requests: + - name: gpu + deviceClassName: gpu.example.com + +--- + +apiVersion: batch/v1 +kind: Job +metadata: + namespace: gpu-test1 + name: job0 + labels: + app: job + kueue.x-k8s.io/queue-name: user-queue +spec: + template: + spec: + restartPolicy: Never + containers: + - name: ctr0 + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["export; sleep 9999"] + resources: + claims: + - name: gpu + requests: + cpu: 1 + memory: "200Mi" + resourceClaims: + - name: gpu + resourceClaimTemplateName: single-gpu diff --git a/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml b/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml index 8f4e0e9ecf..8434ee8df1 100644 --- 
a/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml +++ b/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml @@ -10,14 +10,17 @@ metadata: spec: namespaceSelector: {} # match all. resourceGroups: - - coveredResources: ["cpu", "memory"] + - coveredResources: ["cpu", "memory", "gpu.example.com"] flavors: - name: "default-flavor" resources: - name: "cpu" nominalQuota: 9 - name: "memory" - nominalQuota: 36Gi + nominalQuota: "200Mi" + - name: "gpu.example.com" + nominalQuota: 2 + kind: "DeviceClass" --- apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue