From 53262613c03ac9b63a08c70b9376fd723b5a7eaa Mon Sep 17 00:00:00 2001
From: Oksana Baranova
Date: Sun, 5 Jan 2025 00:01:47 +0200
Subject: [PATCH] Enable NFD rule for GPU resource driver Helm chart (#68)

* add gpu nfd rule
---
Notes (this section is not part of the commit message):

What the patch does
  * Chart.yaml: bumps version/appVersion 0.6.0 -> 0.7.0, adds `home`, and
    declares node-feature-discovery 0.16.6 as a dependency (alias `nfd`)
    that is only pulled in when `nfd.enabled` is true.
  * templates/nfd.yaml (new): rendered only when `nfd.enabled` is true.
    Contains two NodeFeatureRule objects:
      - intel-gpu-device-rule sets
        intel.feature.node.kubernetes.io/gpu="true" when a pci.device with
        vendor 8086 and class 0300/0380 matches AND i915 matches either
        kernel.loadedmodule or kernel.enabledmodule.
      - intel-gpu-platform-labeling sets gpu.intel.com/family to A_Series,
        Max_Series or Flex_Series from the listed PCI device ids.
  * templates/resource-driver.yaml: when `nfd.enabled` is true the
    nodeSelector is fixed to intel.feature.node.kubernetes.io/gpu="true";
    `kubeletPlugin.nodeSelector` from values is rendered only in the else
    branch, i.e. it is not used while NFD is enabled.
  * templates/device-class.yaml and validating-admission-policy.yaml move
    from resource.k8s.io v1alpha3 to v1beta1.
  * values.yaml: image tag v0.7.0, non-empty default tolerations, and a new
    `nfd` section (disabled by default).

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. Hunk line counts were re-checked
against the @@ headers and the diffstat, but the exact position of blank
context lines (Chart.yaml, README.md) and the indentation of context lines
(resource-driver.yaml, validating-admission-policy.yaml) are best effort -
TODO confirm against the target tree before applying.
NOTE(review): templates/nfd.yaml now ends with a newline (the original
carried a "No newline at end of file" marker). Blob ids on the index lines
are carried over unchanged and may no longer match the resulting blobs.

 charts/intel-gpu-resource-driver/Chart.yaml   | 12 ++-
 charts/intel-gpu-resource-driver/README.md    |  4 +-
 .../templates/device-class.yaml               |  2 +-
 .../templates/nfd.yaml                        | 96 +++++++++++++++++++
 .../templates/resource-driver.yaml            |  5 +
 .../validating-admission-policy.yaml          |  2 +-
 charts/intel-gpu-resource-driver/values.yaml  | 23 ++++-
 7 files changed, 137 insertions(+), 7 deletions(-)
 create mode 100644 charts/intel-gpu-resource-driver/templates/nfd.yaml

diff --git a/charts/intel-gpu-resource-driver/Chart.yaml b/charts/intel-gpu-resource-driver/Chart.yaml
index 9bd90bb..dc665a1 100644
--- a/charts/intel-gpu-resource-driver/Chart.yaml
+++ b/charts/intel-gpu-resource-driver/Chart.yaml
@@ -3,5 +3,13 @@ name: intel-gpu-resource-driver
 description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Resource Driver
 
 type: application
-version: 0.6.0
-appVersion: "v0.6.0"
+version: 0.7.0
+appVersion: "v0.7.0"
+home: https://github.com/intel/helm-charts
+
+dependencies:
+  - name: node-feature-discovery
+    alias: nfd
+    version: "0.16.6"
+    condition: nfd.enabled
+    repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
diff --git a/charts/intel-gpu-resource-driver/README.md b/charts/intel-gpu-resource-driver/README.md
index f27419b..8b55522 100644
--- a/charts/intel-gpu-resource-driver/README.md
+++ b/charts/intel-gpu-resource-driver/README.md
@@ -16,7 +16,9 @@ helm repo update
 You can execute `helm search repo intel` command to see pulled charts [optional].
 
 ## Install Helm Chart
+When installing, update the dependencies:
 ```
+helm dependency update
 helm install intel-gpu-resource-driver intel/intel-gpu-resource-driver
 ```
 ## Upgrade Chart
@@ -43,7 +45,7 @@ You may also run `helm show values` on this chart's dependencies for additional
 | image.repository | string | `intel` |
 | image.name | string | `"intel-gpu-resource-driver"` |
 | image.pullPolicy | string | `"IfNotPresent"` |
-| image.tag | string | `"v0.6.0"` |
+| image.tag | string | `"v0.7.0"` |
 
 > [!Note]
 > When upgrading, CRDs from previous version need to be removed manually because Helm supports neither upgrading nor deleting CRDs, see: https://github.com/helm/community/blob/main/hips/hip-0011.md
diff --git a/charts/intel-gpu-resource-driver/templates/device-class.yaml b/charts/intel-gpu-resource-driver/templates/device-class.yaml
index cb28849..93a0881 100644
--- a/charts/intel-gpu-resource-driver/templates/device-class.yaml
+++ b/charts/intel-gpu-resource-driver/templates/device-class.yaml
@@ -1,4 +1,4 @@
-apiVersion: resource.k8s.io/v1alpha3
+apiVersion: resource.k8s.io/v1beta1
 kind: DeviceClass
 metadata:
   name: gpu.intel.com
diff --git a/charts/intel-gpu-resource-driver/templates/nfd.yaml b/charts/intel-gpu-resource-driver/templates/nfd.yaml
new file mode 100644
index 0000000..b6ae2a1
--- /dev/null
+++ b/charts/intel-gpu-resource-driver/templates/nfd.yaml
@@ -0,0 +1,96 @@
+{{- if .Values.nfd.enabled }}
+apiVersion: nfd.k8s-sigs.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: intel-gpu-device-rule
+spec:
+  rules:
+    - name: "intel.gpu"
+      labels:
+        "intel.feature.node.kubernetes.io/gpu": "true"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor: {op: In, value: ["8086"]}
+            class: {op: In, value: ["0300", "0380"]}
+      matchAny:
+        - matchFeatures:
+            - feature: kernel.loadedmodule
+              matchExpressions:
+                i915: {op: Exists}
+        - matchFeatures:
+            - feature: kernel.enabledmodule
+              matchExpressions:
+                i915: {op: Exists}
+---
+apiVersion: nfd.k8s-sigs.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: intel-gpu-platform-labeling
+spec:
+  rules:
+    # A_Series (Alchemist)
+    - labels:
+        gpu.intel.com/family: "A_Series"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class: {op: In, value: ["0300"]}
+            vendor: {op: In, value: ["8086"]}
+            device:
+              op: In
+              value:
+                - "56a6"
+                - "56a5"
+                - "56a1"
+                - "56a0"
+                - "5694"
+                - "5693"
+                - "5692"
+                - "5691"
+                - "5690"
+                - "56b3"
+                - "56b2"
+                - "56a4"
+                - "56a3"
+                - "5697"
+                - "5696"
+                - "5695"
+                - "56b1"
+                - "56b0"
+      name: intel.gpu.a.series
+    # Max_Series
+    - labels:
+        gpu.intel.com/family: "Max_Series"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class: {op: In, value: ["0380"]}
+            vendor: {op: In, value: ["8086"]}
+            device:
+              op: In
+              value:
+                - "0bda"
+                - "0bd5"
+                - "0bd9"
+                - "0bdb"
+                - "0bd7"
+                - "0bd6"
+                - "0bd0"
+      name: intel.gpu.max.series
+    # Flex_Series
+    - labels:
+        gpu.intel.com/family: "Flex_Series"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            class: {op: In, value: ["0300", "0380"]}
+            vendor: {op: In, value: ["8086"]}
+            device:
+              op: In
+              value:
+                - "0f00"
+                - "0f01"
+                - "0f02"
+      name: intel.gpu.flex.series
+{{- end }}
diff --git a/charts/intel-gpu-resource-driver/templates/resource-driver.yaml b/charts/intel-gpu-resource-driver/templates/resource-driver.yaml
index 2fbba87..400c471 100644
--- a/charts/intel-gpu-resource-driver/templates/resource-driver.yaml
+++ b/charts/intel-gpu-resource-driver/templates/resource-driver.yaml
@@ -73,10 +73,15 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
       {{- end }}
+      {{- if .Values.nfd.enabled }}
+      nodeSelector:
+        intel.feature.node.kubernetes.io/gpu: "true"
+      {{- else }}
       {{- with .Values.kubeletPlugin.nodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 8 }}
       {{- end }}
+      {{- end }}
       {{- with .Values.kubeletPlugin.affinity }}
       affinity:
         {{- toYaml . | nindent 8 }}
diff --git a/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml b/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml
index dfa1256..503aeb5 100644
--- a/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml
+++ b/charts/intel-gpu-resource-driver/templates/validating-admission-policy.yaml
@@ -7,7 +7,7 @@ spec:
   matchConstraints:
     resourceRules:
     - apiGroups: ["resource.k8s.io"]
-      apiVersions: ["v1alpha3"]
+      apiVersions: ["v1beta1"]
       operations: ["CREATE", "UPDATE", "DELETE"]
       resources: ["resourceslices"]
   matchConditions:
diff --git a/charts/intel-gpu-resource-driver/values.yaml b/charts/intel-gpu-resource-driver/values.yaml
index a3ee0eb..4345000 100644
--- a/charts/intel-gpu-resource-driver/values.yaml
+++ b/charts/intel-gpu-resource-driver/values.yaml
@@ -9,7 +9,7 @@ image:
   repository: intel
   name: intel-gpu-resource-driver
   pullPolicy: IfNotPresent
-  tag: "v0.6.0"
+  tag: "v0.7.0"
 
 serviceAccount:
   create: true
@@ -19,6 +19,25 @@ serviceAccount:
 
 kubeletPlugin:
   podAnnotations: {}
-  tolerations: []
   nodeSelector: {}
+    # label used when nfd.enabled is true
+    #intel.feature.node.kubernetes.io/gpu: "true"
+  tolerations:
+    - key: node-role.kubernetes.io/master
+      operator: Exists
+      effect: NoSchedule
+    - key: node-role.kubernetes.io/control-plane
+      operator: Exists
+      effect: NoSchedule
+    # Refer to the official documentation for Node Feature Discovery (NFD)
+    # regarding node tainting:
+    # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
+    - key: "node.kubernetes.io/gpu"
+      operator: "Exists"
+      effect: "NoSchedule"
   affinity: {}
+
+nfd:
+  enabled: false  # change to true to install NFD to the cluster
+  nameOverride: intel-gpu-nfd
+  enableNodeFeatureApi: true