Skip to content

Commit

Permalink
Enable NFD rule for GPU resource driver Helm chart (#68)
Browse files Browse the repository at this point in the history
* add gpu nfd rule
  • Loading branch information
oxxenix authored Jan 4, 2025
1 parent 0020d7e commit 5326261
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 7 deletions.
12 changes: 10 additions & 2 deletions charts/intel-gpu-resource-driver/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,13 @@ name: intel-gpu-resource-driver
description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Resource Driver

type: application
version: 0.6.0
appVersion: "v0.6.0"
version: 0.7.0
appVersion: "v0.7.0"
home: https://github.com/intel/helm-charts

# Sub-charts bundled with this chart.
# Node Feature Discovery is referenced under the alias `nfd`, so its values
# live under the top-level `nfd:` key, and `condition: nfd.enabled` makes
# Helm include it only when that value is true.
dependencies:
  - name: node-feature-discovery
    alias: nfd
    version: "0.16.6"
    condition: nfd.enabled
    repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
4 changes: 3 additions & 1 deletion charts/intel-gpu-resource-driver/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ helm repo update
Optionally, run `helm search repo intel` to list the pulled charts.

## Install Helm Chart
When installing, update the dependencies:
```
helm dependency update
helm install intel-gpu-resource-driver intel/intel-gpu-resource-driver
```
## Upgrade Chart
Expand All @@ -43,7 +45,7 @@ You may also run `helm show values` on this chart's dependencies for additional
| image.repository | string | `intel` |
| image.name | string | `"intel-gpu-resource-driver"` |
| image.pullPolicy | string | `"IfNotPresent"` |
| image.tag | string | `"v0.6.0"` |
| image.tag | string | `"v0.7.0"` |

> [!Note]
> When upgrading, CRDs from the previous version must be removed manually, because Helm supports neither upgrading nor deleting CRDs. See: https://github.com/helm/community/blob/main/hips/hip-0011.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
apiVersion: resource.k8s.io/v1alpha3
apiVersion: resource.k8s.io/v1beta1
kind: DeviceClass
metadata:
name: gpu.intel.com
Expand Down
96 changes: 96 additions & 0 deletions charts/intel-gpu-resource-driver/templates/nfd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
{{- if .Values.nfd.enabled }}
# Node Feature Discovery (NFD) rules for Intel GPUs.
# Rendered only when `nfd.enabled` is true in the chart values.
#
# Rule object 1: label a node with
#   intel.feature.node.kubernetes.io/gpu: "true"
# when it has a PCI device with vendor ID 8086 and a display-controller
# class code (0300 or 0380), AND the i915 kernel module is reported by NFD
# as either a loaded or an enabled module (the two `matchAny` alternatives).
apiVersion: nfd.k8s-sigs.io/v1alpha1
kind: NodeFeatureRule
metadata:
  name: intel-gpu-device-rule
spec:
  rules:
    - name: "intel.gpu"
      labels:
        "intel.feature.node.kubernetes.io/gpu": "true"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            vendor: {op: In, value: ["8086"]}
            class: {op: In, value: ["0300", "0380"]}
      matchAny:
        - matchFeatures:
            - feature: kernel.loadedmodule
              matchExpressions:
                i915: {op: Exists}
        - matchFeatures:
            - feature: kernel.enabledmodule
              matchExpressions:
                i915: {op: Exists}
---
# Rule object 2: set the `gpu.intel.com/family` label according to the PCI
# device ID of the GPU found on the node. Each rule matches on PCI class,
# vendor ID 8086 and a list of device IDs for one product family.
apiVersion: nfd.k8s-sigs.io/v1alpha1
kind: NodeFeatureRule
metadata:
  name: intel-gpu-platform-labeling
spec:
  rules:
    # A_Series (Alchemist)
    - name: intel.gpu.a.series
      labels:
        gpu.intel.com/family: "A_Series"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class: {op: In, value: ["0300"]}
            vendor: {op: In, value: ["8086"]}
            device:
              op: In
              value:
                - "56a6"
                - "56a5"
                - "56a1"
                - "56a0"
                - "5694"
                - "5693"
                - "5692"
                - "5691"
                - "5690"
                - "56b3"
                - "56b2"
                - "56a4"
                - "56a3"
                - "5697"
                - "5696"
                - "5695"
                - "56b1"
                - "56b0"
    # Max_Series
    - name: intel.gpu.max.series
      labels:
        gpu.intel.com/family: "Max_Series"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class: {op: In, value: ["0380"]}
            vendor: {op: In, value: ["8086"]}
            device:
              op: In
              value:
                - "0bda"
                - "0bd5"
                - "0bd9"
                - "0bdb"
                - "0bd7"
                - "0bd6"
                - "0bd0"
    # Flex_Series
    - name: intel.gpu.flex.series
      labels:
        gpu.intel.com/family: "Flex_Series"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class: {op: In, value: ["0300", "0380"]}
            vendor: {op: In, value: ["8086"]}
            device:
              op: In
              value:
                - "0f00"
                - "0f01"
                - "0f02"
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,15 @@ spec:
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- /*
Build the pod nodeSelector.
Start from a copy of the user-supplied kubeletPlugin.nodeSelector (copied so
the shared .Values map is never mutated). When NFD is enabled, add the label
that the chart's NodeFeatureRule sets on Intel GPU nodes. User-supplied
selectors are kept instead of being dropped when nfd.enabled is true.
With the default empty nodeSelector the rendered output is unchanged.
*/}}
{{- $nodeSelector := deepCopy (.Values.kubeletPlugin.nodeSelector | default dict) }}
{{- if .Values.nfd.enabled }}
{{- $_ := set $nodeSelector "intel.feature.node.kubernetes.io/gpu" "true" }}
{{- end }}
{{- /* `with` skips the key entirely when the resulting map is empty. */}}
{{- with $nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.kubeletPlugin.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ spec:
matchConstraints:
resourceRules:
- apiGroups: ["resource.k8s.io"]
apiVersions: ["v1alpha3"]
apiVersions: ["v1beta1"]
operations: ["CREATE", "UPDATE", "DELETE"]
resources: ["resourceslices"]
matchConditions:
Expand Down
23 changes: 21 additions & 2 deletions charts/intel-gpu-resource-driver/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ image:
repository: intel
name: intel-gpu-resource-driver
pullPolicy: IfNotPresent
tag: "v0.6.0"
tag: "v0.7.0"

serviceAccount:
create: true
Expand All @@ -19,6 +19,25 @@ serviceAccount:

# Scheduling-related settings for the kubelet plugin pods.
kubeletPlugin:
  podAnnotations: {}
  # Node selector for the kubelet plugin pods. When nfd.enabled is true, the
  # kubelet plugin template uses the NFD-provided label shown below:
  #   intel.feature.node.kubernetes.io/gpu: "true"
  nodeSelector: {}
  # Default tolerations: allow scheduling onto master / control-plane nodes
  # and onto nodes carrying the GPU taint.
  tolerations:
    - key: node-role.kubernetes.io/master
      operator: Exists
      effect: NoSchedule
    - key: node-role.kubernetes.io/control-plane
      operator: Exists
      effect: NoSchedule
    # Refer to the official documentation for Node Feature Discovery (NFD)
    # regarding node tainting:
    # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
    - key: "node.kubernetes.io/gpu"
      operator: "Exists"
      effect: "NoSchedule"
  affinity: {}

# Settings for the Node Feature Discovery (NFD) integration.
# `enabled` gates the chart's NFD-specific templates; the remaining keys sit
# under the same `nfd:` key as the NFD sub-chart's values.
nfd:
  enabled: false  # change to true to install NFD to the cluster
  nameOverride: intel-gpu-nfd
  enableNodeFeatureApi: true

0 comments on commit 5326261

Please sign in to comment.