Skip to content

Commit

Permalink
add gpu nfd rule
Browse files Browse the repository at this point in the history
  • Loading branch information
oxxenix committed Dec 10, 2024
1 parent 5f43b0d commit e5bd563
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 2 deletions.
8 changes: 8 additions & 0 deletions charts/intel-gpu-resource-driver/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,11 @@ description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Reso
type: application
version: 0.6.0
appVersion: "v0.6.0"
home: https://github.com/intel/helm-charts

# Optional Node Feature Discovery (NFD) subchart. It is only pulled into the
# release when `nfd.enabled` is true in values, and its values are read from
# the `nfd` key (the alias) rather than from the subchart's own name.
dependencies:
  - name: node-feature-discovery
    alias: nfd
    version: "0.16.6"
    condition: nfd.enabled
    repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
2 changes: 2 additions & 0 deletions charts/intel-gpu-resource-driver/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ helm repo update
You can execute `helm search repo intel` command to see pulled charts [optional].

## Install Helm Chart
When installing, update the dependencies:
```
helm dependency update
helm install intel-gpu-resource-driver intel/intel-gpu-resource-driver
```
## Upgrade Chart
Expand Down
229 changes: 229 additions & 0 deletions charts/intel-gpu-resource-driver/templates/nfd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# NodeFeatureRule that marks nodes carrying an Intel GPU.
# A node receives the label `feature.node.kubernetes.io/gpu: "true"` when it
# exposes a PCI device with vendor 8086 and class 0300 or 0380, and at least
# one of the matchAny alternatives holds: the i915 or the xe kernel module is
# reported as either loaded or enabled.
# NOTE(review): nothing in this template guards it with a condition, so it is
# rendered even when `nfd.enabled` is false (the default in values.yaml);
# presumably the NodeFeatureRule CRD must then already exist in the cluster
# -- confirm.
apiVersion: nfd.k8s-sigs.io/v1alpha1
kind: NodeFeatureRule
metadata:
  name: intel-gpu-device-rule
spec:
  rules:
    - name: "intel.gpu"
      labels:
        "feature.node.kubernetes.io/gpu": "true"
      # Hardware condition: both expressions must match the same PCI device.
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            vendor: {op: In, value: ["8086"]}
            class: {op: In, value: ["0300", "0380"]}
      # Driver condition: any one of the four alternatives is sufficient.
      matchAny:
        - matchFeatures:
            - feature: kernel.loadedmodule
              matchExpressions:
                i915: {op: Exists}
        - matchFeatures:
            - feature: kernel.enabledmodule
              matchExpressions:
                i915: {op: Exists}
        - matchFeatures:
            - feature: kernel.loadedmodule
              matchExpressions:
                xe: {op: Exists}
        - matchFeatures:
            - feature: kernel.enabledmodule
              matchExpressions:
                xe: {op: Exists}
---
# NodeFeatureRule that derives Intel GPU extended resources and
# `gpu.intel.com/*` labels (device-id presence and counts, product, family)
# from local labels and from PCI devices with vendor 8086.
# The doubled-brace escapes in the labelsTemplate values are there so that
# Helm emits literal template delimiters, leaving the inner expressions
# to be evaluated later by NFD rather than by Helm.
apiVersion: nfd.k8s-sigs.io/v1alpha1
kind: NodeFeatureRule
metadata:
  name: intel-gpu-platform-labeling
spec:
  rules:
    # Publish millicores / memory.max / tiles as extended resources, taking
    # each value from the local label of the same name. The rule only fires
    # when all three local labels exist.
    - extendedResources:
        gpu.intel.com/millicores: "@local.label.gpu.intel.com/millicores"
        gpu.intel.com/memory.max: "@local.label.gpu.intel.com/memory.max"
        gpu.intel.com/tiles: "@local.label.gpu.intel.com/tiles"
      matchFeatures:
        - feature: local.label
          matchExpressions:
            gpu.intel.com/millicores: {op: Exists}
            gpu.intel.com/memory.max: {op: Exists}
            gpu.intel.com/tiles: {op: Exists}
      name: intel.gpu.fractionalresources
    # generic rule for older and upcoming devices
    # Emits one `gpu.intel.com/device-id.<class>-<device>.present=true` label
    # per matched PCI device (class 0300 or 0380).
    - labelsTemplate: |
        {{"{{"}} range .pci.device {{"}}"}}gpu.intel.com/device-id.{{"{{"}} .class {{"}}"}}-{{"{{"}} .device {{"}}"}}.present=true
        {{"{{"}} end {{"}}"}}
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0300"
                - "0380"
            vendor:
              op: In
              value:
                - "8086"
      name: intel.gpu.generic.deviceid
    # Count of matched class-0300 devices; the label name uses the device ID
    # of the first matched device only.
    - labelsTemplate: gpu.intel.com/device-id.0300-{{"{{"}} (index .pci.device 0).device {{"}}"}}.count={{"{{"}} len .pci.device {{"}}"}}
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0300"
            vendor:
              op: In
              value:
                - "8086"
      name: intel.gpu.generic.count.300
    # Same as above for class-0380 devices.
    - labelsTemplate: gpu.intel.com/device-id.0380-{{"{{"}} (index .pci.device 0).device {{"}}"}}.count={{"{{"}} len .pci.device {{"}}"}}
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0380"
            vendor:
              op: In
              value:
                - "8086"
      name: intel.gpu.generic.count.380
    # Device ID 0bda: product label Max_1100 plus a count of matched devices.
    - labels:
        gpu.intel.com/product: "Max_1100"
      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0380"
            vendor:
              op: In
              value:
                - "8086"
            device:
              op: In
              value:
                - "0bda"
      name: intel.gpu.max.1100
    # Device ID 0bd5: product label Max_1550 plus a count of matched devices.
    - labels:
        gpu.intel.com/product: "Max_1550"
      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0380"
            vendor:
              op: In
              value:
                - "8086"
            device:
              op: In
              value:
                - "0bd5"
      name: intel.gpu.max.1550
    # Family label Max_Series for any of the listed class-0380 device IDs.
    - labels:
        gpu.intel.com/family: "Max_Series"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0380"
            vendor:
              op: In
              value:
                - "8086"
            device:
              op: In
              value:
                - "0bda"
                - "0bd5"
                - "0bd9"
                - "0bdb"
                - "0bd7"
                - "0bd6"
                - "0bd0"
      name: intel.gpu.max.series
    # Device ID 56c0: family Flex_Series, product Flex_170, plus device count.
    - labels:
        gpu.intel.com/family: "Flex_Series"
        gpu.intel.com/product: "Flex_170"
      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0380"
            vendor:
              op: In
              value:
                - "8086"
            device:
              op: In
              value:
                - "56c0"
      name: intel.gpu.flex.170
    # Device ID 56c1: family Flex_Series, product Flex_140, plus device count.
    - labels:
        gpu.intel.com/family: "Flex_Series"
        gpu.intel.com/product: "Flex_140"
      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0380"
            vendor:
              op: In
              value:
                - "8086"
            device:
              op: In
              value:
                - "56c1"
      name: intel.gpu.flex.140
    # Family label A_Series for any of the listed class-0300 device IDs.
    - labels:
        gpu.intel.com/family: "A_Series"
      matchFeatures:
        - feature: pci.device
          matchExpressions:
            class:
              op: In
              value:
                - "0300"
            vendor:
              op: In
              value:
                - "8086"
            device:
              op: In
              value:
                - "56a6"
                - "56a5"
                - "56a1"
                - "56a0"
                - "5694"
                - "5693"
                - "5692"
                - "5691"
                - "5690"
                - "56b3"
                - "56b2"
                - "56a4"
                - "56a3"
                - "5697"
                - "5696"
                - "5695"
                - "56b1"
                - "56b0"
      name: intel.gpu.a.series
22 changes: 20 additions & 2 deletions charts/intel-gpu-resource-driver/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,24 @@ serviceAccount:

# Values grouped under kubeletPlugin: pod annotations, node selector,
# tolerations and affinity (presumably applied to the kubelet plugin pods;
# the consuming template is not visible here -- confirm).
kubeletPlugin:
  podAnnotations: {}
  # Restrict scheduling to nodes carrying the GPU label, which is the label
  # set by the `intel.gpu` NodeFeatureRule in templates/nfd.yaml.
  nodeSelector:
    feature.node.kubernetes.io/gpu: "true"
  tolerations:
    - key: node-role.kubernetes.io/master
      operator: Exists
      effect: NoSchedule
    - key: node-role.kubernetes.io/control-plane
      operator: Exists
      effect: NoSchedule
    # Refer to the official documentation for Node Feature Discovery (NFD)
    # regarding node tainting:
    # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
    - key: "node.kubernetes.io/gpu"
      operator: "Exists"
      effect: "NoSchedule"
  affinity: {}

# Values for the optional Node Feature Discovery subchart, keyed by the `nfd`
# dependency alias declared in Chart.yaml; `enabled` is the flag referenced
# by that dependency's `condition`.
nfd:
  enabled: false  # change to true to install NFD to the cluster
  nameOverride: intel-gpu-nfd
  enableNodeFeatureApi: true

0 comments on commit e5bd563

Please sign in to comment.