add gpu nfd rule

intel · Dec 10, 2024 · e5bd563 · e5bd563
1 parent 5f43b0d
commit e5bd563
Show file tree

Hide file tree

Showing 4 changed files with 259 additions and 2 deletions.
diff --git a/charts/intel-gpu-resource-driver/Chart.yaml b/charts/intel-gpu-resource-driver/Chart.yaml
@@ -5,3 +5,11 @@ description: A Helm chart for a Dynamic Resource Allocation (DRA) Intel GPU Reso
 type: application
 version: 0.6.0
 appVersion: "v0.6.0"
+home: https://github.com/intel/helm-charts
+
+dependencies:
+  # Optional Node Feature Discovery subchart; it is included only when the
+  # nfd.enabled value is true.
+  - name: node-feature-discovery
+    version: "0.16.6"
+    repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
+    # The subchart is addressed as "nfd", so its values live under the nfd key.
+    alias: nfd
+    condition: nfd.enabled
diff --git a/charts/intel-gpu-resource-driver/README.md b/charts/intel-gpu-resource-driver/README.md
@@ -16,7 +16,9 @@ helm repo update
 You can optionally run `helm search repo intel` to list the pulled charts.
 
 ## Install Helm Chart
+When installing from a local copy of the chart, first update its dependencies by running `helm dependency update` in the chart directory:
 ```
+helm dependency update
 helm install intel-gpu-resource-driver intel/intel-gpu-resource-driver
 ```
 ## Upgrade Chart

diff --git a/charts/intel-gpu-resource-driver/templates/nfd.yaml b/charts/intel-gpu-resource-driver/templates/nfd.yaml
@@ -0,0 +1,229 @@
+# NodeFeatureRule that marks nodes carrying an Intel GPU with the
+# feature.node.kubernetes.io/gpu label.
+apiVersion: nfd.k8s-sigs.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: intel-gpu-device-rule
+spec:
+  rules:
+    - name: intel.gpu
+      labels:
+        feature.node.kubernetes.io/gpu: "true"
+      # A PCI device from vendor 8086 with class 0300 or 0380 must be present.
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0300"
+                - "0380"
+      # In addition, at least one of the alternatives below must match: the
+      # i915 or xe kernel module is either loaded or enabled.
+      matchAny:
+        - matchFeatures:
+            - feature: kernel.loadedmodule
+              matchExpressions:
+                i915:
+                  op: Exists
+        - matchFeatures:
+            - feature: kernel.enabledmodule
+              matchExpressions:
+                i915:
+                  op: Exists
+        - matchFeatures:
+            - feature: kernel.loadedmodule
+              matchExpressions:
+                xe:
+                  op: Exists
+        - matchFeatures:
+            - feature: kernel.enabledmodule
+              matchExpressions:
+                xe:
+                  op: Exists
+---
+# NodeFeatureRule that publishes Intel GPU extended resources and the
+# device-id, product and family labels under the gpu.intel.com prefix.
+#
+# The label templates below are NFD templates. Their delimiters are written
+# as quoted string actions so that Helm emits them literally instead of
+# evaluating them while rendering this chart.
+apiVersion: nfd.k8s-sigs.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: intel-gpu-platform-labeling
+spec:
+  rules:
+    # Publish the fractional-resource local labels as extended resources.
+    # Applies only when all three local labels exist.
+    - name: intel.gpu.fractionalresources
+      extendedResources:
+        gpu.intel.com/millicores: "@local.label.gpu.intel.com/millicores"
+        gpu.intel.com/memory.max: "@local.label.gpu.intel.com/memory.max"
+        gpu.intel.com/tiles: "@local.label.gpu.intel.com/tiles"
+      matchFeatures:
+        - feature: local.label
+          matchExpressions:
+            gpu.intel.com/millicores:
+              op: Exists
+            gpu.intel.com/memory.max:
+              op: Exists
+            gpu.intel.com/tiles:
+              op: Exists
+    # Generic rule for older and upcoming devices: one "present" label per
+    # matched PCI device, keyed by its class and device id.
+    - name: intel.gpu.generic.deviceid
+      labelsTemplate: |
+        {{"{{"}} range .pci.device {{"}}"}}gpu.intel.com/device-id.{{"{{"}} .class {{"}}"}}-{{"{{"}} .device {{"}}"}}.present=true
+        {{"{{"}} end {{"}}"}}
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0300"
+                - "0380"
+    # Count label for class 0300 devices. The label name uses the device id of
+    # the first matched device; the value is the number of matched devices.
+    - name: intel.gpu.generic.count.300
+      labelsTemplate: gpu.intel.com/device-id.0300-{{"{{"}} (index .pci.device 0).device {{"}}"}}.count={{"{{"}} len .pci.device {{"}}"}}
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0300"
+    # Same as the previous rule, for class 0380 devices.
+    - name: intel.gpu.generic.count.380
+      labelsTemplate: gpu.intel.com/device-id.0380-{{"{{"}} (index .pci.device 0).device {{"}}"}}.count={{"{{"}} len .pci.device {{"}}"}}
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0380"
+    # Product label and device count for device id 0bda.
+    - name: intel.gpu.max.1100
+      labels:
+        gpu.intel.com/product: Max_1100
+      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0380"
+            device:
+              op: In
+              value:
+                - "0bda"
+    # Product label and device count for device id 0bd5.
+    - name: intel.gpu.max.1550
+      labels:
+        gpu.intel.com/product: Max_1550
+      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0380"
+            device:
+              op: In
+              value:
+                - "0bd5"
+    # Family label shared by all listed Max Series device ids.
+    - name: intel.gpu.max.series
+      labels:
+        gpu.intel.com/family: Max_Series
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0380"
+            device:
+              op: In
+              value:
+                - "0bda"
+                - "0bd5"
+                - "0bd9"
+                - "0bdb"
+                - "0bd7"
+                - "0bd6"
+                - "0bd0"
+    # Family/product labels and device count for device id 56c0.
+    - name: intel.gpu.flex.170
+      labels:
+        gpu.intel.com/family: Flex_Series
+        gpu.intel.com/product: Flex_170
+      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0380"
+            device:
+              op: In
+              value:
+                - "56c0"
+    # Family/product labels and device count for device id 56c1.
+    - name: intel.gpu.flex.140
+      labels:
+        gpu.intel.com/family: Flex_Series
+        gpu.intel.com/product: Flex_140
+      labelsTemplate: "gpu.intel.com/device.count={{"{{"}} len .pci.device {{"}}"}}"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0380"
+            device:
+              op: In
+              value:
+                - "56c1"
+    # Family label shared by all listed A Series device ids (class 0300).
+    - name: intel.gpu.a.series
+      labels:
+        gpu.intel.com/family: A_Series
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor:
+              op: In
+              value:
+                - "8086"
+            class:
+              op: In
+              value:
+                - "0300"
+            device:
+              op: In
+              value:
+                - "56a6"
+                - "56a5"
+                - "56a1"
+                - "56a0"
+                - "5694"
+                - "5693"
+                - "5692"
+                - "5691"
+                - "5690"
+                - "56b3"
+                - "56b2"
+                - "56a4"
+                - "56a3"
+                - "5697"
+                - "5696"
+                - "5695"
+                - "56b1"
+                - "56b0"
diff --git a/charts/intel-gpu-resource-driver/values.yaml b/charts/intel-gpu-resource-driver/values.yaml
@@ -19,6 +19,24 @@ serviceAccount:
 
 kubeletPlugin:
   podAnnotations: {}
-  tolerations: []
-  nodeSelector: {}
+  # Schedule the kubelet plugin only on nodes labelled as having a GPU. The
+  # intel.gpu NodeFeatureRule in templates/nfd.yaml sets this label.
+  nodeSelector:
+    feature.node.kubernetes.io/gpu: "true"
+  tolerations:
+    - key: node-role.kubernetes.io/master
+      operator: Exists
+      effect: NoSchedule
+    - key: node-role.kubernetes.io/control-plane
+      operator: Exists
+      effect: NoSchedule
+    # Tolerate a node.kubernetes.io/gpu NoSchedule taint. For node tainting
+    # with Node Feature Discovery (NFD), see the official documentation:
+    # https://nfd.sigs.k8s.io/usage/customization-guide#node-tainting
+    - key: node.kubernetes.io/gpu
+      operator: Exists
+      effect: NoSchedule
   affinity: {}
+
+# Values for the node-feature-discovery dependency, which Chart.yaml aliases
+# as "nfd".
+nfd:
+  # Change to true to install NFD to the cluster.
+  enabled: false
+  nameOverride: intel-gpu-nfd
+  enableNodeFeatureApi: true