@@ -74,13 +74,39 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
7474 # (Optional) Specifying image names is optional. The default image names shown here are used if not specified.
7575 devicePluginImage : rocm/k8s-device-plugin:latest # Change this to trigger device plugin upgrade on CR update
7676 devicePluginImagePullPolicy : IfNotPresent # Image pull policy for the device plugin. Either `Always`, `IfNotPresent` or `Never`
77+ # devicePluginImagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
78+ devicePluginTolerations :
79+ key : " key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
80+ # operator must be "Exists"; this combination means to match all values and all keys.
81+ operator : " Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal.
82+ # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
83+ value : " value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
84+ # otherwise just a regular string.
85+ effect : " NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed
86+ # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
87+ tolerationSeconds : [Expected Int value, not set by default] # Seconds represents the period of time the toleration tolerates the taint.
88+ # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this,
89+ # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
7790 nodeLabellerImage : rocm/k8s-device-plugin:labeller-latest # Change this to trigger node labeller upgrade on CR update
78- nodeLabellerImagePullPolicy : IfNotPresent # Image pull policy for the device plugin. Either `Always`, `IfNotPresent` or `Never`
79- # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
80- # you can create the docker-registry type secret by running command like:
81- # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
82- # Make sure you created the secret within the namespace that KMM operator is running
91+ nodeLabellerImagePullPolicy : IfNotPresent # Image pull policy for the node labeller. Either `Always`, `IfNotPresent` or `Never`
92+ # nodeLabellerImagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
93+ nodeLabellerTolerations :
94+ key : " key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
95+ # operator must be "Exists"; this combination means to match all values and all keys.
96+ operator : " Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal.
97+ # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
98+ value : " value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
99+ # otherwise just a regular string.
100+ effect : " NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed
101+ # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
102+ tolerationSeconds : [Expected Int value, not set by default] # Seconds represents the period of time the toleration tolerates the taint.
103+ # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this,
104+ # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
83105 imageRegistrySecret :
106+ # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
107+ # you can create the docker-registry type secret by running command like:
108+ # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
109+ # Make sure you created the secret within the namespace that KMM operator is running
84110 name : mysecret
85111 upgradePolicy :
86112 # (Optional) If no UpgradePolicy is mentioned for any of the components but their image is changed, the daemonset will
@@ -94,18 +120,82 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
94120 port : 5000 # Note: if specifying NodePort as the serviceType, use a port such as `32500`, since the port number must be between 30000-32767
95121 # (Optional) Specifying the metrics exporter image is optional. The default image name shown here is used if not specified.
96122 image : rocm/device-metrics-exporter:v1.2.0 # Change this to trigger metrics exporter upgrade on CR update
123+ imagePullPolicy : " IfNotPresent" # image pull policy for the metrics exporter container. Either `Always`, `IfNotPresent` or `Never`
124+ # imagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
125+ config :
126+ # Name of the ConfigMap that contains the metrics exporter configuration.
127+ name : gpu-config # (Optional) If the configmap does not exist the DeviceConfig will show a validation error and not start any plugin pods
128+ upgradePolicy :
129+ # (Optional) If no UpgradePolicy is mentioned for any of the components but their image is changed, the daemonset will
130+ # get upgraded according to the defaults, which is `upgradeStrategy` set to `RollingUpdate` and `maxUnavailable` set to 1.
131+ upgradeStrategy : RollingUpdate # (Optional) Can be either `RollingUpdate` or `OnDelete`
132+ maxUnavailable : 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value
133+ # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector
134+ # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
135+ tolerations :
136+ key : " key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
137+ # operator must be "Exists"; this combination means to match all values and all keys.
138+ operator : " Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal.
139+ # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
140+ value : " value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
141+ # otherwise just a regular string.
142+ effect : " NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed
143+ # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
144+ tolerationSeconds : [Expected Int value, not set by default] # Seconds represents the period of time the toleration tolerates the taint.
145+ # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this,
146+ # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
147+ imageRegistrySecret :
148+ # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
149+ # you can create the docker-registry type secret by running command like:
150+ # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
151+ # Make sure you created the secret within the namespace that KMM operator is running
152+ name : mysecret
153+ selector :
154+ feature.node.kubernetes.io/amd-gpu : " true" # You must include this again as this selector will overwrite the global selector
155+ amd.com/device-metrics-exporter : " true" # Helpful for when you want to disable the metrics exporter on specific nodes
156+ # # AMD GPU Device Test Runner Configuration ##
157+ testRunner :
158+ enable : true # false by default. Set to true to enable the Test Runner
159+ serviceType : ClusterIP # ServiceType used to expose the Metrics Exporter endpoint. Can be either `ClusterIP` or `NodePort`.
160+ port : 5000 # Note if specifying NodePort as the serviceType use `32500` as the port number must be between 30000-32767
161+ # (Optional) Specifying the test runner image is optional. The default image name shown here is used if not specified.
162+ image : docker.io/rocm/test-runner:v1.2.0-beta.0 # Change this to trigger test runner upgrade on CR update
163+ imagePullPolicy : " IfNotPresent" # image pull policy for the test runner container. Either `Always`, `IfNotPresent` or `Never`
164+ # imagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
97165 config :
98- name : exporter-configmap # Name of the ConfigMap that contains the metrics exporter configuration
166+ # Name of the configmap to customize the config for test runner. If not specified, the default test config will be applied
167+ name : test-config # (Optional) If the configmap does not exist the DeviceConfig will show a validation error and not start any plugin pods
168+ logsLocation :
169+ mountPath : " /var/log/amd-test-runner" # mount path inside test runner container for log files
170+ hostPath : " /var/log/amd-test-runner" # host path to be mounted into test runner container for log files
99171 upgradePolicy :
100172 # (Optional) If no UpgradePolicy is mentioned for any of the components but their image is changed, the daemonset will
101173 # get upgraded according to the defaults, which is `upgradeStrategy` set to `RollingUpdate` and `maxUnavailable` set to 1.
102174 upgradeStrategy : RollingUpdate # (Optional) Can be either `RollingUpdate` or `OnDelete`
103175 maxUnavailable : 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value
104176 # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector
105177 # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
178+ tolerations :
179+ key : " key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
180+ # operator must be "Exists"; this combination means to match all values and all keys.
181+ operator : " Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal.
182+ # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
183+ value : " value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
184+ # otherwise just a regular string.
185+ effect : " NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed
186+ # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
187+ tolerationSeconds : [Expected Int value, not set by default] # Seconds represents the period of time the toleration tolerates the taint.
188+ # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this,
189+ # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
190+ imageRegistrySecret :
191+ # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
192+ # you can create the docker-registry type secret by running command like:
193+ # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
194+ # Make sure you created the secret within the namespace that KMM operator is running
195+ name : mysecret
106196 selector :
107197 feature.node.kubernetes.io/amd-gpu : " true" # You must include this again as this selector will overwrite the global selector
108- amd.com/device-metrics-exporter : " true" # Helpful for when you want to disable the metrics exporter on specific nodes
198+ amd.com/device-test-runner : " true" # Helpful for when you want to disable the test runner on specific nodes
109199 selector :
110200 # Specify the nodes to be managed by this DeviceConfig Custom Resource. This will be applied to all components unless a selector
111201 # is specified in the component configuration. The node labeller will automatically find nodes with AMD GPUs and apply the label
@@ -132,9 +222,12 @@ The below is an example of the minimal DeviceConfig CR that can be used to insta
132222 metricsExporter :
133223 enable : true # To enable/disable the metrics exporter, disabled by default
134224 serviceType : " NodePort" # Node port for metrics exporter service
135- config :
136- name : exporter-configmap
137225 nodePort : 32500
226+ testRunner :
227+ enable : true
228+ logsLocation :
229+ mountPath : " /var/log/amd-test-runner" # mount path inside test runner container for logs
230+ hostPath : " /var/log/amd-test-runner" # host path to be mounted into test runner container for logs
138231 selector :
139232 feature.node.kubernetes.io/amd-gpu : " true"
140233
0 commit comments