Updated GPU Operator docs version to 1.2.0 (#61)

Farshad Ghodsian · Farshad Ghodsian · commit aa26891f71f8 · 2025-02-27T21:15:22.000-05:00
diff --git a/README.md b/README.md
@@ -1,6 +1,8 @@
 # AMD GPU Operator
 
-:book: GPU Operator Documentation Site: https://instinct.docs.amd.com/projects/gpu-operator
+## :book: GPU Operator Documentation Site
+
+For the most detailed and up-to-date documentation, please visit our Instinct Documentation site: [https://instinct.docs.amd.com/projects/gpu-operator](https://instinct.docs.amd.com/projects/gpu-operator)
 
 ## Introduction
 
@@ -53,12 +55,13 @@ helm install cert-manager jetstack/cert-manager \
 ### 1. Add the AMD Helm Repository
 
 ```bash
-helm install amd-gpu-operator --namespace kube-amd-gpu --create-namespace https://github.com/ROCm/gpu-operator/releases/download/v1.0.0/gpu-operator-charts-v1.0.0.tgz
+helm repo add rocm https://rocm.github.io/gpu-operator
+helm repo update
 ```
 
 ### 2. Install the Operator
 
-Basic installation:
+#### Basic installation
 
 ```bash
 helm install amd-gpu-operator rocm/gpu-operator-charts \
@@ -67,19 +70,17 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \
   --version=v1.2.0
 ```
 
-```{note}
-Installation Options
-  - Skip NFD installation: `--set node-feature-discovery.enabled=false`
-  - Skip KMM installation: `--set kmm.enabled=false`
-```
+#### Installation Options
 
-```{warning}
-  It is strongly recommended to use AMD-optimized KMM images included in the operator release.
-```
+* Skip NFD installation: `--set node-feature-discovery.enabled=false`
+* Skip KMM installation: `--set kmm.enabled=false`
+
+> [!WARNING]
+> It is strongly recommended to use AMD-optimized KMM images included in the operator release. This is not required when installing the GPU Operator on Red Hat OpenShift.
 
 ### 3. Install Custom Resource
 
-After the installation of AMD GPU Operator, you need to create the `DeviceConfig` custom resource in order to trigger the operator to start to work. By preparing the `DeviceConfig` in the YAML file, you can create the resouce by running ```kubectl apply -f deviceconfigs.yaml```. For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#install-custom-resource).
+After the installation of AMD GPU Operator, you need to create the `DeviceConfig` custom resource in order to trigger the operator to start to work. By preparing the `DeviceConfig` in a YAML file, you can create the resource by running ```kubectl apply -f deviceconfigs.yaml```. For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#install-custom-resource).
 
 ### Grafana Dashboards
 
diff --git a/docs/_static/amd-gpu-operator-diagram.png b/docs/_static/amd-gpu-operator-diagram.png
diff --git a/docs/conf.py b/docs/conf.py
@@ -6,7 +6,7 @@
 external_projects_current_project = "amd-gpu-operator"
 
 project = "AMD Instinct Documentation"
-version = "1.1.0"
+version = "1.2.0"
 release = version
 html_title = f"AMD GPU Operator {version}"
 author = "Advanced Micro Devices, Inc."
diff --git a/docs/fulldeviceconfig.rst b/docs/fulldeviceconfig.rst
@@ -74,13 +74,39 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
         # (Optional) Specifying image names are optional. Default image names for shown here if not specified.
         devicePluginImage: rocm/k8s-device-plugin:latest # Change this to trigger metrics exporter upgrade on CR update
         devicePluginImagePullPolicy: IfNotPresent # Image pull policy for the device plugin. Either `Always`, `IfNotPresent` or `Never`
+        # devicePluginImagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
+        devicePluginTolerations:
+          key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
+          # operator must be "Exists"; this combination means to match all values and all keys.
+          operator: "Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal. 
+          # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
+          value: "value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
+          # otherwise just a regular string.
+          effect: "NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed 
+          # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
+          tolerationSeconds: [Expected Int value, not set by default] #Seconds represents the period of time the toleration tolerates the taint. 
+          # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this, 
+          # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
         nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest # Change this to trigger metrics exporter upgrade on CR update
-        nodeLabellerImagePullPolicy: IfNotPresent # Image pull policy for the device plugin. Either `Always`, `IfNotPresent` or `Never`
-        # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
-        # you can create the docker-registry type secret by running command like:
-        # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
-        # Make sure you created the secret within the namespace that KMM operator is running
+        nodeLabellerImagePullPolicy: IfNotPresent # Image pull policy for the node labeller. Either `Always`, `IfNotPresent` or `Never`
+        # nodeLabellerImagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
+        nodeLabellerTolerations:
+          key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
+          # operator must be "Exists"; this combination means to match all values and all keys.
+          operator: "Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal. 
+          # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
+          value: "value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
+          # otherwise just a regular string.
+          effect: "NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed 
+          # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
+          tolerationSeconds: [Expected Int value, not set by default] #Seconds represents the period of time the toleration tolerates the taint. 
+          # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this, 
+          # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
         imageRegistrySecret:
+          # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
+          # you can create the docker-registry type secret by running command like:
+          # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
+          # Make sure you created the secret within the namespace that KMM operator is running
           name: mysecret
         upgradePolicy:
           #(Optional) If no UpgradePolicy is mentioned for any of the components but their image is changed, the daemonset will
@@ -94,18 +120,82 @@ Below is an example of a full DeviceConfig CR that can be used to install the AM
         port: 5000 # Note if specifying NodePort as the serviceType use `32500` as the port number must be between 30000-32767
         # (Optional) Specifying metrics exporter image is optional. Default imagename shown here if not specified.
         image: rocm/device-metrics-exporter:v1.2.0 # Change this to trigger metrics exporter upgrade on CR update
+        imagePullPolicy: "IfNotPresent" # image pull policy for the metrics exporter container. Either `Always`, `IfNotPresent` or `Never`
+        # imagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
+        config:
+          # Name of the ConfigMap that contains the metrics exporter configuration.
+          name: gpu-config # (Optional) If the configmap does not exist the DeviceConfig will show a validation error and not start any plugin pods
+        upgradePolicy:
+          #(Optional) If no UpgradePolicy is mentioned for any of the components but their image is changed, the daemonset will
+          # get upgraded according to the defaults, which is `upgradeStrategy` set to `RollingUpdate` and `maxUnavailable` set to 1.
+          upgradeStrategy: RollingUpdate # (Optional) Can be either `RollingUpdate` or `OnDelete`
+          maxUnavailable: 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value
+        # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector
+        # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
+        tolerations:
+          key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
+          # operator must be "Exists"; this combination means to match all values and all keys.
+          operator: "Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal. 
+          # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
+          value: "value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
+          # otherwise just a regular string.
+          effect: "NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed 
+          # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
+          tolerationSeconds: [Expected Int value, not set by default] #Seconds represents the period of time the toleration tolerates the taint. 
+          # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this, 
+          # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
+        imageRegistrySecret:
+          # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
+          # you can create the docker-registry type secret by running command like:
+          # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
+          # Make sure you created the secret within the namespace that KMM operator is running
+          name: mysecret
+        selector:   
+          feature.node.kubernetes.io/amd-gpu: "true" # You must include this again as this selector will overwrite the global selector
+          amd.com/device-metrics-exporter: "true" # Helpful for when you want to disable the metrics exporter on specific nodes
+      ## AMD GPU Device Test Runner Configuration ##
+      testRunner: 
+        enable: true # false by Default. Set to true to enable the Test Runner
+        serviceType: ClusterIP # ServiceType used to expose the Metrics Exporter endpoint. Can be either `ClusterIp` or `NodePort`.
+        port: 5000 # Note if specifying NodePort as the serviceType use `32500` as the port number must be between 30000-32767
+        # (Optional) Specifying test runner image is optional. Default imagename shown here if not specified.
+        image: docker.io/rocm/test-runner:v1.2.0-beta.0 # Change this to trigger test runner upgrade on CR update
+        imagePullPolicy: "IfNotPresent" # image pull policy for the test runner container. Either `Always`, `IfNotPresent` or `Never`
+        # imagePullPolicy default value is "IfNotPresent" for valid tags, "Always" for no tag or "latest" tag
         config:
-          name: exporter-configmap # Name of the ConfigMap that contains the metrics exporter configuration
+          # Name of the configmap to customize the config for test runner. If not specified, the default test config will be applied
+          name: test-config # (Optional) If the configmap does not exist the DeviceConfig will show a validation error and not start any plugin pods
+        logsLocation:
+          mountPath: "/var/log/amd-test-runner" # mount path inside test runner container for log files
+          hostPath: "/var/log/amd-test-runner" # host path to be mounted into test runner container for log files
         upgradePolicy:
           #(Optional) If no UpgradePolicy is mentioned for any of the components but their image is changed, the daemonset will
           # get upgraded according to the defaults, which is `upgradeStrategy` set to `RollingUpdate` and `maxUnavailable` set to 1.
           upgradeStrategy: RollingUpdate, # (Optional) Can be either `RollingUpdate` or `OnDelete`
           maxUnavailable: 1 # (Optional) Number of pods that can be unavailable during the upgrade process. 1 is the default value
         # If specifying a node selector here, the metrics exporter will only be deployed on nodes that match the selector
         # See Item #6 on https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/knownlimitations.html for example usage
+        tolerations:
+          key: "key1" # Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty,
+          # operator must be "Exists"; this combination means to match all values and all keys.
+          operator: "Equal" # Operator represents a key's relationship to the value. Valid operators are Exists and Equal. 
+          # Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category.
+          value: "value1" # Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty,
+          # otherwise just a regular string.
+          effect: "NoSchedule" # Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed 
+          # values are "NoSchedule", "PreferNoSchedule" and "NoExecute".
+          tolerationSeconds: [Expected Int value, not set by default] #Seconds represents the period of time the toleration tolerates the taint. 
+          # By default, it is not set, which means tolerate the taint forever (do not evict). Effect needs to be NoExecute for this, 
+          # otherwise this field is ignored. Zero and negative values will be treated as 0 (evict immediately) by the system.
+        imageRegistrySecret:
+          # (Optional) Specify the credential for your private registry if it requires credential to get pull/push access
+          # you can create the docker-registry type secret by running command like:
+          # kubectl create secret docker-registry mysecret -n kmm-namespace --docker-username=xxx --docker-password=xxx
+          # Make sure you created the secret within the namespace that KMM operator is running
+          name: mysecret
         selector:   
           feature.node.kubernetes.io/amd-gpu: "true" # You must include this again as this selector will overwrite the global selector
-          amd.com/device-metrics-exporter: "true" # Helpful for when you want to disable the metrics exporter on specific nodes 
+          amd.com/device-test-runner: "true" # Helpful for when you want to disable the test runner on specific nodes 
       selector: 
       # Specify the nodes to be managed by this DeviceConfig Custom Resource.  This will be applied to all components unless a selector 
       # is specified in the component configuration. The node labeller will automatically find nodes with AMD GPUs and apply the label 
@@ -132,9 +222,12 @@ The below is an example of the minimal DeviceConfig CR that can be used to insta
     metricsExporter:
       enable: true # To enable/disable the metrics exporter, disabled by default
       serviceType: "NodePort" # Node port for metrics exporter service
-      config:
-        name: exporter-configmap
       nodePort: 32500
+    testRunner:
+      enable: true
+      logsLocation:
+        mountPath: "/var/log/amd-test-runner" # mount path inside test runner container for logs
+        hostPath: "/var/log/amd-test-runner" # host path to be mounted into test runner container for logs
     selector:
       feature.node.kubernetes.io/amd-gpu: "true"
 
diff --git a/docs/installation/openshift-helm.md b/docs/installation/openshift-helm.md
@@ -1,7 +1,7 @@
 # OpenShift (Helm)
 
 ```{warning}
-Installing via Helm is not a recommended method for Red Hat OpenShift. Users wishing to use the AMD GPU with OpenShift should consider using the OLM method instead.
+Installing via Helm is not a recommended method for Red Hat OpenShift. Users wishing to use the AMD GPU with OpenShift should consider using the OLM method instead. This method is not supported in the latest version. As a result, this page will be removed in the next release.
 ```
 
 This guide walks through installing the AMD GPU Operator on an OpenShift cluster using Helm.
diff --git a/example/deviceconfig_example.yaml b/example/deviceconfig_example.yaml
@@ -84,9 +84,9 @@ spec:
     #  name: exporterimagesecret
 
     # metrics config in configmap
-    config:
+    # config:
       # configmap name, example config in example/metricsExporter/config.json
-      name: configmap-name
+    #  name: gpu-config
 
   # Specify the testrunner config
   testRunner: