From 6f9ad1d5a497ef18c062d52d339ca2244f02a430 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Thu, 5 Dec 2024 21:25:41 +0530 Subject: [PATCH] Update fake-toolkit-ready and disable all fractional gpu config by default (#853) * Update fake-toolkit-ready.yaml * Disable fractional gpus config by default * Update README.md with readme-generator-for-helm Signed-off-by: chiragjn * Update fake-toolkit-ready.yaml --------- Signed-off-by: chiragjn Co-authored-by: chiragjn --- charts/tfy-gpu-operator/Chart.yaml | 2 +- charts/tfy-gpu-operator/README.md | 6 ++- .../templates/fake-toolkit-ready.yaml | 3 +- charts/tfy-gpu-operator/values.yaml | 49 ++++++++++++++----- 4 files changed, 44 insertions(+), 16 deletions(-) diff --git a/charts/tfy-gpu-operator/Chart.yaml b/charts/tfy-gpu-operator/Chart.yaml index 36056ab94..ca7dd3ea8 100644 --- a/charts/tfy-gpu-operator/Chart.yaml +++ b/charts/tfy-gpu-operator/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: tfy-gpu-operator -version: 0.1.23 +version: 0.1.24 description: "Truefoundry GPU Operator" maintainers: - name: truefoundry diff --git a/charts/tfy-gpu-operator/README.md b/charts/tfy-gpu-operator/README.md index 4bb40ee07..cd5855f20 100644 --- a/charts/tfy-gpu-operator/README.md +++ b/charts/tfy-gpu-operator/README.md @@ -55,6 +55,7 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management | `aws-eks-gpu-operator.dcgmExporter.resources.limits.cpu` | CPU limit for the DCGM Exporter. | `100m` | | `aws-eks-gpu-operator.dcgmExporter.resources.limits.memory` | Memory limit for the DCGM Exporter. | `1000Mi` | | `aws-eks-gpu-operator.dcgmExporter.args` | Arguments for the DCGM Exporter. | `["-c","5000"]` | +| `aws-eks-gpu-operator.mig.strategy` | migStrategy for mig node, single or mixed | `none` | ### gcp-gke-standard-driver Configuration for the GKE Standard Nvidia Drivers. This section will only be used when clusterType.gcpGkeStandard is set to true. @@ -123,7 +124,7 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management | `azure-aks-gpu-operator.driver.enabled` | Enable/Disable driver installation. | `false` | | `azure-aks-gpu-operator.toolkit.enabled` | Enable/Disable nvidia container toolkit installation. | `true` | | `azure-aks-gpu-operator.toolkit.version` | Version of the toolkit. Note for Aure Linux change `-ubuntu20.04` to `-ubi8`. However at the time of writing Azure Linux only supports V100 and T4 GPUs | `v1.17.2-ubuntu20.04` | -| `azure-aks-gpu-operator.mig.strategy` | migStrategy for mig node, single or mixed | `mixed` | +| `azure-aks-gpu-operator.mig.strategy` | migStrategy for mig node, single or mixed | `none` | | `azure-aks-gpu-operator.devicePlugin.enabled` | Enable/Disable nvidia device plugin installation. | `true` | | `azure-aks-gpu-operator.dcgm.enabled` | Enabled/Disable standalone DCGM. | `false` | | `azure-aks-gpu-operator.dcgm.version` | Image tag for DCGM container. Find all image tags at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cloud-native/containers/dcgm/tags | `3.3.8-1-ubuntu22.04` | @@ -180,6 +181,7 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management | `civo-talos-gpu-operator.dcgmExporter.resources.limits.cpu` | CPU limit for the DCGM Exporter. | `100m` | | `civo-talos-gpu-operator.dcgmExporter.resources.limits.memory` | Memory limit for the DCGM Exporter. | `1000Mi` | | `civo-talos-gpu-operator.dcgmExporter.args` | Arguments for the DCGM Exporter. | `["-c","5000"]` | +| `civo-talos-gpu-operator.mig.strategy` | migStrategy for mig node, single or mixed | `none` | ### generic-gpu-operator Configuration for the GPU Operator. This section will only be used when clusterType.generic is set to true. @@ -209,7 +211,6 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management | `generic-gpu-operator.driver.enabled` | Enable/Disable driver installation. | `true` | | `generic-gpu-operator.toolkit.enabled` | Enable/Disable nvidia container toolkit installation. | `true` | | `generic-gpu-operator.toolkit.version` | Version of the toolkit. | `v1.17.2-ubuntu20.04` | -| `generic-gpu-operator.mig.strategy` | migStrategy for mig node, single or mixed | `mixed` | | `generic-gpu-operator.devicePlugin.enabled` | Enable/Disable nvidia device plugin installation. | `true` | | `generic-gpu-operator.dcgm.enabled` | Enabled/Disable standalone DCGM. | `false` | | `generic-gpu-operator.dcgm.version` | Image tag for DCGM container. Find all image tags at https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cloud-native/containers/dcgm/tags | `3.3.8-1-ubuntu22.04` | @@ -225,3 +226,4 @@ Tfy-gpu-operator is a Helm chart that facilitates the deployment and management | `generic-gpu-operator.dcgmExporter.resources.limits.cpu` | CPU limit for the DCGM Exporter. | `100m` | | `generic-gpu-operator.dcgmExporter.resources.limits.memory` | Memory limit for the DCGM Exporter. | `1000Mi` | | `generic-gpu-operator.dcgmExporter.args` | Arguments for the DCGM Exporter. | `["-c","5000"]` | +| `generic-gpu-operator.mig.strategy` | migStrategy for mig node, single or mixed | `none` | diff --git a/charts/tfy-gpu-operator/templates/fake-toolkit-ready.yaml b/charts/tfy-gpu-operator/templates/fake-toolkit-ready.yaml index 348728960..ad0ca249e 100644 --- a/charts/tfy-gpu-operator/templates/fake-toolkit-ready.yaml +++ b/charts/tfy-gpu-operator/templates/fake-toolkit-ready.yaml @@ -28,6 +28,7 @@ spec: - |- set -ex; touch /run/nvidia/validations/host-driver-ready; + touch /run/nvidia/validations/driver-ready; touch /run/nvidia/validations/toolkit-ready; sleep infinity; resources: @@ -131,4 +132,4 @@ spec: maxUnavailable: 1 maxSurge: 0 revisionHistoryLimit: 1 -{{- end }} \ No newline at end of file +{{- end }} diff --git a/charts/tfy-gpu-operator/values.yaml b/charts/tfy-gpu-operator/values.yaml index 0f21a563a..fe262386d 100644 --- a/charts/tfy-gpu-operator/values.yaml +++ b/charts/tfy-gpu-operator/values.yaml @@ -93,8 +93,8 @@ aws-eks-gpu-operator: - name: nvidia.com/gpu replicas: 10 name: time-slicing-config - create: true - default: all + create: false + default: '' ## Node Feature Discovery configuration. node-feature-discovery: @@ -336,6 +336,11 @@ aws-eks-gpu-operator: ## @param aws-eks-gpu-operator.dcgmExporter.args Arguments for the DCGM Exporter. args: ["-c", "5000"] + ## MIG Configuration + mig: + ## @param aws-eks-gpu-operator.mig.strategy migStrategy for mig node, single or mixed + strategy: none + ## MIG Manager configuration. migManager: ## @skip aws-eks-gpu-operator.migManager.enabled @@ -709,7 +714,7 @@ azure-aks-gpu-operator: ## MIG Configuration mig: ## @param azure-aks-gpu-operator.mig.strategy migStrategy for mig node, single or mixed - strategy: mixed + strategy: none ## Device Plugin configuration. devicePlugin: @@ -736,8 +741,8 @@ azure-aks-gpu-operator: - name: nvidia.com/gpu replicas: 10 name: time-slicing-config - create: true - default: all + create: false + default: '' ## DCGM configuration dcgm: @@ -963,6 +968,21 @@ civo-talos-gpu-operator: value: volume-mounts - name: DEVICE_ID_STRATEGY value: index + ## @skip civo-talos-gpu-operator.devicePlugin.config + config: + data: + all: '' + time-sliced-10: |- + version: v1 + sharing: + timeSlicing: + renameByDefault: true + resources: + - name: nvidia.com/gpu + replicas: 10 + name: time-slicing-config + create: false + default: '' ## DCGM configuration dcgm: @@ -1029,6 +1049,11 @@ civo-talos-gpu-operator: ## @param civo-talos-gpu-operator.dcgmExporter.args Arguments for the DCGM Exporter. args: ["-c", "5000"] + ## MIG Configuration + mig: + ## @param civo-talos-gpu-operator.mig.strategy migStrategy for mig node, single or mixed + strategy: none + ## MIG Manager configuration. migManager: ## @skip civo-talos-gpu-operator.migManager.enabled @@ -1193,11 +1218,6 @@ generic-gpu-operator: - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS value: 'true' - ## MIG Configuration - mig: - ## @param generic-gpu-operator.mig.strategy migStrategy for mig node, single or mixed - strategy: mixed - ## Device Plugin configuration. devicePlugin: ## @param generic-gpu-operator.devicePlugin.enabled Enable/Disable nvidia device plugin installation. @@ -1223,8 +1243,8 @@ generic-gpu-operator: - name: nvidia.com/gpu replicas: 10 name: time-slicing-config - create: true - default: all + create: false + default: '' ## DCGM configuration dcgm: @@ -1291,6 +1311,11 @@ generic-gpu-operator: ## @param generic-gpu-operator.dcgmExporter.args Arguments for the DCGM Exporter. args: ["-c", "5000"] + ## MIG Configuration + mig: + ## @param generic-gpu-operator.mig.strategy migStrategy for mig node, single or mixed + strategy: none + ## MIG Manager configuration. migManager: ## @skip generic-gpu-operator.migManager.enabled