project-codeflare · dgrove-oss · Aug 20, 2024 · Aug 20, 2024
diff --git a/setup.k8s-v1.25/CLUSTER-SETUP.md b/setup.k8s-v1.25/CLUSTER-SETUP.md
@@ -98,3 +98,46 @@ Create `mlbatch-edit` role:
 ```sh
 kubectl apply -f setup.k8s-v1.25/mlbatch-edit-role.yaml
 ```
+
+## Slack Cluster Queue
+
+Create the designated slack `ClusterQueue` which will be used to automate
+minor adjustments to cluster capacity caused by node failures and
+scheduler maintenance.
+```sh
+kubectl apply -f- << EOF
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: slack-cluster-queue
+spec:
+  namespaceSelector: {}
+  cohort: default-cohort
+  preemption:
+    withinClusterQueue: LowerOrNewerEqualPriority
+    reclaimWithinCohort: Any
+    borrowWithinCohort:
+      policy: Never
+  resourceGroups:
+  - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"]
+    flavors:
+    - name: default-flavor
+      resources:
+      - name: "cpu"
+        nominalQuota: 8000m
+      - name: "memory"
+        nominalQuota: 128Gi
+      - name: "nvidia.com/gpu"
+        nominalQuota: 8
+      - name: "nvidia.com/roce_gdr"
+        nominalQuota: 1
+      - name: "pods"
+        nominalQuota: 100
+EOF
+```
+Edit the above quantities to adjust the quota to the desired
+values. Pod counts are optional and can be omitted from the list of
+covered resources.  The `lendingLimit` for each resource will be
+dynamically adjusted by the MLBatch system to reflect reduced cluster
+capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a
+detailed discussion of the role of the slack `ClusterQueue`.
diff --git a/setup.k8s-v1.25/appwrapper/config_patch.yaml b/setup.k8s-v1.25/appwrapper/config_patch.yaml
@@ -13,6 +13,7 @@ data:
           enable: false
       defaultQueueName: default-queue
       schedulerName: scheduler-plugins-scheduler
+      slackQueueName: slack-cluster-queue
       userRBACAdmissionCheck: false
     controllerManager:
       health:

diff --git a/setup.k8s-v1.30/CLUSTER-SETUP.md b/setup.k8s-v1.30/CLUSTER-SETUP.md
@@ -104,3 +104,46 @@ will have local queue names and thus be subject to Kueue's quota management.
 ```sh
 kubectl apply -f setup.k8s-v1.30/admission-policy.yaml
 ```
+
+## Slack Cluster Queue
+
+Create the designated slack `ClusterQueue` which will be used to automate
+minor adjustments to cluster capacity caused by node failures and
+scheduler maintenance.
+```sh
+kubectl apply -f- << EOF
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: slack-cluster-queue
+spec:
+  namespaceSelector: {}
+  cohort: default-cohort
+  preemption:
+    withinClusterQueue: LowerOrNewerEqualPriority
+    reclaimWithinCohort: Any
+    borrowWithinCohort:
+      policy: Never
+  resourceGroups:
+  - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"]
+    flavors:
+    - name: default-flavor
+      resources:
+      - name: "cpu"
+        nominalQuota: 8000m
+      - name: "memory"
+        nominalQuota: 128Gi
+      - name: "nvidia.com/gpu"
+        nominalQuota: 8
+      - name: "nvidia.com/roce_gdr"
+        nominalQuota: 1
+      - name: "pods"
+        nominalQuota: 100
+EOF
+```
+Edit the above quantities to adjust the quota to the desired
+values. Pod counts are optional and can be omitted from the list of
+covered resources.  The `lendingLimit` for each resource will be
+dynamically adjusted by the MLBatch system to reflect reduced cluster
+capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a
+detailed discussion of the role of the slack `ClusterQueue`.
diff --git a/setup.k8s-v1.30/appwrapper/config_patch.yaml b/setup.k8s-v1.30/appwrapper/config_patch.yaml
@@ -13,6 +13,7 @@ data:
           enable: false
       defaultQueueName: default-queue
       schedulerName: scheduler-plugins-scheduler
+      slackQueueName: slack-cluster-queue
       userRBACAdmissionCheck: false
     controllerManager:
       health:

diff --git a/setup.tmpl/CLUSTER-SETUP.md.tmpl b/setup.tmpl/CLUSTER-SETUP.md.tmpl
@@ -196,3 +196,49 @@ will have local queue names and thus be subject to Kueue's quota management.
 {{ .KUBECTL }} apply -f setup.{{ .VERSION }}/admission-policy.yaml
 ```
 {{- end }}
+
+{{- if .SLACKCQ }}
+
+## Slack Cluster Queue
+
+Create the designated slack `ClusterQueue` which will be used to automate
+minor adjustments to cluster capacity caused by node failures and
+scheduler maintenance.
+```sh
+{{ .KUBECTL }} apply -f- << EOF
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: slack-cluster-queue
+spec:
+  namespaceSelector: {}
+  cohort: default-cohort
+  preemption:
+    withinClusterQueue: LowerOrNewerEqualPriority
+    reclaimWithinCohort: Any
+    borrowWithinCohort:
+      policy: Never
+  resourceGroups:
+  - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"]
+    flavors:
+    - name: default-flavor
+      resources:
+      - name: "cpu"
+        nominalQuota: 8000m
+      - name: "memory"
+        nominalQuota: 128Gi
+      - name: "nvidia.com/gpu"
+        nominalQuota: 8
+      - name: "nvidia.com/roce_gdr"
+        nominalQuota: 1
+      - name: "pods"
+        nominalQuota: 100
+EOF
+```
+Edit the above quantities to adjust the quota to the desired
+values. Pod counts are optional and can be omitted from the list of
+covered resources.  The `lendingLimit` for each resource will be
+dynamically adjusted by the MLBatch system to reflect reduced cluster
+capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a
+detailed discussion of the role of the slack `ClusterQueue`.
+{{- end }}
diff --git a/setup.tmpl/Kubernetes-v1.25.yaml b/setup.tmpl/Kubernetes-v1.25.yaml
@@ -4,3 +4,4 @@ OPENSHIFT: false
 VERSION: k8s-v1.25
 KUBECTL: kubectl
 VAP: false
+SLACKCQ: true
diff --git a/setup.tmpl/Kubernetes-v1.30.yaml b/setup.tmpl/Kubernetes-v1.30.yaml
@@ -4,3 +4,4 @@ OPENSHIFT: false
 VERSION: k8s-v1.30
 KUBECTL: kubectl
 VAP: true
+SLACKCQ: true
diff --git a/setup.tmpl/RHOAI-v2.10.yaml b/setup.tmpl/RHOAI-v2.10.yaml
@@ -2,4 +2,5 @@
 
 OPENSHIFT: true
 VERSION: RHOAI-v2.10
-KUBECTL: oc
+KUBECTL: oc
+SLACKCQ: false
diff --git a/setup.tmpl/RHOAI-v2.11.yaml b/setup.tmpl/RHOAI-v2.11.yaml
@@ -3,5 +3,4 @@
 OPENSHIFT: true
 VERSION: RHOAI-v2.11
 KUBECTL: oc
-
-
+SLACKCQ: false