From 14ff5d88bca2532fdf40db5de0e540f547a73173 Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 16 Sep 2024 11:27:46 -0400 Subject: [PATCH 1/4] add instructions for RHOAI 2.13 --- SETUP.md | 6 + setup.RHOAI-v2.12/UPGRADE.md | 4 +- setup.RHOAI-v2.13/CLUSTER-SETUP.md | 160 ++++++++++ setup.RHOAI-v2.13/TEAM-SETUP.md | 91 ++++++ setup.RHOAI-v2.13/UNINSTALL.md | 23 ++ setup.RHOAI-v2.13/UPGRADE-FAST.md | 31 ++ setup.RHOAI-v2.13/UPGRADE.md | 33 ++ .../coscheduler-priority-patch.yaml | 3 + setup.RHOAI-v2.13/default-flavor.yaml | 4 + setup.RHOAI-v2.13/mlbatch-dsc.yaml | 32 ++ setup.RHOAI-v2.13/mlbatch-dsci.yaml | 14 + setup.RHOAI-v2.13/mlbatch-edit-role.yaml | 121 +++++++ setup.RHOAI-v2.13/mlbatch-priorities.yaml | 26 ++ setup.RHOAI-v2.13/mlbatch-subscription.yaml | 300 ++++++++++++++++++ .../mlbatch-upgrade-configmaps.yaml | 115 +++++++ setup.tmpl/Makefile | 2 + setup.tmpl/RHOAI-v2.13.yaml | 6 + 17 files changed, 969 insertions(+), 2 deletions(-) create mode 100644 setup.RHOAI-v2.13/CLUSTER-SETUP.md create mode 100644 setup.RHOAI-v2.13/TEAM-SETUP.md create mode 100644 setup.RHOAI-v2.13/UNINSTALL.md create mode 100644 setup.RHOAI-v2.13/UPGRADE-FAST.md create mode 100644 setup.RHOAI-v2.13/UPGRADE.md create mode 100644 setup.RHOAI-v2.13/coscheduler-priority-patch.yaml create mode 100644 setup.RHOAI-v2.13/default-flavor.yaml create mode 100644 setup.RHOAI-v2.13/mlbatch-dsc.yaml create mode 100644 setup.RHOAI-v2.13/mlbatch-dsci.yaml create mode 100644 setup.RHOAI-v2.13/mlbatch-edit-role.yaml create mode 100644 setup.RHOAI-v2.13/mlbatch-priorities.yaml create mode 100644 setup.RHOAI-v2.13/mlbatch-subscription.yaml create mode 100644 setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml create mode 100644 setup.tmpl/RHOAI-v2.13.yaml diff --git a/SETUP.md b/SETUP.md index d21457c..fc2c4f4 100644 --- a/SETUP.md +++ b/SETUP.md @@ -29,6 +29,12 @@ Instructions are provided for the following OpenShift AI ***stable*** releases: + [RHOAI 2.10 Cluster Setup](./setup.RHOAI-v2.10/CLUSTER-SETUP.md) + [RHOAI 2.10 Team Setup](./setup.RHOAI-v2.10/TEAM-SETUP.md) + [RHOAI 2.10 Uninstall](./setup.RHOAI-v2.10/UNINSTALL.md) ++ OpenShift AI 2.13 + + [RHOAI 2.13 Cluster Setup](./setup.RHOAI-v2.13/CLUSTER-SETUP.md) + + [RHOAI 2.13 Team Setup](./setup.RHOAI-v2.13/TEAM-SETUP.md) + + [UPGRADING from RHOAI 2.10](./setup.RHOAI-v2.12/UPGRADE-STABLE.md) + + [UPGRADING from RHOAI 2.12](./setup.RHOAI-v2.12/UPGRADE-FAST.md) + + [RHOAI 2.13 Uninstall](./setup.RHOAI-v2.13/UNINSTALL.md) Instructions are provided for the following OpenShift AI ***fast*** releases: + OpenShift AI 2.11 diff --git a/setup.RHOAI-v2.12/UPGRADE.md b/setup.RHOAI-v2.12/UPGRADE.md index 31184d9..cb950d7 100644 --- a/setup.RHOAI-v2.12/UPGRADE.md +++ b/setup.RHOAI-v2.12/UPGRADE.md @@ -28,10 +28,10 @@ oc apply -f setup.RHOAI-v2.12/mlbatch-upgrade-configmaps.yaml Second, approve the install plan replacing the example plan name below with the actual value on your cluster: ```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-st8vh +oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-xs6gq ``` -Apply this patch: +Third, apply this patch: ```sh oc apply -f setup.RHOAI-v2.12/mlbatch-rbac-fix.yaml ``` diff --git a/setup.RHOAI-v2.13/CLUSTER-SETUP.md b/setup.RHOAI-v2.13/CLUSTER-SETUP.md new file mode 100644 index 0000000..33f73c2 --- /dev/null +++ b/setup.RHOAI-v2.13/CLUSTER-SETUP.md @@ -0,0 +1,160 @@ +# Cluster Setup + +The cluster setup installs OpenShift AI and Coscheduler, configures 
Kueue, +cluster roles, and priority classes. + +If MLBatch is deployed on a cluster that used to run earlier versions of ODH, +[MCAD](https://github.com/project-codeflare/mcad), OpenShift AI, or Coscheduler, +make sure to scrub traces of these installations. In particular, make sure to +delete the following custom resource definitions (CRD) if present on the +cluster. Make sure to delete all instances prior to deleting the CRDs: +```sh +# Delete old appwrappers and crd +oc delete appwrappers --all -A +oc delete crd appwrappers.workload.codeflare.dev + +# Delete old noderesourcetopologies and crd +oc delete noderesourcetopologies --all -A +oc delete crd noderesourcetopologies.topology.node.k8s.io +``` + +## Priorities + +Create `default-priority`, `high-priority`, and `low-priority` priority classes: +```sh +oc apply -f setup.RHOAI-v2.13/mlbatch-priorities.yaml +``` + +## Coscheduler + +Install Coscheduler v0.28.9 as a secondary scheduler and configure packing: +```sh +helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ + scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ + --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"}]' +``` +Patch Coscheduler pod priorities: +```sh +oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.13/coscheduler-priority-patch.yaml scheduler-plugins-controller +oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.13/coscheduler-priority-patch.yaml scheduler-plugins-scheduler +``` + +## OpenShift AI + +Create the OpenShift AI subscription: +```sh +oc apply -f setup.RHOAI-v2.13/mlbatch-subscription.yaml +```` +Identify install plan: +```sh +oc get ip -n redhat-ods-operator +``` +``` +NAMESPACE NAME CSV APPROVAL APPROVED +redhat-ods-operator install-kmh8w rhods-operator.2.10.0 Manual false +``` +Approve install plan replacing the generated plan name below with the actual +value: +```sh +oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kmh8w +``` +Create DSC Initialization: +```sh +oc apply -f setup.RHOAI-v2.13/mlbatch-dsci.yaml +``` +Create Data Science Cluster: +```sh +oc apply -f setup.RHOAI-v2.13/mlbatch-dsc.yaml +``` +The provided DSCI and DSC are intended to install a minimal set of OpenShift +AI managed components: `codeflare`, `kueue`, `ray`, and `trainingoperator`. The +remaining components such as `dashboard` can be optionally enabled. + +The configuration of the managed components differs from the default OpenShift +AI configuration as follows: +- Kubeflow Training Operator: + - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, +- Kueue: + - `manageJobsWithoutQueueName` is enabled, + - `batch/job` integration is disabled, + - `waitForPodsReady` is disabled, + - `LendingLimit` feature gate is enabled, + - `enableClusterQueueResources` metrics is enabled, +- Codeflare operator: + - the AppWrapper controller is enabled and configured as follows: + - `userRBACAdmissionCheck` is disabled, + - `schedulerName` is set to `scheduler-plugins-scheduler`, + - `queueName` is set to `default-queue`, +- pod priorities, resource requests and limits have been adjusted. 
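+
+To spot-check that these overrides took effect once the Data Science Cluster is
+ready, you can inspect the CodeFlare operator ConfigMap and the Kueue manager
+arguments (a quick sketch; the resource names below are taken from the manifests
+referenced above):
+```sh
+# AppWrapper settings: default queue, scheduler name, RBAC admission check
+oc get configmap codeflare-operator-config -n redhat-ods-applications -o yaml
+
+# Kueue manager arguments, including the LendingLimit feature gate
+oc get deployment kueue-controller-manager -n redhat-ods-applications \
+  -o jsonpath='{.spec.template.spec.containers[*].args}'
+```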
+
+To work around https://issues.redhat.com/browse/RHOAIENG-7887 (a race condition
+in OpenShift AI installation), do a rolling restart of the Kueue manager.
+```sh
+oc rollout restart deployment/kueue-controller-manager -n redhat-ods-applications
+```
+
+After doing the restart, verify that you see the following lines in the
+kueue-controller-manager's log:
+```sh
+{"level":"info","ts":"2024-06-25T20:17:25.689638786Z","logger":"controller-runtime.builder","caller":"builder/webhook.go:189","msg":"Registering a validating webhook","GVK":"kubeflow.org/v1, Kind=PyTorchJob","path":"/validate-kubeflow-org-v1-pytorchjob"}
+{"level":"info","ts":"2024-06-25T20:17:25.689698615Z","logger":"controller-runtime.webhook","caller":"webhook/server.go:183","msg":"Registering webhook","path":"/validate-kubeflow-org-v1-pytorchjob"}
+{"level":"info","ts":"2024-06-25T20:17:25.689743757Z","logger":"setup","caller":"jobframework/setup.go:81","msg":"Set up controller and webhook for job framework","jobFrameworkName":"kubeflow.org/pytorchjob"}
+
+```
+
+## Kueue Configuration
+
+Create Kueue's default flavor:
+```sh
+oc apply -f setup.RHOAI-v2.13/default-flavor.yaml
+```
+
+## Cluster Role
+
+Create `mlbatch-edit` role:
+```sh
+oc apply -f setup.RHOAI-v2.13/mlbatch-edit-role.yaml
+```
+
+## Slack Cluster Queue
+
+Create the designated slack `ClusterQueue`, which will be used to automate
+minor adjustments to cluster capacity caused by node failures and
+scheduler maintenance.
+```sh
+oc apply -f- << EOF
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: slack-cluster-queue
+spec:
+  namespaceSelector: {}
+  cohort: default-cohort
+  preemption:
+    withinClusterQueue: LowerOrNewerEqualPriority
+    reclaimWithinCohort: Any
+    borrowWithinCohort:
+      policy: Never
+  resourceGroups:
+  - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"]
+    flavors:
+    - name: default-flavor
+      resources:
+      - name: "cpu"
+        nominalQuota: 8000m
+      - name: "memory"
+        nominalQuota: 128Gi
+      - name: "nvidia.com/gpu"
+        nominalQuota: 8
+      - name: "nvidia.com/roce_gdr"
+        nominalQuota: 1
+      - name: "pods"
+        nominalQuota: 100
+EOF
+```
+Edit the above quantities to adjust the quota to the desired
+values. Pod counts are optional and can be omitted from the list of
+covered resources. The `lendingLimit` for each resource will be
+dynamically adjusted by the MLBatch system to reflect reduced cluster
+capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a
+detailed discussion of the role of the slack `ClusterQueue`.
diff --git a/setup.RHOAI-v2.13/TEAM-SETUP.md b/setup.RHOAI-v2.13/TEAM-SETUP.md
new file mode 100644
index 0000000..85c9429
--- /dev/null
+++ b/setup.RHOAI-v2.13/TEAM-SETUP.md
@@ -0,0 +1,91 @@
+# Team Setup
+
+A *team* in MLBatch is a group of users that share a resource quota.
+
+Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md)
+for a discussion of our recommended best practices.
+
+
+Setting up a new team requires the cluster admin to create a project,
+a user group, a quota, a queue, and the required role bindings as described below.
+ +Create project: +```sh +oc new-project team1 +``` +Create user group: +```sh +oc adm groups new team1-edit-group +``` +Add users to group for example: +```sh +oc adm groups add-users team1-edit-group user1 +``` +Bind cluster role to group in namespace: +```sh +oc adm policy add-role-to-group mlbatch-edit team1-edit-group --role-namespace="" --namespace team1 +``` + +Specify the intended quota for the namespace by creating a `ClusterQueue`: +```sh +oc apply -f- << EOF +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: team1-cluster-queue +spec: + namespaceSelector: {} + cohort: default-cohort + preemption: + withinClusterQueue: LowerOrNewerEqualPriority + reclaimWithinCohort: Any + borrowWithinCohort: + policy: Never + resourceGroups: + - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] + flavors: + - name: default-flavor + resources: + - name: "cpu" + nominalQuota: 8000m + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "memory" + nominalQuota: 128Gi + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "nvidia.com/gpu" + nominalQuota: 16 + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "nvidia.com/roce_gdr" + nominalQuota: 4 + # borrowingLimit: 0 + # lendingLimit: 0 + - name: "pods" + nominalQuota: 100 + # borrowingLimit: 0 + # lendingLimit: 0 +EOF +``` +Edit the above quantities to adjust the quota to the desired values. Pod counts +are optional and can be omitted from the list of covered resources. + +Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing +quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other +namespaces from borrowing quota from this namespace. + +Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: +```sh +oc apply -n team1 -f- << EOF +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + name: default-queue +spec: + clusterQueue: team1-cluster-queue +EOF +``` +We recommend naming the local queue `default-queue` as `AppWrappers` will +default to this queue name. 
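+
+To verify the new team's setup end to end, you can optionally submit a small
+`AppWrapper` to the namespace (a sketch only; the pod spec, image, and resource
+requests below are illustrative placeholders, not part of this setup). Because
+the local queue is named `default-queue`, no queue-name label is required:
+```sh
+oc apply -n team1 -f- << EOF
+apiVersion: workload.codeflare.dev/v1beta2
+kind: AppWrapper
+metadata:
+  name: team1-smoke-test
+spec:
+  components:
+  - template:
+      apiVersion: v1
+      kind: Pod
+      metadata:
+        name: team1-smoke-test
+      spec:
+        restartPolicy: Never
+        containers:
+        - name: main
+          image: registry.access.redhat.com/ubi9/ubi-minimal  # illustrative image
+          command: ["sh", "-c", "sleep 5"]
+          resources:
+            requests:
+              cpu: 1
+              memory: 1Gi
+            limits:
+              cpu: 1
+              memory: 1Gi
+EOF
+```
+If the `AppWrapper` is admitted by `team1-cluster-queue`, its pod will run to
+completion; clean up afterwards with `oc delete appwrapper -n team1 team1-smoke-test`.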
+ diff --git a/setup.RHOAI-v2.13/UNINSTALL.md b/setup.RHOAI-v2.13/UNINSTALL.md new file mode 100644 index 0000000..776045d --- /dev/null +++ b/setup.RHOAI-v2.13/UNINSTALL.md @@ -0,0 +1,23 @@ +# Uninstall + +***First, remove all team projects and corresponding cluster queues.*** + +Then to uninstall the MLBatch controllers and reclaim the corresponding +namespaces, run: +```sh +# OpenShift AI uninstall +oc delete dsc mlbatch-dsc +oc delete dsci mlbatch-dsci +oc delete subscription -n redhat-ods-operator rhods-operator +oc delete csv -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator +oc delete crd featuretrackers.features.opendatahub.io \ + dscinitializations.dscinitialization.opendatahub.io \ + datascienceclusters.datasciencecluster.opendatahub.io +oc delete operators rhods-operator.redhat-ods-operator +oc delete operatorgroup -n redhat-ods-operator rhods-operator +oc delete namespace redhat-ods-applications redhat-ods-monitoring redhat-ods-operator + +# Coscheduler uninstall +helm uninstall -n scheduler-plugins scheduler-plugins +oc delete namespace scheduler-plugins +``` diff --git a/setup.RHOAI-v2.13/UPGRADE-FAST.md b/setup.RHOAI-v2.13/UPGRADE-FAST.md new file mode 100644 index 0000000..cb62e00 --- /dev/null +++ b/setup.RHOAI-v2.13/UPGRADE-FAST.md @@ -0,0 +1,31 @@ +# Upgrading from RHOAI 2.12 + +These instructions assume you installed and configured RHOAI 2.12 following +the MLBatch [install instructions for RHOAI-v2.12](../setup.RHOAI-v2.12/CLUSTER-SETUP.md) +or the [upgrade instructions for RHOAI-V2.12](../setup.RHOAI-v2.12/UPGRADE.md) + +Your subscription will have automatically created an unapproved +install plan to upgrade to RHOAI 2.13. + +Before beginning, verify that the expected install plan exists: +```sh +oc get ip -n redhat-ods-operator +``` +Typical output would be: +```sh +NAME CSV APPROVAL APPROVED +install-kpzzl rhods-operator.2.13.0 Manual false +install-nqrbp rhods-operator.2.10.0 Manual true +install-st8vh rhods-operator.2.11.0 Manual true +install-xs6gq rhods-operator.2.12.0 Manual true +``` + +Assuming the install plan exists you can begin the upgrade process. + +There are no MLBatch modifications to the default RHOAI configuration maps +beyond those already made in previous installs. Therefore, you can simply +approve the install plan replacing the example plan name below with the actual +value on your cluster: +```sh +oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl +``` diff --git a/setup.RHOAI-v2.13/UPGRADE.md b/setup.RHOAI-v2.13/UPGRADE.md new file mode 100644 index 0000000..d0dd202 --- /dev/null +++ b/setup.RHOAI-v2.13/UPGRADE.md @@ -0,0 +1,33 @@ +# Upgrading from RHOAI 2.10 + +These instructions assume you installed and configured RHOAI 2.10 following +the MLBatch [install instructions for RHOAI-v2.10](../setup.RHOAI-v2.10/CLUSTER-SETUP.md). + +Your subscription will have automatically created an unapproved +install plan to upgrade to RHOAI 2.13. + +Before beginning, verify that the expected install plan exists: +```sh +oc get ip -n redhat-ods-operator +``` +Typical output would be: +```sh +NAME CSV APPROVAL APPROVED +install-kpzzl rhods-operator.2.13.0 Manual false +install-nqrbp rhods-operator.2.10.0 Manual true +``` + +Assuming the install plan exists you can begin the upgrade process. + +First, update the MLBatch modifications to the default RHOAI configuration maps. 
+```sh +oc apply -f setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml +``` + +Second, approve the install plan replacing the example plan name below with the actual +value on your cluster: +```sh +oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl +``` + +Finally, create the Slack Cluster Queue as described in [CLUSTER-SETUP.md for RHOAI 2.13](./CLUSTER-SETUP.md#Slack-Cluster-Queue). diff --git a/setup.RHOAI-v2.13/coscheduler-priority-patch.yaml b/setup.RHOAI-v2.13/coscheduler-priority-patch.yaml new file mode 100644 index 0000000..278802f --- /dev/null +++ b/setup.RHOAI-v2.13/coscheduler-priority-patch.yaml @@ -0,0 +1,3 @@ +- op: add + path: /spec/template/spec/priorityClassName + value: system-node-critical diff --git a/setup.RHOAI-v2.13/default-flavor.yaml b/setup.RHOAI-v2.13/default-flavor.yaml new file mode 100644 index 0000000..6cbccf3 --- /dev/null +++ b/setup.RHOAI-v2.13/default-flavor.yaml @@ -0,0 +1,4 @@ +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: default-flavor diff --git a/setup.RHOAI-v2.13/mlbatch-dsc.yaml b/setup.RHOAI-v2.13/mlbatch-dsc.yaml new file mode 100644 index 0000000..66336bc --- /dev/null +++ b/setup.RHOAI-v2.13/mlbatch-dsc.yaml @@ -0,0 +1,32 @@ +apiVersion: datasciencecluster.opendatahub.io/v1 +kind: DataScienceCluster +metadata: + name: mlbatch-dsc +spec: + components: + codeflare: + managementState: Managed + dashboard: + managementState: Removed + datasciencepipelines: + managementState: Removed + kserve: + managementState: Removed + serving: + ingressGateway: + certificate: + type: SelfSigned + managementState: Removed + name: knative-serving + kueue: + managementState: Managed + modelmeshserving: + managementState: Removed + ray: + managementState: Managed + trainingoperator: + managementState: Managed + trustyai: + managementState: Removed + workbenches: + managementState: Removed diff --git a/setup.RHOAI-v2.13/mlbatch-dsci.yaml b/setup.RHOAI-v2.13/mlbatch-dsci.yaml new file mode 100644 index 0000000..77785c3 --- /dev/null +++ b/setup.RHOAI-v2.13/mlbatch-dsci.yaml @@ -0,0 +1,14 @@ +apiVersion: dscinitialization.opendatahub.io/v1 +kind: DSCInitialization +metadata: + name: mlbatch-dsci +spec: + applicationsNamespace: redhat-ods-applications + monitoring: + managementState: Managed + namespace: redhat-ods-monitoring + serviceMesh: + managementState: Removed + trustedCABundle: + customCABundle: "" + managementState: Managed diff --git a/setup.RHOAI-v2.13/mlbatch-edit-role.yaml b/setup.RHOAI-v2.13/mlbatch-edit-role.yaml new file mode 100644 index 0000000..0e216d7 --- /dev/null +++ b/setup.RHOAI-v2.13/mlbatch-edit-role.yaml @@ -0,0 +1,121 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: mlbatch-edit +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - delete + - get + - list + - watch +- apiGroups: + - apps + resources: + - deployments + - statefulsets + verbs: + - delete + - get + - list + - watch +- apiGroups: + - "" + resources: + - services + - secrets + - configmaps + - persistentvolumeclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kueue.x-k8s.io + resources: + - "*" + verbs: + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - pytorchjobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - cluster.ray.io + resources: + - rayjobs + - rayclusters + verbs: + - create + - delete + - get + - list + - patch + - update 
+ - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - delete + - get + - list + - watch +- apiGroups: + - workload.codeflare.dev + resources: + - appwrappers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - scheduling.k8s.io + resources: + - priorityclasses + verbs: + - get + - list + - watch +- apiGroups: + - scheduling.x-k8s.io + resources: + - podgroups + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - events + verbs: + - get + - list + - watch diff --git a/setup.RHOAI-v2.13/mlbatch-priorities.yaml b/setup.RHOAI-v2.13/mlbatch-priorities.yaml new file mode 100644 index 0000000..77c8f3b --- /dev/null +++ b/setup.RHOAI-v2.13/mlbatch-priorities.yaml @@ -0,0 +1,26 @@ +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: low-priority +value: 1 +preemptionPolicy: PreemptLowerPriority +globalDefault: false +description: "This is the priority class for all lower priority jobs." +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: default-priority +value: 5 +preemptionPolicy: PreemptLowerPriority +globalDefault: true +description: "This is the priority class for all jobs (default priority)." +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: high-priority +value: 10 +preemptionPolicy: PreemptLowerPriority +globalDefault: false +description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." diff --git a/setup.RHOAI-v2.13/mlbatch-subscription.yaml b/setup.RHOAI-v2.13/mlbatch-subscription.yaml new file mode 100644 index 0000000..c546f62 --- /dev/null +++ b/setup.RHOAI-v2.13/mlbatch-subscription.yaml @@ -0,0 +1,300 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: redhat-ods-operator +--- +apiVersion: v1 +kind: Namespace +metadata: + name: redhat-ods-applications +--- +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: rhods-operator + namespace: redhat-ods-operator +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-codeflare + namespace: redhat-ods-operator +data: + manager.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: manager + namespace: system + spec: + selector: + matchLabels: + app.kubernetes.io/name: codeflare-operator + app.kubernetes.io/part-of: codeflare + replicas: 1 + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + app.kubernetes.io/name: codeflare-operator + app.kubernetes.io/part-of: codeflare + spec: + priorityClassName: system-node-critical + securityContext: + runAsNonRoot: true + # TODO(user): For common cases that do not require escalating privileges + # it is recommended to ensure that all your Pods/Containers are restrictive. + # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted + # Please uncomment the following code if your project does NOT have to work on old Kubernetes + # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). 
+ # seccompProfile: + # type: RuntimeDefault + containers: + - command: + - /manager + image: $(codeflare_operator_controller_image) + imagePullPolicy: Always + name: manager + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - containerPort: 8080 + protocol: TCP + name: metrics + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi + serviceAccountName: controller-manager + terminationGracePeriodSeconds: 10 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: codeflare-operator-config + namespace: redhat-ods-applications +data: + config.yaml: | + appwrapper: + enabled: true + Config: + manageJobsWithoutQueueName: true + userRBACAdmissionCheck: false + schedulerName: scheduler-plugins-scheduler + defaultQueueName: default-queue + slackQueueName: slack-cluster-queue + autopilot: + resourceTaints: + nvidia.com/gpu: + - key: autopilot.ibm.com/gpuhealth + value: ERR + effect: NoSchedule + - key: autopilot.ibm.com/gpuhealth + value: TESTING + effect: NoSchedule + - key: autopilot.ibm.com/gpuhealth + value: EVICT + effect: NoExecute +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-kuberay + namespace: redhat-ods-operator +data: + kuberay-operator-image-patch.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: kuberay-operator + spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: kuberay-operator + image: $(image) +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-kueue + namespace: redhat-ods-operator +data: + controller_manager_config.yaml: | + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: :8080 + enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + #pprofBindAddress: :8082 + waitForPodsReady: + enable: false + blockAdmission: false + manageJobsWithoutQueueName: true + #internalCertManagement: + # enable: false + # webhookServiceName: "" + # webhookSecretName: "" + integrations: + frameworks: + # - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + externalFrameworks: + - "AppWrapper.v1beta2.workload.codeflare.dev" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + manager_config_patch.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: controller-manager + namespace: system + spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: manager + image: $(image) + args: + - "--config=/controller_manager_config.yaml" + - "--zap-log-level=2" + - "--feature-gates=LendingLimit=true" + 
volumeMounts: + - name: manager-config + mountPath: /controller_manager_config.yaml + subPath: controller_manager_config.yaml + volumes: + - name: manager-config + configMap: + name: manager-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-training-operator + namespace: redhat-ods-operator +data: + manager_config_patch.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: training-operator + spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: training-operator + image: $(image) + args: + - "--zap-log-level=2" + - "--gang-scheduler-name=scheduler-plugins-scheduler" + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 1000Mi +--- +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: rhods-operator + namespace: redhat-ods-operator +spec: + channel: fast + installPlanApproval: Manual + name: rhods-operator + source: redhat-operators + sourceNamespace: openshift-marketplace + startingCSV: rhods-operator.2.12.0 + config: + env: + - name: "DISABLE_DSC_CONFIG" + volumeMounts: + - name: mlbatch-codeflare + mountPath: /opt/manifests/codeflare/manager/manager.yaml + subPath: manager.yaml + - name: mlbatch-kuberay + mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml + subPath: kuberay-operator-image-patch.yaml + - name: mlbatch-kueue + mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml + subPath: controller_manager_config.yaml + - name: mlbatch-kueue + mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml + subPath: manager_config_patch.yaml + - name: mlbatch-training-operator + mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml + subPath: manager_config_patch.yaml + volumes: + - name: mlbatch-codeflare + configMap: + name: mlbatch-codeflare + - name: mlbatch-kuberay + configMap: + name: mlbatch-kuberay + - name: mlbatch-kueue + configMap: + name: mlbatch-kueue + - name: mlbatch-training-operator + configMap: + name: mlbatch-training-operator diff --git a/setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml b/setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml new file mode 100644 index 0000000..f7ded1d --- /dev/null +++ b/setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml @@ -0,0 +1,115 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: codeflare-operator-config + namespace: redhat-ods-applications +data: + config.yaml: | + appwrapper: + enabled: true + Config: + manageJobsWithoutQueueName: true + userRBACAdmissionCheck: false + schedulerName: scheduler-plugins-scheduler + defaultQueueName: default-queue + slackQueueName: slack-cluster-queue + autopilot: + resourceTaints: + nvidia.com/gpu: + - key: autopilot.ibm.com/gpuhealth + value: ERR + effect: NoSchedule + - key: autopilot.ibm.com/gpuhealth + value: TESTING + effect: NoSchedule + - key: autopilot.ibm.com/gpuhealth + value: EVICT + effect: NoExecute +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: mlbatch-kueue + namespace: redhat-ods-operator +data: + controller_manager_config.yaml: | + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: :8080 + enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + 
ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + #pprofBindAddress: :8082 + waitForPodsReady: + enable: false + blockAdmission: false + manageJobsWithoutQueueName: true + #internalCertManagement: + # enable: false + # webhookServiceName: "" + # webhookSecretName: "" + integrations: + frameworks: + # - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + externalFrameworks: + - "AppWrapper.v1beta2.workload.codeflare.dev" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + manager_config_patch.yaml: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: controller-manager + namespace: system + spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: manager + image: $(image) + args: + - "--config=/controller_manager_config.yaml" + - "--zap-log-level=2" + - "--feature-gates=LendingLimit=true" + volumeMounts: + - name: manager-config + mountPath: /controller_manager_config.yaml + subPath: controller_manager_config.yaml + volumes: + - name: manager-config + configMap: + name: manager-config +--- diff --git a/setup.tmpl/Makefile b/setup.tmpl/Makefile index fe596ff..861de9f 100644 --- a/setup.tmpl/Makefile +++ b/setup.tmpl/Makefile @@ -27,6 +27,8 @@ docs: gotmpl ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.11/TEAM-SETUP.md -values RHOAI-v2.11.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.12/CLUSTER-SETUP.md -values RHOAI-v2.12.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.12/TEAM-SETUP.md -values RHOAI-v2.12.yaml + ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.13/CLUSTER-SETUP.md -values RHOAI-v2.13.yaml + ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.13/TEAM-SETUP.md -values RHOAI-v2.13.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s-v1.25/CLUSTER-SETUP.md -values Kubernetes-v1.25.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.k8s-v1.25/TEAM-SETUP.md -values Kubernetes-v1.25.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s-v1.30/CLUSTER-SETUP.md -values Kubernetes-v1.30.yaml diff --git a/setup.tmpl/RHOAI-v2.13.yaml b/setup.tmpl/RHOAI-v2.13.yaml new file mode 100644 index 0000000..200b631 --- /dev/null +++ b/setup.tmpl/RHOAI-v2.13.yaml @@ -0,0 +1,6 @@ +# Values for RHOAI 2.13 + +OPENSHIFT: true +VERSION: RHOAI-v2.13 +KUBECTL: oc +SLACKCQ: true \ No newline at end of file From 129d5ddbe2324253ea21fc64e216ccf981bd6c55 Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 16 Sep 2024 11:30:28 -0400 Subject: [PATCH 2/4] fix links --- SETUP.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SETUP.md b/SETUP.md index fc2c4f4..0d1e032 100644 --- a/SETUP.md +++ b/SETUP.md @@ -32,8 +32,8 @@ Instructions are provided for the following OpenShift AI ***stable*** releases: + OpenShift AI 2.13 + [RHOAI 2.13 Cluster Setup](./setup.RHOAI-v2.13/CLUSTER-SETUP.md) + [RHOAI 2.13 Team Setup](./setup.RHOAI-v2.13/TEAM-SETUP.md) - + [UPGRADING from RHOAI 2.10](./setup.RHOAI-v2.12/UPGRADE-STABLE.md) - + [UPGRADING from RHOAI 
2.12](./setup.RHOAI-v2.12/UPGRADE-FAST.md) + + [UPGRADING from RHOAI 2.10](./setup.RHOAI-v2.13/UPGRADE-STABLE.md) + + [UPGRADING from RHOAI 2.12](./setup.RHOAI-v2.13/UPGRADE-FAST.md) + [RHOAI 2.13 Uninstall](./setup.RHOAI-v2.13/UNINSTALL.md) Instructions are provided for the following OpenShift AI ***fast*** releases: From 8b515d70bea9e37950dc56ad506d1aba3b2c0a7b Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 16 Sep 2024 11:32:12 -0400 Subject: [PATCH 3/4] rename file --- setup.RHOAI-v2.13/{UPGRADE.md => UPGRADE-STABLE.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename setup.RHOAI-v2.13/{UPGRADE.md => UPGRADE-STABLE.md} (100%) diff --git a/setup.RHOAI-v2.13/UPGRADE.md b/setup.RHOAI-v2.13/UPGRADE-STABLE.md similarity index 100% rename from setup.RHOAI-v2.13/UPGRADE.md rename to setup.RHOAI-v2.13/UPGRADE-STABLE.md From 1c0b7d3264b7b5afd6aa466235930e5751e3ac7a Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 16 Sep 2024 11:33:42 -0400 Subject: [PATCH 4/4] put stable in name of upgrade yaml --- setup.RHOAI-v2.13/UPGRADE-STABLE.md | 2 +- ...e-configmaps.yaml => mlbatch-upgrade-stable-configmaps.yaml} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename setup.RHOAI-v2.13/{mlbatch-upgrade-configmaps.yaml => mlbatch-upgrade-stable-configmaps.yaml} (100%) diff --git a/setup.RHOAI-v2.13/UPGRADE-STABLE.md b/setup.RHOAI-v2.13/UPGRADE-STABLE.md index d0dd202..fd92fc9 100644 --- a/setup.RHOAI-v2.13/UPGRADE-STABLE.md +++ b/setup.RHOAI-v2.13/UPGRADE-STABLE.md @@ -21,7 +21,7 @@ Assuming the install plan exists you can begin the upgrade process. First, update the MLBatch modifications to the default RHOAI configuration maps. ```sh -oc apply -f setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml +oc apply -f setup.RHOAI-v2.13/mlbatch-upgrade-stable-configmaps.yaml ``` Second, approve the install plan replacing the example plan name below with the actual diff --git a/setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml b/setup.RHOAI-v2.13/mlbatch-upgrade-stable-configmaps.yaml similarity index 100% rename from setup.RHOAI-v2.13/mlbatch-upgrade-configmaps.yaml rename to setup.RHOAI-v2.13/mlbatch-upgrade-stable-configmaps.yaml