From 2a6c6db5dd66107407d75881a2db755d65829935 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Fri, 24 Jan 2025 15:31:00 -0800
Subject: [PATCH 01/16] consolidate ai blueprints

---
 ai-ml/infrastructure/terraform/addons.tf      |  653 +++
 ai-ml/infrastructure/terraform/cleanup.sh     |   71 +
 ai-ml/infrastructure/terraform/eks.tf         |  212 +
 .../terraform/fsx-for-lustre.tf               |  138 +
 .../fsx-for-lustre/fsxlustre-static-pv.yaml   |   21 +
 .../fsx-for-lustre/fsxlustre-static-pvc.yaml  |   12 +
 .../fsxlustre-storage-class.yaml              |    9 +
 .../helm-values/argo-events-values.yaml       |    4 +
 .../helm-values/argo-workflows-values.yaml    |    4 +
 .../aws-cloudwatch-metrics-values.yaml        |   11 +
 .../aws-efa-k8s-device-plugin-values.yaml     |    5 +
 .../helm-values/ingress-nginx-values.yaml     |   11 +
 .../helm-values/jupyterhub-values.yaml        |   54 +
 .../helm-values/kube-prometheus.yaml          |   48 +
 .../helm-values/kubecost-values.yaml          |   69 +
 .../helm-values/mlflow-tracking-values.yaml   |   88 +
 ai-ml/infrastructure/terraform/install.sh     |   33 +
 ai-ml/infrastructure/terraform/karpenter.tf   |    0
 ai-ml/infrastructure/terraform/main.tf        |   60 +
 ai-ml/infrastructure/terraform/mlflow-core.tf |  245 +
 .../terraform/monitoring/dcgm.yaml            |   82 +
 .../monitoring/neuron-monitor-daemonset.yaml  |   42 +
 .../terraform/monitoring/podMonitor.yaml      |   21 +
 .../data_grafana_dashboard.json               | 4535 +++++++++++++++++
 .../default_grafana_dashboard.json            | 2836 +++++++++++
 .../serve_deployment_grafana_dashboard.json   | 2115 ++++++++
 .../serve_grafana_dashboard.json              | 3098 +++++++++++
 .../terraform/monitoring/serviceMonitor.yaml  |   25 +
 ai-ml/infrastructure/terraform/outputs.tf     |    9 +
 ai-ml/infrastructure/terraform/variables.tf   |  107 +
 ai-ml/infrastructure/terraform/versions.tf    |   33 +
 ai-ml/infrastructure/terraform/vpc.tf         |   62 +
 32 files changed, 14713 insertions(+)
 create mode 100644 ai-ml/infrastructure/terraform/addons.tf
 create mode 100755 ai-ml/infrastructure/terraform/cleanup.sh
 create mode 100644 ai-ml/infrastructure/terraform/eks.tf
 create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre.tf
 create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml
 create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml
 create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml
 create mode 100755 ai-ml/infrastructure/terraform/install.sh
 create mode 100644 ai-ml/infrastructure/terraform/karpenter.tf
 create mode 100644 ai-ml/infrastructure/terraform/main.tf
 create mode 100644 ai-ml/infrastructure/terraform/mlflow-core.tf
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml
 create mode 100644 ai-ml/infrastructure/terraform/outputs.tf
 create mode 100644 ai-ml/infrastructure/terraform/variables.tf
 create mode 100644 ai-ml/infrastructure/terraform/versions.tf
 create mode 100644 ai-ml/infrastructure/terraform/vpc.tf

diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf
new file mode 100644
index 000000000..f8ae300cc
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/addons.tf
@@ -0,0 +1,653 @@
+#---------------------------------------------------------------
+# GP3 Encrypted Storage Class
+#---------------------------------------------------------------
+resource "kubernetes_annotations" "disable_gp2" {
+  # Marks the gp2 StorageClass as non-default.
+  api_version = "storage.k8s.io/v1"
+  kind        = "StorageClass"
+  metadata {
+    name = "gp2"
+  }
+
+  force       = true
+  annotations = { "storageclass.kubernetes.io/is-default-class" = "false" }
+
+  depends_on = [module.eks.eks_cluster_id]
+}
+
+resource "kubernetes_storage_class" "default_gp3" {
+  # gp3 StorageClass annotated as the default class; created after gp2 is un-defaulted.
+  metadata {
+    name        = "gp3"
+    annotations = { "storageclass.kubernetes.io/is-default-class" = "true" }
+  }
+
+  storage_provisioner    = "ebs.csi.aws.com"
+  volume_binding_mode    = "WaitForFirstConsumer"
+  allow_volume_expansion = true
+  reclaim_policy         = "Delete"
+
+  parameters = {
+    type      = "gp3"
+    encrypted = true
+    fsType    = "ext4"
+  }
+
+  depends_on = [kubernetes_annotations.disable_gp2]
+}
+
+#---------------------------------------------------------------
+# IRSA for EBS CSI Driver
+#---------------------------------------------------------------
+module "ebs_csi_driver_irsa" { # IRSA role for the kube-system:ebs-csi-controller-sa service account
+  source  = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  version = "~> 5.20"
+  tags    = local.tags
+  # Role name prefix is "<local.name>-ebs-csi-driver-"
+  role_name_prefix      = "${local.name}-ebs-csi-driver-"
+  attach_ebs_csi_policy = true
+  oidc_providers = {
+    main = {
+      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
+      provider_arn               = module.eks.oidc_provider_arn
+    }
+  }
+}
+
+#---------------------------------------------------------------
+# EKS Blueprints Addons
+#---------------------------------------------------------------
+module "eks_blueprints_addons" {
+  source  = "aws-ia/eks-blueprints-addons/aws"
+  version = "~> 1.2"
+
+  cluster_name      = module.eks.cluster_name
+  cluster_endpoint  = module.eks.cluster_endpoint
+  cluster_version   = module.eks.cluster_version
+  oidc_provider_arn = module.eks.oidc_provider_arn
+
+  #---------------------------------------
+  # Amazon EKS Managed Add-ons
+  #---------------------------------------
+  eks_addons = {
+    aws-ebs-csi-driver = {
+      service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
+    }
+    coredns = {
+      preserve = true
+    }
+    kube-proxy = {
+      preserve = true
+    }
+    # VPC CNI uses worker node IAM role policies
+    vpc-cni = {
+      preserve = true
+    }
+  }
+
+  #---------------------------------------
+  # AWS Load Balancer Controller Add-on
+  #---------------------------------------
+  enable_aws_load_balancer_controller = true
+  # turn off the mutating webhook for services because we are using
+  # service.beta.kubernetes.io/aws-load-balancer-type: external
+  aws_load_balancer_controller = {
+    set = [{
+      name  = "enableServiceMutatorWebhook"
+      value = "false"
+    }]
+  }
+
+  #---------------------------------------
+  # Ingress Nginx Add-on
+  #---------------------------------------
+  enable_ingress_nginx = true
+  ingress_nginx = {
+    values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})]
+  }
+
+  #---------------------------------------
+  # Karpenter Autoscaler for EKS Cluster
+  #---------------------------------------
+  enable_karpenter                  = true
+  karpenter_enable_spot_termination = true
+  karpenter_node = {
+    iam_role_additional_policies = {
+      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+    }
+  }
+  karpenter = {
+    chart_version       = "0.37.0"
+    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
+    repository_password = data.aws_ecrpublic_authorization_token.token.password
+    source_policy_documents = [
+      data.aws_iam_policy_document.karpenter_controller_policy.json
+    ]
+  }
+
+  #---------------------------------------
+  # Argo Workflows & Argo Events
+  #---------------------------------------
+  enable_argo_workflows = var.enable_argo_workflows
+  argo_workflows = {
+    name       = "argo-workflows"
+    namespace  = "argo-workflows"
+    repository = "https://argoproj.github.io/argo-helm"
+    values     = [templatefile("${path.module}/helm-values/argo-workflows-values.yaml", {})]
+  }
+
+  enable_argo_events = var.enable_argo_events
+  argo_events = {
+    name       = "argo-events"
+    namespace  = "argo-events"
+    repository = "https://argoproj.github.io/argo-helm"
+    values     = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})]
+  }
+
+  #---------------------------------------
+  # Prometheus and Grafana stack
+  #---------------------------------------
+  #---------------------------------------------------------------
+  # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
+  # 2- Grafana Admin user: admin
+  # 3- Get secret name from Terraform output: `terraform output grafana_secret_name`
+  # 4- Get admin user password: `aws secretsmanager get-secret-value --secret-id <REPLACE_WITH_SECRET_ID> --region $AWS_REGION --query "SecretString" --output text`
+  #---------------------------------------------------------------
+  enable_kube_prometheus_stack = var.enable_kube_prometheus_stack
+  kube_prometheus_stack = {
+    values = [
+      templatefile("${path.module}/helm-values/kube-prometheus.yaml", {
+        storage_class_type = kubernetes_storage_class.default_gp3.id
+      })
+    ]
+    chart_version = "48.1.1"
+    set_sensitive = [
+      {
+        name  = "grafana.adminPassword"
+        value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
+      }
+    ],
+  }
+
+  #---------------------------------------
+  # Enable FSx for Lustre CSI Driver
+  #---------------------------------------
+  enable_aws_fsx_csi_driver = var.enable_aws_efa_k8s_device_plugin # NOTE(review): toggled by the EFA device plugin flag rather than an FSx-specific variable - confirm this is intended
+
+  tags = local.tags
+
+  #---------------------------------------
+  # CloudWatch metrics for EKS
+  #---------------------------------------
+  enable_aws_cloudwatch_metrics = var.enable_aws_cloudwatch_metrics
+  aws_cloudwatch_metrics = {
+    values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
+  }
+
+}
+
+#---------------------------------------------------------------
+# Data on EKS Kubernetes Addons
+#---------------------------------------------------------------
+
+module "data_addons" {
+  source  = "aws-ia/eks-data-addons/aws"
+  version = "1.33.0"
+
+  oidc_provider_arn = module.eks.oidc_provider_arn
+
+  #---------------------------------------------------------------
+  # JupyterHub Add-on
+  #---------------------------------------------------------------
+  enable_jupyterhub = var.enable_jupyterhub
+  jupyterhub_helm_config = {
+    namespace        = kubernetes_namespace_v1.jupyterhub.id
+    create_namespace = false
+    values           = [file("${path.module}/helm-values/jupyterhub-values.yaml")]
+  }
+
+  enable_volcano = var.enable_volcano
+  #---------------------------------------
+  # Kuberay Operator
+  #---------------------------------------
+  enable_kuberay_operator = var.enable_kuberay_operator
+  kuberay_operator_helm_config = {
+    version = "1.1.1"
+    # Enabling Volcano as Batch scheduler for KubeRay Operator
+    values = [
+      <<-EOT
+      batchScheduler:
+        enabled: true
+    EOT
+    ]
+  }
+
+  #---------------------------------------------------------------
+  # NVIDIA Device Plugin Add-on
+  #---------------------------------------------------------------
+  enable_nvidia_device_plugin = true
+  nvidia_device_plugin_helm_config = {
+    version = "v0.16.1"
+    name    = "nvidia-device-plugin"
+    values = [
+      <<-EOT
+        nodeSelector:
+          accelerator: nvidia
+        gfd:
+          enabled: true
+        nfd:
+          gc:
+            nodeSelector:
+              accelerator: nvidia
+          topologyUpdater:
+            nodeSelector:
+              accelerator: nvidia
+          worker:
+            nodeSelector:
+              accelerator: nvidia
+            tolerations:
+              - key: nvidia.com/gpu
+                operator: Exists
+                effect: NoSchedule
+              - operator: "Exists"
+      EOT
+    ]
+  }
+
+  #---------------------------------------
+  # EFA Device Plugin Add-on
+  #---------------------------------------
+  # IMPORTANT: Enable EFA only on nodes with EFA devices attached.
+  # Otherwise, you'll encounter the "No devices found..." error. Restart the pod after attaching an EFA device, or use a node selector to prevent incompatible scheduling.
+  enable_aws_efa_k8s_device_plugin = var.enable_aws_efa_k8s_device_plugin
+  aws_efa_k8s_device_plugin_helm_config = {
+    values = [file("${path.module}/helm-values/aws-efa-k8s-device-plugin-values.yaml")]
+  }
+
+  #---------------------------------------------------------------
+  # Kubecost Add-on
+  #---------------------------------------------------------------
+  enable_kubecost = var.enable_kubecost
+  kubecost_helm_config = {
+    values              = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})]
+    version             = "2.2.2"
+    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
+    repository_password = data.aws_ecrpublic_authorization_token.token.password
+  }
+
+  #---------------------------------------------------------------
+  # Neuron Add-on
+  #---------------------------------------------------------------
+  enable_aws_neuron_device_plugin = true
+
+  aws_neuron_device_plugin_helm_config = {
+    # Enable default scheduler
+    values = [
+      <<-EOT
+      devicePlugin:
+        tolerations:
+        - key: CriticalAddonsOnly
+          operator: Exists
+        - key: aws.amazon.com/neuron
+          operator: Exists
+          effect: NoSchedule
+        - key: hub.jupyter.org/dedicated
+          operator: Exists
+          effect: NoSchedule
+      scheduler:
+        enabled: true
+      npd:
+        enabled: false
+      EOT
+    ]
+  }
+
+  #---------------------------------------------------------------
+  # Karpenter Resources Add-on
+  #---------------------------------------------------------------
+  enable_karpenter_resources = true
+  karpenter_resources_helm_config = {
+
+    g5-gpu-karpenter = {
+      values = [
+        <<-EOT
+      name: g5-gpu-karpenter
+      clusterName: ${module.eks.cluster_name}
+      ec2NodeClass:
+        amiFamily: Bottlerocket
+        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+        subnetSelectorTerms:
+          id: ${module.vpc.private_subnets[2]}
+        securityGroupSelectorTerms:
+          tags:
+            Name: ${module.eks.cluster_name}-node
+        instanceStorePolicy: RAID0
+        blockDeviceMappings:
+          # Root device
+          - deviceName: /dev/xvda
+            ebs:
+              volumeSize: 50Gi
+              volumeType: gp3
+              encrypted: true
+          # Data device: Container resources such as images and logs
+          - deviceName: /dev/xvdb
+            ebs:
+              volumeSize: 300Gi
+              volumeType: gp3
+              encrypted: true
+              ${var.bottlerocket_data_disk_snapshot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""}
+
+      nodePool:
+        labels:
+          - instanceType: g5-gpu-karpenter
+          - type: karpenter
+          - accelerator: nvidia
+        taints:
+          - key: nvidia.com/gpu
+            value: "Exists"
+            effect: "NoSchedule"
+        requirements:
+          - key: "karpenter.k8s.aws/instance-family"
+            operator: In
+            values: ["g5"]
+          - key: "karpenter.k8s.aws/instance-size"
+            operator: In
+            values: [ "2xlarge", "4xlarge", "8xlarge" ]
+          - key: "kubernetes.io/arch"
+            operator: In
+            values: ["amd64"]
+          - key: "karpenter.sh/capacity-type"
+            operator: In
+            values: ["spot", "on-demand"]
+        limits:
+          cpu: 1000
+        disruption:
+          consolidationPolicy: WhenEmpty
+          consolidateAfter: 300s
+          expireAfter: 720h
+        weight: 100
+      EOT
+      ]
+    }
+    x86-cpu-karpenter = {
+      values = [
+        <<-EOT
+      name: x86-cpu-karpenter
+      clusterName: ${module.eks.cluster_name}
+      ec2NodeClass:
+        amiFamily: Bottlerocket
+        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+        subnetSelectorTerms:
+          id: ${module.vpc.private_subnets[3]}
+        securityGroupSelectorTerms:
+          tags:
+            Name: ${module.eks.cluster_name}-node
+        blockDeviceMappings:
+          # Root device
+          - deviceName: /dev/xvda
+            ebs:
+              volumeSize: 100Gi
+              volumeType: gp3
+              encrypted: true
+          # Data device: Container resources such as images and logs
+          - deviceName: /dev/xvdb
+            ebs:
+              volumeSize: 300Gi
+              volumeType: gp3
+              encrypted: true
+              ${var.bottlerocket_data_disk_snapshot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""}
+
+      nodePool:
+        labels:
+          - type: karpenter
+          - instanceType: x86-cpu-karpenter
+        requirements:
+          - key: "karpenter.k8s.aws/instance-family"
+            operator: In
+            values: ["m5"]
+          - key: "karpenter.k8s.aws/instance-size"
+            operator: In
+            values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"]
+          - key: "kubernetes.io/arch"
+            operator: In
+            values: ["amd64"]
+          - key: "karpenter.sh/capacity-type"
+            operator: In
+            values: ["spot", "on-demand"]
+        limits:
+          cpu: 1000
+        disruption:
+          consolidationPolicy: WhenEmpty
+          consolidateAfter: 300s
+          expireAfter: 720h
+        weight: 100
+      EOT
+      ]
+    }
+    trainium-trn1 = {
+      values = [
+        <<-EOT
+      name: trainium-trn1
+      clusterName: ${module.eks.cluster_name}
+      ec2NodeClass:
+        amiSelectorTerms:
+          - alias: al2023@v20241024
+        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+        subnetSelectorTerms:
+          id: ${module.vpc.private_subnets[2]}
+        securityGroupSelectorTerms:
+          tags:
+            Name: ${module.eks.cluster_name}-node
+        instanceStorePolicy: RAID0
+        blockDeviceMappings:
+          # Root device
+          - deviceName: /dev/xvda
+            ebs:
+              volumeSize: 100Gi
+              volumeType: gp3
+              encrypted: true
+          # Data device: Container resources such as images and logs
+          - deviceName: /dev/xvdb
+            ebs:
+              volumeSize: 300Gi
+              volumeType: gp3
+              encrypted: true
+              ${var.bottlerocket_data_disk_snapshot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""}
+
+      nodePool:
+        labels:
+          - type: karpenter
+          - instanceType: trainium-trn1
+          - accelerator: neuron
+        taints:
+          - key: aws.amazon.com/neuron
+            value: "true"
+            effect: "NoSchedule"
+        requirements:
+          - key: "karpenter.k8s.aws/instance-family"
+            operator: In
+            values: ["trn1"]
+          - key: "kubernetes.io/arch"
+            operator: In
+            values: ["amd64"]
+          - key: "karpenter.sh/capacity-type"
+            operator: In
+            values: ["on-demand"]
+        limits:
+          cpu: 1000
+        disruption:
+          consolidationPolicy: WhenEmpty
+          consolidateAfter: 300s
+          expireAfter: 720h
+        weight: 100
+      EOT
+      ]
+    }
+    inferentia-inf2 = { # Karpenter EC2NodeClass + NodePool values for on-demand inf2 nodes
+      values = [
+        <<-EOT
+      name: inferentia-inf2
+      clusterName: ${module.eks.cluster_name}
+      ec2NodeClass:
+        amiSelectorTerms:
+          - alias: al2023@v20241024
+        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
+        subnetSelectorTerms:
+          id: ${module.vpc.private_subnets[2]}
+        securityGroupSelectorTerms:
+          tags:
+            Name: ${module.eks.cluster_name}-node
+        blockDeviceMappings:
+          # Root device
+          - deviceName: /dev/xvda
+            ebs:
+              volumeSize: 100Gi
+              volumeType: gp3
+              encrypted: true
+          # Data device: Container resources such as images and logs
+          - deviceName: /dev/xvdb
+            ebs:
+              volumeSize: 300Gi
+              volumeType: gp3
+              encrypted: true
+              ${var.bottlerocket_data_disk_snapshot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""}
+      nodePool:
+        labels:
+          - instanceType: inferentia-inf2
+          - type: karpenter
+          - accelerator: neuron
+        taints:
+          - key: aws.amazon.com/neuron
+            value: "true"
+            effect: "NoSchedule"
+        requirements:
+          - key: "karpenter.k8s.aws/instance-family"
+            operator: In
+            values: ["inf2"]
+          - key: "kubernetes.io/arch"
+            operator: In
+            values: ["amd64"]
+          - key: "karpenter.sh/capacity-type"
+            operator: In
+            values: [ "on-demand"]
+        limits:
+          cpu: 1000
+        disruption:
+          consolidationPolicy: WhenEmpty
+          consolidateAfter: 300s
+          expireAfter: 720h
+        weight: 100
+      EOT
+      ]
+    }
+  }
+
+  depends_on = [
+    kubernetes_secret_v1.huggingface_token,
+    kubernetes_config_map_v1.notebook
+  ]
+}
+
+
+#---------------------------------------------------------------
+# Additional Resources
+#---------------------------------------------------------------
+
+resource "kubernetes_namespace_v1" "jupyterhub" {
+  # Namespace referenced by the JupyterHub Helm config, the hf-token secret,
+  # and the notebook ConfigMap in this file.
+  metadata { name = "jupyterhub" }
+}
+
+
+resource "kubernetes_secret_v1" "huggingface_token" {
+  # Stores var.huggingface_token under the "token" key of the hf-token secret.
+  data = {
+    token = var.huggingface_token
+  }
+  metadata {
+    namespace = kubernetes_namespace_v1.jupyterhub.id
+    name      = "hf-token"
+  }
+}
+
+resource "kubernetes_config_map_v1" "notebook" { # metadata only; no data entries are set here
+  metadata {
+    namespace = kubernetes_namespace_v1.jupyterhub.id
+    name      = "notebook"
+  }
+}
+
+#---------------------------------------------------------------
+# MLflow Tracking Add-on
+#---------------------------------------------------------------
+module "eks_data_addons" {
+  source  = "aws-ia/eks-data-addons/aws"
+  version = "1.33.0" # ensure to update this to the latest/desired version
+
+  enable_mlflow_tracking = var.enable_mlflow_tracking
+  oidc_provider_arn      = module.eks.oidc_provider_arn
+
+  # try() supplies the fallback value when the referenced expression cannot be evaluated.
+  mlflow_tracking_helm_config = {
+    mlflow_namespace = try(kubernetes_namespace_v1.mlflow[0].metadata[0].name, local.mlflow_namespace)
+    values = [
+      templatefile("${path.module}/helm-values/mlflow-tracking-values.yaml", {
+        mlflow_sa   = local.mlflow_service_account
+        mlflow_irsa = try(module.mlflow_irsa[0].iam_role_arn, "")
+        # MLflow Postgres RDS Config
+        mlflow_db_host     = try(split(":", module.db[0].db_instance_endpoint)[0], "")
+        mlflow_db_name     = try(module.db[0].db_instance_name, "")
+        mlflow_db_username = local.mlflow_name
+        mlflow_db_password = try(sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string), "")
+        # S3 bucket config for artifacts
+        s3_bucket_name = try(module.mlflow_s3_bucket[0].s3_bucket_id, "")
+      })
+    ]
+  }
+}
+
+#---------------------------------------------------------------
+# Grafana Admin credentials resources
+# Log in to AWS Secrets Manager with the same role as Terraform to read the Grafana admin password; the secret name starts with "<local.name>-oss-grafana"
+#---------------------------------------------------------------
+data "aws_secretsmanager_secret_version" "admin_password_version" {
+  depends_on = [aws_secretsmanager_secret_version.grafana] # read only after the version is written
+  secret_id  = aws_secretsmanager_secret.grafana.id
+}
+
+resource "random_password" "grafana" {
+  special          = true
+  override_special = "@_" # special characters are limited to this set
+  length           = 16
+}
+
+#tfsec:ignore:aws-ssm-secret-use-customer-key
+resource "aws_secretsmanager_secret" "grafana" {
+  recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
+  name_prefix             = "${local.name}-oss-grafana"
+}
+
+resource "aws_secretsmanager_secret_version" "grafana" {
+  secret_string = random_password.grafana.result # generated Grafana admin password
+  secret_id     = aws_secretsmanager_secret.grafana.id
+}
+
+resource "kubectl_manifest" "neuron_monitor" {
+  yaml_body = file("${path.module}/monitoring/neuron-monitor-daemonset.yaml") # manifest is read as-is, without templating
+}
+
+resource "kubectl_manifest" "dcgm" {
+  yaml_body = file("${path.module}/monitoring/dcgm.yaml") # manifest is read as-is, without templating
+}
+
+data "aws_iam_policy_document" "karpenter_controller_policy" {
+  # Extra statement passed to the Karpenter add-on via source_policy_documents.
+  statement {
+    sid       = "KarpenterControllerAdditionalPolicy"
+    effect    = "Allow"
+    resources = ["*"]
+    actions = [
+      "ec2:CreateLaunchTemplate", "ec2:RunInstances",
+    ]
+  }
+}
diff --git a/ai-ml/infrastructure/terraform/cleanup.sh b/ai-ml/infrastructure/terraform/cleanup.sh
new file mode 100755
index 000000000..b09efd384
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/cleanup.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# Teardown: RayService, Terraform modules in order, leftover ELB resources, final sweep.
+CLUSTER_NAME="${CLUSTER_NAME:-jark-stack}" # elbv2.k8s.aws/cluster tag value; override via env
+
+echo "Destroying RayService..."
+# Delete the Ingress/SVC before removing the addons
+TMPFILE=$(mktemp) && trap 'rm -f "$TMPFILE"' EXIT
+terraform output -raw configure_kubectl > "$TMPFILE"
+# Nothing to source when the file is empty or only holds the "No outputs found" notice
+if [[ ! -s "$TMPFILE" || $(cat "$TMPFILE") == *"No outputs found"* ]]; then
+  echo "No outputs found, skipping kubectl delete"
+else
+  source "$TMPFILE"
+  kubectl delete -f src/service/ray-service.yaml
+fi
+
+# Terraform modules to destroy in sequence (pipefail: status reflects terraform, not tee)
+targets=(
+  "module.data_addons"
+  "module.eks_data_addons"
+  "module.eks_blueprints_addons"
+  "module.eks"
+  "module.vpc"
+)
+
+for target in "${targets[@]}"
+do
+  echo "Destroying module $target..."
+  destroy_output=$(set -o pipefail; terraform destroy -target="$target" -auto-approve 2>&1 | tee /dev/tty)
+  if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
+    echo "SUCCESS: Terraform destroy of $target completed successfully"
+  else
+    echo "FAILED: Terraform destroy of $target failed"
+    exit 1
+  fi
+done
+
+echo "Destroying Load Balancers..."
+for arn in $(aws resourcegroupstaggingapi get-resources \
+  --resource-type-filters elasticloadbalancing:loadbalancer \
+  --tag-filters "Key=elbv2.k8s.aws/cluster,Values=${CLUSTER_NAME}" \
+  --query 'ResourceTagMappingList[].ResourceARN' \
+  --output text); do \
+    aws elbv2 delete-load-balancer --load-balancer-arn "$arn"; \
+  done
+
+echo "Destroying Target Groups..."
+for arn in $(aws resourcegroupstaggingapi get-resources \
+  --resource-type-filters elasticloadbalancing:targetgroup \
+  --tag-filters "Key=elbv2.k8s.aws/cluster,Values=${CLUSTER_NAME}" \
+  --query 'ResourceTagMappingList[].ResourceARN' \
+  --output text); do \
+    aws elbv2 delete-target-group --target-group-arn "$arn"; \
+  done
+
+echo "Destroying Security Groups..."
+for sg in $(aws ec2 describe-security-groups \
+  --filters "Name=tag:elbv2.k8s.aws/cluster,Values=${CLUSTER_NAME}" \
+  --query 'SecurityGroups[].GroupId' --output text); do \
+    aws ec2 delete-security-group --group-id "$sg"; \
+  done
+
+## Final destroy to catch any remaining resources ($region was never set here, so no -var is passed)
+echo "Destroying remaining resources..."
+destroy_output=$(set -o pipefail; terraform destroy -auto-approve 2>&1 | tee /dev/tty)
+if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
+  echo "SUCCESS: Terraform destroy of all modules completed successfully"
+else
+  echo "FAILED: Terraform destroy of all modules failed"
+  exit 1
+fi
diff --git a/ai-ml/infrastructure/terraform/eks.tf b/ai-ml/infrastructure/terraform/eks.tf
new file mode 100644
index 000000000..3543232ec
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/eks.tf
@@ -0,0 +1,212 @@
+#---------------------------------------------------------------
+# EKS Cluster
+#---------------------------------------------------------------
+module "eks" {
+  source  = "terraform-aws-modules/eks/aws"
+  version = "~> 19.15"
+
+  cluster_name    = local.name
+  cluster_version = var.eks_cluster_version
+
+  # If true, your cluster API server is accessible from the internet.
+  # You can, optionally, limit the CIDR blocks that can access the public endpoint.
+  # WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing.
+  cluster_endpoint_public_access = true
+
+  vpc_id = module.vpc.vpc_id
+  # Filtering only Secondary CIDR private subnets starting with "100.".
+  # Subnet IDs where the EKS Control Plane ENIs will be created
+  subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+  substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
+
+  manage_aws_auth_configmap = true
+  aws_auth_roles = [
+    # We need to add in the Karpenter node IAM role for nodes launched by Karpenter
+    {
+      rolearn  = module.eks_blueprints_addons.karpenter.node_iam_role_arn
+      username = "system:node:{{EC2PrivateDNSName}}"
+      groups = [
+        "system:bootstrappers",
+        "system:nodes",
+      ]
+    }
+  ]
+  #---------------------------------------
+  # Note: These rules can be further restricted to the specific ports required by each add-on and your application
+  #---------------------------------------
+  # Extend cluster security group rules
+  cluster_security_group_additional_rules = {
+    ingress_nodes_ephemeral_ports_tcp = {
+      description                = "Nodes on ephemeral ports"
+      protocol                   = "tcp"
+      from_port                  = 0
+      to_port                    = 65535
+      type                       = "ingress"
+      source_node_security_group = true
+    }
+  }
+
+  node_security_group_additional_rules = {
+    # Allows the cluster (control plane) security group to reach worker nodes on all ports and protocols.
+    # Added to simplify the example and to avoid issues with add-ons communicating with the control plane.
+    # This can be restricted further to specific ports based on the requirements of each add-on,
+    # e.g., coreDNS 53, metrics-server 4443.
+    # Update this according to your security requirements if needed
+    ingress_cluster_to_node_all_traffic = {
+      description                   = "Cluster API to Nodegroup all traffic"
+      protocol                      = "-1"
+      from_port                     = 0
+      to_port                       = 0
+      type                          = "ingress"
+      source_cluster_security_group = true
+    }
+  }
+
+  eks_managed_node_group_defaults = {
+    iam_role_additional_policies = {
+      # Not required, but used in the example to access the nodes to inspect mounted volumes
+      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+    }
+
+    ebs_optimized = true
+    # This block device is used only for the root volume. Adjust the volume size to your needs.
+    # NOTE: Don't use this volume for ML workloads
+    block_device_mappings = {
+      xvda = {
+        device_name = "/dev/xvda"
+        ebs = {
+          volume_size = 100
+          volume_type = "gp3"
+        }
+      }
+    }
+  }
+
+  eks_managed_node_groups = {
+    #  It's recommended to have a Managed Node group for hosting critical add-ons
+    #  It's recommended to use Karpenter to place your workloads instead of using Managed Node groups
+    #  You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
+    core_node_group = {
+      name        = "core-node-group"
+      description = "EKS Core node group for hosting system add-ons"
+      # Filtering only Secondary CIDR private subnets starting with "100.".
+      # Subnet IDs where the nodes/node groups will be provisioned
+      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
+      )
+
+      # Example lookup of the recommended AMI ID (adjust the Kubernetes version and region): aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2
+      ami_type     = "AL2_x86_64" # For Graviton instances use AL2_ARM_64 instead
+      min_size     = 2
+      max_size     = 8
+      desired_size = 2
+
+      instance_types = ["m5.xlarge"]
+
+      labels = {
+        WorkerType    = "ON_DEMAND"
+        NodeGroupType = "core"
+      }
+
+      tags = merge(local.tags, {
+        Name = "core-node-grp"
+      })
+    }
+
+    # # GPU Nodegroup for JupyterHub Notebook and Ray Service (disabled; uncomment to enable)
+    # gpu1 = {
+    #   name        = "gpu-node-grp"
+    #   description = "EKS Node Group to run GPU workloads"
+    #   # Filtering only Secondary CIDR private subnets starting with "100.".
+    #   # Subnet IDs where the nodes/node groups will be provisioned
+    #   subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+    #     substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
+    #   )
+    #
+    #   ami_type     = "AL2_x86_64_GPU"
+    #   min_size     = 0
+    #   max_size     = 1
+    #   desired_size = 0
+    #
+    #   instance_types = ["g5.12xlarge"]
+    #
+    #   labels = {
+    #     WorkerType    = "ON_DEMAND"
+    #     NodeGroupType = "gpu"
+    #   }
+    #
+    #   taints = {
+    #     gpu = {
+    #       key      = "nvidia.com/gpu"
+    #       effect   = "NO_SCHEDULE"
+    #       operator = "EXISTS"
+    #     }
+    #   }
+    #
+    #   tags = merge(local.tags, {
+    #     Name = "gpu-node-grp"
+    #   })
+    # }
+
+    # # This nodegroup can be used for P4/P5 instances with, or without, a Capacity Reservation.
+    # #
+    # gpu_p5_node_group = {
+    #   name        = "p5-gpu-node-grp"
+    #   description = "EKS Node Group to run GPU workloads"
+
+    #   ami_type     = "AL2_x86_64_GPU"
+
+    #   instance_types = ["p5.48xlarge"]
+    #   capacity_type = "ON_DEMAND"
+
+    #   # Filtering only Secondary CIDR private subnets starting with "100.".
+    #   # Subnet IDs where the nodes/node groups will be provisioned
+    #   subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
+    #     substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
+    #   )
+
+    #   # If you are using a Capacity Reservation, the Subnet for the instances must match AZ for the reservation.
+    #   # subnet_ids = ["subnet-01234567890fds"]
+    #   # capacity_reservation_specification = {
+    #   #   capacity_reservation_target = {
+    #   #     capacity_reservation_id = "cr-01234567890fds"
+    #   #   }
+    #   # }
+
+    #   min_size     = 1
+    #   max_size     = 1
+    #   desired_size = 1
+
+    #   # The P Series can leverage EFA devices; below we attach EFA interfaces to all of the available slots on the instance.
+    #   # We assign the host interface device_index=0, and all other interfaces device_index=1
+    #   #   p5.48xlarge has 32 network card indexes, so the range should be 32; this creates network interfaces 0-31
+    #   #   p4 instances have 4 network card indexes, so the range should be 4; this creates network interfaces 0-3
+    #   network_interfaces = [
+    #     for i in range(32) : {
+    #       associate_public_ip_address = false
+    #       delete_on_termination       = true
+    #       device_index                = i == 0 ? 0 : 1
+    #       network_card_index          = i
+    #       interface_type              = "efa"
+    #     }
+    #   ]
+
+    #   # add `--local-disks raid0` to use the NVMe devices underneath the Pods, kubelet, containerd, and logs: https://github.com/awslabs/amazon-eks-ami/pull/1171
+    #   bootstrap_extra_args = "--local-disks raid0"
+    #   taints = {
+    #     gpu = {
+    #       key      = "nvidia.com/gpu"
+    #       effect   = "NO_SCHEDULE"
+    #       operator = "EXISTS"
+    #     }
+    #   }
+    #   labels = {
+    #     WorkerType    = "ON_DEMAND"
+    #     NodeGroupType = "gpu"
+    #   }
+    #   tags = merge(local.tags, {
+    #     Name = "p5-gpu-node-grp"
+    #   })
+    # }
+  }
+}
diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre.tf b/ai-ml/infrastructure/terraform/fsx-for-lustre.tf
new file mode 100644
index 000000000..d97eef9b0
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/fsx-for-lustre.tf
@@ -0,0 +1,138 @@
+#---------------------------------------------------------------
+# FSx for Lustre File system Static provisioning
+#    1> Create FSx for Lustre filesystem (for the PERSISTENT_2 SSD type used below, storage capacity must be 1200 GiB or a multiple of 2400 GiB)
+#    2> Create Storage Class for Filesystem (Cluster scoped)
+#    3> Persistent Volume with hardcoded reference to the FSx for Lustre filesystem via filesystem_id and dns_name (Cluster scoped)
+#    4> Persistent Volume Claim for this Persistent Volume will always use the same file system (Namespace scoped)
+#---------------------------------------------------------------
+# NOTE: FSx for Lustre file system creation can take up to 10 mins
+resource "aws_fsx_lustre_file_system" "this" {
+  count = var.deploy_fsx_volume ? 1 : 0
+  # Storage layout
+  storage_type                = "SSD"
+  deployment_type             = "PERSISTENT_2"
+  storage_capacity            = 2400
+  per_unit_storage_throughput = "500" # 125, 250, 500, 1000
+  # Single-subnet placement, guarded by the dedicated FSx security group
+  security_group_ids = [aws_security_group.fsx[0].id]
+  subnet_ids         = [module.vpc.private_subnets[0]]
+  log_configuration {
+    level = "WARN_ERROR"
+  }
+  tags = merge({ "Name" : "${local.name}-static" }, local.tags)
+}
+
+# Associates the file system's /data path with the S3 bucket; creation can take up to 7 mins
+resource "aws_fsx_data_repository_association" "this" {
+  count = var.deploy_fsx_volume ? 1 : 0
+
+  file_system_id       = aws_fsx_lustre_file_system.this[0].id
+  file_system_path     = "/data" # This directory will be used in Spark podTemplates under volumeMounts as subPath
+  data_repository_path = "s3://${module.fsx_s3_bucket[0].s3_bucket_id}"
+  s3 {
+    auto_import_policy {
+      events = ["NEW", "CHANGED", "DELETED"]
+    }
+
+    auto_export_policy {
+      events = ["NEW", "CHANGED", "DELETED"]
+    }
+  }
+}
+
+#---------------------------------------------------------------
+# Sec group for FSx for Lustre
+#---------------------------------------------------------------
+resource "aws_security_group" "fsx" {
+  count       = var.deploy_fsx_volume ? 1 : 0
+  name        = "${local.name}-fsx"
+  description = "Allow inbound traffic from private subnets of the VPC to FSx filesystem"
+  vpc_id      = module.vpc.vpc_id
+
+  ingress {
+    description = "Allows Lustre traffic between FSx for Lustre file servers and Lustre clients"
+    cidr_blocks = module.vpc.private_subnets_cidr_blocks
+    from_port   = 1018 # AWS documents TCP 1018-1023 for FSx for Lustre; 1021-1023 left ports 1018-1020 closed
+    to_port     = 1023
+    protocol    = "tcp"
+  }
+  ingress {
+    description = "Allows Lustre traffic between Lustre clients"
+    cidr_blocks = module.vpc.private_subnets_cidr_blocks
+    from_port   = 988
+    to_port     = 988
+    protocol    = "tcp"
+  }
+  tags = local.tags
+}
+
+#---------------------------------------------------------------
+# S3 bucket for DataSync between FSx for Lustre and S3 Bucket
+#---------------------------------------------------------------
+#tfsec:ignore:aws-s3-enable-bucket-logging tfsec:ignore:aws-s3-enable-versioning
+module "fsx_s3_bucket" {
+  source  = "terraform-aws-modules/s3-bucket/aws"
+  version = "~> 3.0"
+  count   = var.deploy_fsx_volume ? 1 : 0
+
+  bucket_prefix = "${local.name}-fsx-"
+  create_bucket = true
+
+  # Example-only setting - please evaluate for your environment before keeping it
+  force_destroy = true
+
+  server_side_encryption_configuration = {
+    rule = {
+      apply_server_side_encryption_by_default = {
+        sse_algorithm = "AES256"
+      }
+    }
+  }
+}
+
+#---------------------------------------------------------------
+# Storage Class - FSx for Lustre
+#---------------------------------------------------------------
+resource "kubectl_manifest" "storage_class" {
+  count = var.deploy_fsx_volume ? 1 : 0
+
+  yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-storage-class.yaml", {
+    security_group_id = aws_security_group.fsx[0].id,
+    subnet_id         = module.vpc.private_subnets[0]
+  })
+
+  # Apply the manifest only after the add-ons module has been created
+  depends_on = [module.eks_blueprints_addons]
+}
+
+#---------------------------------------------------------------
+# FSx for Lustre Persistent Volume - Static provisioning
+#---------------------------------------------------------------
+resource "kubectl_manifest" "static_pv" {
+  count = var.deploy_fsx_volume ? 1 : 0
+
+  yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pv.yaml", {
+    dns_name      = aws_fsx_lustre_file_system.this[0].dns_name,
+    filesystem_id = aws_fsx_lustre_file_system.this[0].id,
+    mount_name    = aws_fsx_lustre_file_system.this[0].mount_name
+  })
+  depends_on = [
+    aws_fsx_lustre_file_system.this,
+    kubectl_manifest.storage_class,
+    module.eks_blueprints_addons
+  ]
+}
+
+#---------------------------------------------------------------
+# FSx for Lustre Persistent Volume Claim
+#---------------------------------------------------------------
+resource "kubectl_manifest" "static_pvc" {
+  count = var.deploy_fsx_volume ? 1 : 0
+
+  yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pvc.yaml", {})
+  depends_on = [
+    aws_fsx_lustre_file_system.this,
+    kubectl_manifest.static_pv,
+    kubectl_manifest.storage_class,
+    module.eks_blueprints_addons
+  ]
+}
diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml
new file mode 100644
index 000000000..857bdcf3a
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml
@@ -0,0 +1,21 @@
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: fsx-static-pv
+spec:
+  capacity:
+    storage: 1000Gi
+  volumeMode: Filesystem
+  storageClassName: fsx
+  accessModes:
+    - ReadWriteMany
+  mountOptions:
+    - flock
+  persistentVolumeReclaimPolicy: Retain  # Recycle is deprecated and only implemented for NFS/hostPath; Retain leaves the pre-provisioned file system untouched
+  csi:
+    driver: fsx.csi.aws.com
+    volumeHandle: ${filesystem_id}
+    volumeAttributes:
+      dnsname: ${dns_name}
+      mountname: ${mount_name}
diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml
new file mode 100644
index 000000000..dddebd66c
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml
@@ -0,0 +1,12 @@
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: fsx-static-pvc
+spec:
+  volumeName: fsx-static-pv
+  storageClassName: fsx
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 1000Gi
diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml
new file mode 100644
index 000000000..125fb2478
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml
@@ -0,0 +1,9 @@
+---
+kind: StorageClass
+apiVersion: storage.k8s.io/v1
+metadata:
+  name: fsx
+provisioner: fsx.csi.aws.com
+parameters:
+  securityGroupIds: ${security_group_id}
+  subnetId: ${subnet_id}
diff --git a/ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml b/ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml
new file mode 100644
index 000000000..de495c16a
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml
@@ -0,0 +1,4 @@
+# Admission webhook settings for Argo Events
+webhook:
+  # Turns the admission webhook on; applies only for cluster-wide installation
+  enabled: true
diff --git a/ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml b/ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml
new file mode 100644
index 000000000..86c764042
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml
@@ -0,0 +1,4 @@
+server:
+  autoscaling:
+    minReplicas: 1
+    enabled: true
diff --git a/ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml
new file mode 100644
index 000000000..ae3c41d44
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml
@@ -0,0 +1,11 @@
+resources:
+  requests:
+    cpu: 200m
+    memory: 1Gi
+  limits:
+    cpu: 500m
+    memory: 2Gi
+
+# A key-less "Exists" toleration matches every taint, so the DaemonSet pods can be scheduled on any node.
+tolerations:
+  - operator: Exists
diff --git a/ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml b/ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml
new file mode 100644
index 000000000..c214e10ba
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml
@@ -0,0 +1,5 @@
+tolerations:
+  - operator: Exists
+  - effect: NoSchedule
+    key: nvidia.com/gpu
+    operator: Exists
diff --git a/ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml b/ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml
new file mode 100644
index 000000000..c8b1a5d74
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml
@@ -0,0 +1,11 @@
+controller:
+  service:
+    targetPorts:
+      https: http
+      http: http
+    externalTrafficPolicy: Local
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-type: external
+      service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml
new file mode 100644
index 000000000..03ce4b4be
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml
@@ -0,0 +1,54 @@
+hub:
+  config:
+    JupyterHub:
+      authenticator_class: dummy
+    # testing only - do not do this for production
+    DummyAuthenticator:
+      password: never-do-this
+    Authenticator:
+      allowed_users:
+        - user1
+      admin_users:
+        - admin1
+proxy:
+  service:
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-type: external
+      service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
+singleuser:
+  cmd: null
+  startTimeout: 600
+  image:
+    name: public.ecr.aws/h3o5n2r0/gpu-jupyter
+    tag: v1.5_cuda-11.6_ubuntu-20.04_python-only
+    pullPolicy: Always
+  memory:
+    guarantee: 24G
+  extraResource:
+    limits:
+      nvidia.com/gpu: "1"
+  extraTolerations:
+    - effect: NoSchedule
+      key: nvidia.com/gpu
+      operator: Exists
+  extraEnv:
+    HUGGING_FACE_HUB_TOKEN:
+      valueFrom:
+        secretKeyRef:
+          key: token
+          name: hf-token
+  storage:
+    capacity: 100Gi
+    extraVolumeMounts:
+      - mountPath: /dev/shm
+        name: shm-volume
+    extraVolumes:
+      - emptyDir:
+          medium: Memory
+        name: shm-volume
+scheduling:
+  userScheduler:
+    enabled: false
diff --git a/ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml
new file mode 100644
index 000000000..47e090743
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml
@@ -0,0 +1,24 @@
+prometheus:
+  prometheusSpec:
+    retention: 5h
+    scrapeInterval: 30s
+    evaluationInterval: 30s
+    scrapeTimeout: 10s
+    serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
+    storageSpec:
+      volumeClaimTemplate:
+        metadata:
+          name: data
+        spec:
+          storageClassName: ${storage_class_type}
+          accessModes:
+          - ReadWriteOnce
+          resources:
+            requests:
+              storage: 50Gi
+alertmanager:
+  enabled: false
+
+grafana:
+  enabled: true
+  defaultDashboardsEnabled: true
diff --git a/ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml b/ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml
new file mode 100644
index 000000000..178eb68cf
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml
@@ -0,0 +1,69 @@
+
+# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090
+
+global:
+  prometheus:
+    enabled: true  # Kubecost depends on Prometheus data, it is not optional. When enabled: false, Prometheus will not be installed and you must configure your own Prometheus to scrape kubecost as well as provide the fqdn below. -- Warning: Before changing this setting, please read to understand the risks https://docs.kubecost.com/install-and-configure/install/custom-prom
+    fqdn: http://cost-analyzer-prometheus-server.default.svc  # example address of a prometheus to connect to. Include protocol (http:// or https://) Ignored if enabled: true
+
+  grafana:
+    enabled: true  # If false, Grafana will not be installed
+    proxy: true  # If true, the kubecost frontend will route to your grafana through its service endpoint
+    scheme: http  # http or https, for the domain name below.
+    domainName: cost-analyzer-grafana.default.svc  # example grafana domain Ignored if enabled: true
+
+  # pricingCsv:
+  #   enabled: false
+  #   location:
+  #     provider: "AWS"
+  #     region: "us-east-1"
+  #     URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI
+  #     csvAccessCredentials: pricing-schema-access-secret
+
+# Define persistence volume for cost-analyzer
+persistentVolume:
+  enabled: true # Note that setting this to false means configurations will be wiped out on pod restart.
+  size: 32Gi
+  dbSize: 32.0Gi
+
+reporting:
+  productAnalytics: false
+
+kubecostModel:
+  image: public.ecr.aws/kubecost/cost-model
+  resources:
+    requests:
+      cpu: 500m
+      memory: 512Mi
+
+kubecostFrontend:
+  image: public.ecr.aws/kubecost/frontend
+  resources:
+    requests:
+      cpu: 200m
+      memory: 512Mi
+
+kubecostMetrics:
+  emitNamespaceAnnotations: true
+  emitPodAnnotations: true
+
+prometheus:
+  configmapReload:
+    prometheus:
+      image:
+        repository: public.ecr.aws/kubecost/prometheus-config-reloader
+
+  server:
+    image:
+      repository: public.ecr.aws/kubecost/prometheus
+
+networkCosts:
+  image:
+    repository: public.ecr.aws/kubecost/kubecost-network-costs
+
+clusterController:
+  image:
+    repository: public.ecr.aws/kubecost/cluster-controller
+
+forecasting:
+  fullImageName: public.ecr.aws/kubecost/kubecost-modeling:v0.1.6
diff --git a/ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml b/ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml
new file mode 100644
index 000000000..1f604f610
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml
@@ -0,0 +1,88 @@
+# Default values for mlflow-tracking-server.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+image:
+  repository: public.ecr.aws/data-on-eks/mlflow
+  pullPolicy: Always
+  tag: 2.7.1
+
+imagePullSecrets: []
+
+nameOverride: mlflow-tracking-server
+
+fullnameOverride: mlflow-tracking-server
+
+podAnnotations: {}
+
+replicaCount: 1
+
+service:
+  type: ClusterIP
+  port: 5000
+
+serviceAccount:
+  # Specifies whether a service account should be created
+  create: false
+  # Annotations to add to the service account
+  annotations:
+    eks.amazonaws.com/role-arn: ${mlflow_irsa}
+  labels: {}
+  # The name of the service account to use.
+  # If not set and create is true, a name is generated using the fullname template
+  name: ${mlflow_sa}
+
+ingress:
+  enabled: true
+  className: nginx
+  annotations:
+    kubernetes.io/ingress.class: nginx
+    nginx.ingress.kubernetes.io/use-regex: "true"
+  hosts:
+    - host:
+      paths:
+        - path: /
+          pathType: Prefix
+  tls: []
+  #  - secretName: chart-example-tls
+  #    hosts:
+  #      - chart-example.local
+
+mlflow:
+  artifacts:
+    bucketName: ${s3_bucket_name}
+  # The placeholders below are substituted before the YAML is parsed; they are single-quoted so that
+  database:  # special characters or number/boolean-looking values are always read as plain strings
+    name: '${mlflow_db_name}'
+    username: '${mlflow_db_username}'
+    password: '${mlflow_db_password}'
+    host: '${mlflow_db_host}'
+    port: 5432
+podSecurityContext: {}
+  # fsGroup: 2000
+
+securityContext: {}
+  # capabilities:
+  #   drop:
+  #   - ALL
+  # readOnlyRootFilesystem: true
+  # runAsNonRoot: true
+  # runAsUser: 1000
+
+resources: {}
+  # We usually recommend not to specify default resources and to leave this as a conscious
+  # choice for the user. This also increases chances charts run on environments with little
+  # resources, such as Minikube. If you do want to specify resources, uncomment the following
+  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
+  # limits:
+  #   cpu: 100m
+  #   memory: 128Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 128Mi
+
+nodeSelector: {}
+
+tolerations: []
+
+affinity: {}
diff --git a/ai-ml/infrastructure/terraform/install.sh b/ai-ml/infrastructure/terraform/install.sh
new file mode 100755
index 000000000..1814a9044
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/install.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# List of Terraform modules to apply in sequence
+targets=(
+  "module.vpc"
+  "module.eks"
+)
+
+# Initialize Terraform
+terraform init -upgrade
+
+# Apply modules in sequence. `set -o pipefail` inside $(...) makes the substitution's exit status (read via PIPESTATUS[0]) reflect a terraform failure instead of only tee's status.
+for target in "${targets[@]}"
+do
+  echo "Applying module $target..."
+  apply_output=$(set -o pipefail; terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty)
+  if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
+    echo "SUCCESS: Terraform apply of $target completed successfully"
+  else
+    echo "FAILED: Terraform apply of $target failed"
+    exit 1
+  fi
+done
+
+# Final apply to catch any remaining resources (same pipefail handling as above)
+echo "Applying remaining resources..."
+apply_output=$(set -o pipefail; terraform apply -auto-approve 2>&1 | tee /dev/tty)
+if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
+  echo "SUCCESS: Terraform apply of all modules completed successfully"
+else
+  echo "FAILED: Terraform apply of all modules failed"
+  exit 1
+fi
diff --git a/ai-ml/infrastructure/terraform/karpenter.tf b/ai-ml/infrastructure/terraform/karpenter.tf
new file mode 100644
index 000000000..e69de29bb
diff --git a/ai-ml/infrastructure/terraform/main.tf b/ai-ml/infrastructure/terraform/main.tf
new file mode 100644
index 000000000..938dc4b74
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/main.tf
@@ -0,0 +1,60 @@
+provider "aws" {
+  region = local.region
+}
+
+# ECR always authenticates with the `us-east-1` region: https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
+provider "aws" {
+  region = "us-east-1"
+  alias  = "ecr"
+}
+
+provider "kubernetes" {
+  token                  = data.aws_eks_cluster_auth.this.token
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+}
+
+provider "helm" {
+  kubernetes {
+    token                  = data.aws_eks_cluster_auth.this.token
+    host                   = module.eks.cluster_endpoint
+    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  }
+}
+
+provider "kubectl" {
+  load_config_file       = false
+  apply_retry_count      = 30
+  token                  = data.aws_eks_cluster_auth.this.token
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+}
+
+data "aws_caller_identity" "current" {}
+data "aws_partition" "current" {}
+data "aws_availability_zones" "available" {}
+
+# Token the kubernetes, helm and kubectl providers use to authenticate to the cluster
+data "aws_eks_cluster_auth" "this" {
+  name = module.eks.cluster_name
+}
+
+data "aws_ecrpublic_authorization_token" "token" {
+  provider = aws.ecr # aliased provider pinned to us-east-1
+}
+
+locals {
+  name                   = var.name
+  region                 = var.region
+  partition              = data.aws_partition.current.partition
+  account_id             = data.aws_caller_identity.current.account_id
+  azs                    = slice(data.aws_availability_zones.available.names, 0, 2)
+  mlflow_name            = "mlflow"
+  mlflow_namespace       = "mlflow"
+  mlflow_service_account = "mlflow"
+
+  tags = {
+    GithubRepo = "github.com/awslabs/data-on-eks"
+    Blueprint  = local.name
+  }
+}
diff --git a/ai-ml/infrastructure/terraform/mlflow-core.tf b/ai-ml/infrastructure/terraform/mlflow-core.tf
new file mode 100644
index 000000000..55dafeff1
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/mlflow-core.tf
@@ -0,0 +1,245 @@
+#---------------------------------------------------------------
+# MLflow backend store: RDS for PostgreSQL (only when MLflow tracking is enabled)
+#---------------------------------------------------------------
+module "db" {
+  count   = var.enable_mlflow_tracking ? 1 : 0
+  source  = "terraform-aws-modules/rds/aws"
+  version = "~> 5.0"
+
+  identifier = local.mlflow_name
+
+  instance_class       = "db.m6i.xlarge"
+  engine               = "postgres"
+  major_engine_version = "14"
+  engine_version       = "14.13"
+  family               = "postgres14"
+
+  allocated_storage = 100
+  storage_type      = "io1"
+  iops              = 3000
+
+  port                   = 5432
+  db_name                = local.mlflow_name
+  username               = local.mlflow_name
+  password               = sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string)
+  create_random_password = false # the password above is read from the Secrets Manager secret version
+
+  vpc_security_group_ids = [module.security_group[0].security_group_id]
+  db_subnet_group_name   = module.vpc.database_subnet_group
+  multi_az               = true
+
+  backup_window                   = "03:00-06:00"
+  maintenance_window              = "Mon:00:00-Mon:03:00"
+  create_cloudwatch_log_group     = true
+  enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"]
+
+  deletion_protection     = false
+  skip_final_snapshot     = true
+  backup_retention_period = 5
+
+  monitoring_interval                   = 60
+  monitoring_role_name                  = "mlflow-backend"
+  monitoring_role_use_name_prefix       = true
+  monitoring_role_description           = "MLflow Postgres Backend for monitoring role"
+  create_monitoring_role                = true
+  performance_insights_enabled          = true
+  performance_insights_retention_period = 7
+
+  parameters = [
+    {
+      name  = "autovacuum"
+      value = 1
+    },
+    {
+      name  = "client_encoding"
+      value = "utf8"
+    }
+  ]
+
+  tags = local.tags
+}
+
+#---------------------------------------------------------------
+# Master password for the MLflow Postgres backend, stored in Secrets Manager
+#---------------------------------------------------------------
+resource "random_password" "postgres" {
+  count   = var.enable_mlflow_tracking ? 1 : 0
+  special = false # no special characters in the generated password
+  length  = 16
+}
+#tfsec:ignore:aws-ssm-secret-use-customer-key
+resource "aws_secretsmanager_secret" "postgres" {
+  count                   = var.enable_mlflow_tracking ? 1 : 0
+  recovery_window_in_days = 0 # Example-only: zero forces deletion of the secret during Terraform destroy
+  name                    = local.mlflow_name
+}
+
+resource "aws_secretsmanager_secret_version" "postgres" {
+  count         = var.enable_mlflow_tracking ? 1 : 0
+  secret_string = random_password.postgres[0].result # the generated password is the secret value
+  secret_id     = aws_secretsmanager_secret.postgres[0].id
+}
+
+#---------------------------------------------------------------
+# Security group attached to the MLflow PostgreSQL RDS instance
+#---------------------------------------------------------------
+module "security_group" {
+  count   = var.enable_mlflow_tracking ? 1 : 0
+  source  = "terraform-aws-modules/security-group/aws"
+  version = "~> 5.0"
+
+  vpc_id      = module.vpc.vpc_id
+  name        = local.name
+  description = "Complete PostgreSQL example security group"
+
+  # Allow tcp/5432 from the primary and the first secondary VPC CIDR block
+  ingress_with_cidr_blocks = [
+    {
+      description = "PostgreSQL access from within VPC"
+      protocol    = "tcp"
+      from_port   = 5432
+      to_port     = 5432
+      cidr_blocks = "${module.vpc.vpc_cidr_block},${module.vpc.vpc_secondary_cidr_blocks[0]}"
+    },
+  ]
+
+  tags = local.tags
+}
+
+
+#---------------------------------------------------------------
+# MLflow artifact store (S3 bucket)
+#---------------------------------------------------------------
+
+#tfsec:ignore:*
+module "mlflow_s3_bucket" {
+  count   = var.enable_mlflow_tracking ? 1 : 0
+  source  = "terraform-aws-modules/s3-bucket/aws"
+  version = "~> 3.0"
+
+  bucket_prefix = "${local.name}-artifacts-" # used as a name prefix for the bucket
+
+  server_side_encryption_configuration = {
+    rule = {
+      apply_server_side_encryption_by_default = {
+        sse_algorithm = "AES256"
+      }
+    }
+  }
+
+  # Example-only setting - evaluate it before using this in your own environment
+  force_destroy = true
+
+  tags = local.tags
+}
+
+#---------------------------------------------------------------
+# Kubernetes namespace for MLflow (only when MLflow tracking is enabled)
+#---------------------------------------------------------------
+resource "kubernetes_namespace_v1" "mlflow" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+  timeouts {
+    delete = "15m" # delete timeout for the namespace
+  }
+  metadata {
+    name = local.mlflow_namespace
+  }
+}
+
+resource "kubernetes_service_account_v1" "mlflow" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+  metadata {
+    namespace   = kubernetes_namespace_v1.mlflow[0].metadata[0].name
+    name        = local.mlflow_service_account
+    annotations = { "eks.amazonaws.com/role-arn" = module.mlflow_irsa[0].iam_role_arn } # points at the IRSA role
+  }
+
+  automount_service_account_token = true
+}
+
+resource "kubernetes_secret_v1" "mlflow" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+  metadata {
+    namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name
+    name      = "${local.mlflow_service_account}-secret"
+    annotations = {
+      "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.mlflow[0].metadata[0].name
+      "kubernetes.io/service-account.name"      = kubernetes_service_account_v1.mlflow[0].metadata[0].name
+    }
+  }
+
+  type = "kubernetes.io/service-account-token" # token secret tied to the service account named in the annotations
+}
+
+# IRSA: IAM role for the MLflow service account, created only when MLflow is enabled
+module "mlflow_irsa" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+
+  source  = "aws-ia/eks-blueprints-addon/aws"
+  version = "~> 1.0" # keep this updated to the latest/desired version
+
+  # No Helm release is installed through this module instance
+  create_release = false
+
+  # The role is created here; its policy comes from the separate aws_iam_policy resource
+  create_policy = false
+  create_role   = true
+
+  role_policies = { mlflow_policy = aws_iam_policy.mlflow[0].arn }
+  role_name     = local.mlflow_service_account
+
+  oidc_providers = {
+    this = {
+      service_account = local.mlflow_service_account
+      namespace       = kubernetes_namespace_v1.mlflow[0].metadata[0].name
+      provider_arn    = module.eks.oidc_provider_arn
+    }
+  }
+
+  tags = local.tags
+}
+
+#--------------------------------------------------------------------------
+# IAM policy granting MLflow access to its S3 artifacts and RDS Postgres backend
+#--------------------------------------------------------------------------
+resource "aws_iam_policy" "mlflow" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+
+  name_prefix = format("%s-%s-", local.name, "mlflow")
+  path        = "/"
+  description = "IAM policy for MLflow"
+  policy      = data.aws_iam_policy_document.mlflow[0].json
+}
+
+data "aws_iam_policy_document" "mlflow" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+  statement {
+    sid       = "" # bucket-level access: list the artifact bucket
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}"]
+
+    actions = [
+      "s3:ListBucket"
+    ]
+  }
+  statement {
+    sid       = "" # object-level access: read, write and delete artifacts
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}/*"]
+
+    actions = [
+      "s3:GetObject",
+      "s3:PutObject",
+      "s3:DeleteObject"
+    ]
+  }
+  statement {
+    sid       = "" # rds-db ARNs identify the instance by its DbiResourceId, not by the database name
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_resource_id}/${local.mlflow_name}"]
+
+    actions = [
+      "rds-db:connect",
+    ]
+  }
+}
diff --git a/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
new file mode 100644
index 000000000..c3ffe67d3
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
@@ -0,0 +1,82 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: dcgm-exporter
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: "3.6.1"
+spec:
+  updateStrategy:
+    type: RollingUpdate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: dcgm-exporter
+      app.kubernetes.io/version: "3.6.1"
+  template:
+    metadata:
+      name: dcgm-exporter
+      labels:
+        app.kubernetes.io/name: dcgm-exporter
+        app.kubernetes.io/version: "3.6.1"
+    spec:
+      nodeSelector:
+        accelerator: nvidia  # schedule only onto nodes labelled accelerator=nvidia
+      tolerations:
+        - key: nvidia.com/gpu  # tolerate the nvidia.com/gpu NoSchedule taint
+          operator: Exists
+          effect: NoSchedule
+      volumes:
+        - name: pod-gpu-resources
+          hostPath:
+            path: /var/lib/kubelet/pod-resources
+      containers:
+        - name: dcgm-exporter
+          image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.9-3.6.1-ubuntu22.04
+          env:
+            - name: DCGM_EXPORTER_LISTEN
+              value: ":9400"  # same port as the "metrics" containerPort
+            - name: DCGM_EXPORTER_KUBERNETES
+              value: "true"
+          ports:
+            - name: metrics
+              containerPort: 9400
+          securityContext:
+            runAsNonRoot: false
+            runAsUser: 0
+            capabilities:
+              add: ["SYS_ADMIN"]
+          volumeMounts:
+            - name: pod-gpu-resources
+              mountPath: /var/lib/kubelet/pod-resources
+              readOnly: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: dcgm-exporter
+  namespace: kube-system
+  labels:
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: "3.6.1"
+spec:
+  ports:
+    - name: metrics
+      port: 9400  # no targetPort is set
+  selector:
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/version: "3.6.1"
diff --git a/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml
new file mode 100644
index 000000000..2ed065546
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml
@@ -0,0 +1,42 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: neuron-monitor
+  namespace: kube-system
+  labels:
+    app: neuron-monitor
+    version: v1
+spec:
+  selector:
+    matchLabels:
+      app: neuron-monitor
+  template:
+    metadata:
+      labels:
+        app: neuron-monitor
+        version: v1
+    spec:
+      containers:
+        - name: neuron-monitor
+          image: public.ecr.aws/neuron/neuron-monitor:1.1.0
+          ports:
+            - containerPort: 8000
+          command:
+            - "/opt/bin/entrypoint.sh"
+          args:
+            - "--port"
+            - "8000"  # same value as containerPort above
+          resources:
+            limits:
+              cpu: 500m
+              memory: 256Mi
+            requests:
+              cpu: 256m
+              memory: 128Mi
+          env:
+            - name: GOMEMLIMIT
+              value: 160MiB  # set below the 256Mi container memory limit
+          securityContext:
+            privileged: true
+      nodeSelector:
+        accelerator: neuron  # schedule only onto nodes labelled accelerator=neuron
diff --git a/ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml b/ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml
new file mode 100644
index 000000000..8ade99739
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: ray-workers-monitor
+  namespace: kube-prometheus-stack
+  labels:
+    # `release: $HELM_RELEASE`: Prometheus can only detect PodMonitor with this label.
+    release: kube-prometheus-stack
+spec:
+  jobLabel: ray-workers
+  # Only select Kubernetes Pods in the "rayserve-vllm" namespace.
+  namespaceSelector:
+    matchNames:
+      - rayserve-vllm
+  # Only select Kubernetes Pods that carry the labels listed under "matchLabels".
+  selector:
+    matchLabels:
+      ray.io/node-type: worker
+  # Endpoints scraped by this PodMonitor, referenced by container port name.
+  podMetricsEndpoints:
+    - port: metrics
diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json
new file mode 100644
index 000000000..26d11b3f1
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json
@@ -0,0 +1,4535 @@
+{
+    "annotations": {
+        "list": [
+            {
+                "builtIn": 1,
+                "datasource": "-- Grafana --",
+                "enable": true,
+                "hide": true,
+                "iconColor": "rgba(0, 211, 255, 1)",
+                "name": "Annotations & Alerts",
+                "type": "dashboard"
+            }
+        ]
+    },
+    "editable": true,
+    "gnetId": null,
+    "graphTooltip": 0,
+    "iteration": 1667344411089,
+    "links": [],
+    "panels": [
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 0
+            },
+            "hiddenSeries": false,
+            "id": 1,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Bytes Spilled",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Amount allocated by dataset operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 0
+            },
+            "hiddenSeries": false,
+            "id": 2,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Bytes Allocated",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Amount freed by dataset operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 1
+            },
+            "hiddenSeries": false,
+            "id": 3,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Bytes Freed",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Amount of memory store used by dataset operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 1
+            },
+            "hiddenSeries": false,
+            "id": 4,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Current Usage: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Object Store Memory",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Logical CPUs allocated to dataset operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 2
+            },
+            "hiddenSeries": false,
+            "id": 5,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "CPU Usage: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "CPUs (logical slots)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "cores",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Logical GPUs allocated to dataset operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 2
+            },
+            "hiddenSeries": false,
+            "id": 6,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "GPU Usage: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "GPUs (logical slots)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "cores",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Total bytes outputted by dataset operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 3
+            },
+            "hiddenSeries": false,
+            "id": 7,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Outputted: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Bytes Outputted",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Total rows outputted by dataset operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 3
+            },
+            "hiddenSeries": false,
+            "id": 11,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Rows Outputted: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Rows Outputted",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "rows",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of input blocks received by operator.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 4
+            },
+            "hiddenSeries": false,
+            "id": 17,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Blocks Received: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Input Blocks Received by Operator",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "blocks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of input blocks received by operator.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 4
+            },
+            "hiddenSeries": false,
+            "id": 18,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Received: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Input Bytes Received by Operator",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of input blocks that operator's tasks have finished processing.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 5
+            },
+            "hiddenSeries": false,
+            "id": 19,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Blocks Processed: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Input Blocks Processed by Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "blocks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of input blocks that operator's tasks have finished processing.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 5
+            },
+            "hiddenSeries": false,
+            "id": 20,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Processed: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Input Bytes Processed by Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of input blocks passed to submitted tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 6
+            },
+            "hiddenSeries": false,
+            "id": 21,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Submitted: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Input Bytes Submitted to Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of output blocks generated by tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 6
+            },
+            "hiddenSeries": false,
+            "id": 22,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Blocks Generated: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Blocks Generated by Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "blocks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of output blocks generated by tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 7
+            },
+            "hiddenSeries": false,
+            "id": 23,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Generated: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Bytes Generated by Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of rows in generated output blocks from finished tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 7
+            },
+            "hiddenSeries": false,
+            "id": 24,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Rows Generated: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Rows Generated by Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "rows",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of output blocks that are already taken by downstream operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 8
+            },
+            "hiddenSeries": false,
+            "id": 25,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Blocks Taken: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Output Blocks Taken by Downstream Operators",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "blocks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of output blocks that are already taken by downstream operators.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 8
+            },
+            "hiddenSeries": false,
+            "id": 26,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Taken: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Output Bytes Taken by Downstream Operators",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of submitted tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 9
+            },
+            "hiddenSeries": false,
+            "id": 29,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Submitted Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of running tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 9
+            },
+            "hiddenSeries": false,
+            "id": 30,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Running Tasks: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Running Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of tasks that already have output.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 10
+            },
+            "hiddenSeries": false,
+            "id": 31,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Tasks with output blocks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of finished tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 10
+            },
+            "hiddenSeries": false,
+            "id": 32,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Finished Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of failed tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 11
+            },
+            "hiddenSeries": false,
+            "id": 33,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Failed Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Time spent generating blocks in tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 11
+            },
+            "hiddenSeries": false,
+            "id": 8,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Block Generation Time",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "seconds",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Time spent in task submission backpressure.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 12
+            },
+            "hiddenSeries": false,
+            "id": 37,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Task Submission Backpressure Time",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "seconds",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of blocks in the operator's internal input queue.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 12
+            },
+            "hiddenSeries": false,
+            "id": 13,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Operator Internal Inqueue Size (Blocks)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "blocks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of input blocks in the operator's internal input queue.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 13
+            },
+            "hiddenSeries": false,
+            "id": 14,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Operator Internal Inqueue Size (Bytes)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of blocks in the operator's internal output queue.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 13
+            },
+            "hiddenSeries": false,
+            "id": 15,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Operator Internal Outqueue Size (Blocks)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "blocks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of output blocks in the operator's internal output queue.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 14
+            },
+            "hiddenSeries": false,
+            "id": 16,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Operator Internal Outqueue Size (Bytes)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of input blocks used by pending tasks.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 14
+            },
+            "hiddenSeries": false,
+            "id": 34,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Size of Blocks used in Pending Tasks (Bytes)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of freed memory in object store.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 15
+            },
+            "hiddenSeries": false,
+            "id": 35,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Freed Memory in Object Store (Bytes)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Byte size of spilled memory in object store.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 15
+            },
+            "hiddenSeries": false,
+            "id": 36,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
+                    "interval": "",
+                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Spilled Memory in Object Store (Bytes)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Seconds spent in iterator initialization code.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 16
+            },
+            "hiddenSeries": false,
+            "id": 12,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)",
+                    "interval": "",
+                    "legendFormat": "Seconds: {{dataset}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Iteration Initialization Time",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "seconds",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Seconds user thread is blocked by iter_batches().",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 16
+            },
+            "hiddenSeries": false,
+            "id": 9,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)",
+                    "interval": "",
+                    "legendFormat": "Seconds: {{dataset}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Iteration Blocked Time",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "seconds",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Seconds spent in user code.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 17
+            },
+            "hiddenSeries": false,
+            "id": 10,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)",
+                    "interval": "",
+                    "legendFormat": "Seconds: {{dataset}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Iteration User Time",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "seconds",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        }
+    ],
+    "refresh": false,
+    "schemaVersion": 27,
+    "style": "dark",
+    "tags": [
+        "rayVersion:2.24.0"
+    ],
+    "templating": {
+        "list": [
+            {
+                "current": {
+                    "selected": false
+                },
+                "description": "Filter queries of a specific Prometheus type.",
+                "hide": 2,
+                "includeAll": false,
+                "multi": false,
+                "name": "datasource",
+                "options": [],
+                "query": "prometheus",
+                "refresh": 1,
+                "regex": "",
+                "skipUrlSync": false,
+                "type": "datasource"
+            },
+            {
+                "allValue": ".+",
+                "current": {
+                    "selected": false
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_data_allocated_bytes{}, SessionName)",
+                "description": "Filter queries to specific ray sessions.",
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": false,
+                "name": "SessionName",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_data_allocated_bytes{}, SessionName)",
+                    "refId": "StandardVariableQuery"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 2,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            },
+            {
+                "allValue": ".+",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_data_allocated_bytes{}, dataset)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": true,
+                "name": "DatasetID",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_data_allocated_bytes{}, dataset)",
+                    "refId": "Prometheus-Dataset-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            }
+        ]
+    },
+    "rayMeta": [
+        "excludesSystemRoutes",
+        "supportsGlobalFilterOverride"
+    ],
+    "time": {
+        "from": "now-30m",
+        "to": "now"
+    },
+    "timepicker": {},
+    "timezone": "",
+    "title": "Data Dashboard",
+    "uid": "rayDataDashboard",
+    "version": 1
+}
diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json
new file mode 100644
index 000000000..7814395f5
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json
@@ -0,0 +1,2836 @@
+{
+    "annotations": {
+        "list": [
+            {
+                "builtIn": 1,
+                "datasource": "-- Grafana --",
+                "enable": true,
+                "hide": true,
+                "iconColor": "rgba(0, 211, 255, 1)",
+                "name": "Annotations & Alerts",
+                "type": "dashboard"
+            }
+        ]
+    },
+    "editable": true,
+    "gnetId": null,
+    "graphTooltip": 0,
+    "iteration": 1667344411089,
+    "links": [],
+    "panels": [
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 0
+            },
+            "hiddenSeries": false,
+            "id": 26,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)",
+                    "interval": "",
+                    "legendFormat": "{{State}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)",
+                    "interval": "",
+                    "legendFormat": "{{State}} (retry)",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Scheduler Task State",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 0
+            },
+            "hiddenSeries": false,
+            "id": 35,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)",
+                    "interval": "",
+                    "legendFormat": "{{Name}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)",
+                    "interval": "",
+                    "legendFormat": "{{Name}} (retry)",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Active Tasks by Name",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 1
+            },
+            "hiddenSeries": false,
+            "id": 33,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_actors{SessionName=~\"$SessionName\",}) by (State)",
+                    "interval": "",
+                    "legendFormat": "{{State}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Scheduler Actor State",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "actors",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Current number of (live) actors with a particular name.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 1
+            },
+            "hiddenSeries": false,
+            "id": 36,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=~\"$SessionName\",}) by (Name)",
+                    "interval": "",
+                    "legendFormat": "{{Name}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Active Actors by Name",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "actors",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Logical CPU usage of Ray. The dashed line indicates the total number of CPUs. Logical CPUs are allocated by the `num_cpus` arguments of tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU usage is different from physical CPU usage, because logical CPUs are allocated by `num_cpus` arguments rather than measured.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 2
+            },
+            "hiddenSeries": false,
+            "id": 27,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=~\"$SessionName\",}) by (instance)",
+                    "interval": "",
+                    "legendFormat": "CPU Usage: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_resources{Name=\"CPU\",SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)))",
+                    "interval": "",
+                    "legendFormat": "MAX + PENDING",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Scheduler CPUs (logical slots)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "cores",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Object store memory usage by location. The dashed line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped pages, SPILLED to indicate objects spilled to disk, and WORKER_HEAP to indicate objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 2
+            },
+            "hiddenSeries": false,
+            "id": 29,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) by (Location)",
+                    "interval": "",
+                    "legendFormat": "{{Location}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Object Store Memory",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Logical GPU usage of Ray. The dashed line indicates the total number of GPUs. Logical GPUs are allocated by the `num_gpus` arguments of tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 3
+            },
+            "hiddenSeries": false,
+            "id": 28,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "GPU Usage: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_resources{Name=\"GPU\",SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)))",
+                    "interval": "",
+                    "legendFormat": "MAX + PENDING",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Scheduler GPUs (logical slots)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "GPUs",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 3
+            },
+            "hiddenSeries": false,
+            "id": 40,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",}) by (State)",
+                    "interval": "",
+                    "legendFormat": "{{State}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Scheduler Placement Groups",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "placement groups",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
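+            "description": "Node's physical (hardware) CPU usage, in cores. The dashed line shows the total number of CPU cores in the cluster.",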
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 4
+            },
+            "hiddenSeries": false,
+            "id": 2,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100",
+                    "interval": "",
+                    "legendFormat": "CPU Usage: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node CPU (hardware utilization)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "cores",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Node's physical (hardware) GPU usage. The dashed line shows the total number of hardware GPUs in the cluster.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 4
+            },
+            "hiddenSeries": false,
+            "id": 8,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100",
+                    "interval": "",
+                    "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_gpus_available{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node GPU (hardware utilization)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "GPUs",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Node's physical (hardware) disk usage. The dashed line shows the total amount of disk space in the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage of the host machine.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 5
+            },
+            "hiddenSeries": false,
+            "id": 6,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "Disk Used: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node Disk",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Disk IO per node.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 5
+            },
+            "hiddenSeries": false,
+            "id": 32,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "Write: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "Read: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node Disk IO Speed",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "Bps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The physical (hardware) memory usage for each node. The dashed line shows the total amount of memory in the cluster. Node memory is the sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than that of the host machine because Ray may reserve some additional memory space outside the container.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 6
+            },
+            "hiddenSeries": false,
+            "id": 4,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "Memory Used: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node Memory (heap + object store)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and name. See https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html for details.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 6
+            },
+            "hiddenSeries": false,
+            "id": 44,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "OOM Killed: {{Name}}, {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node Out of Memory Failures by Name",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "failures",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS minus SHM (shared memory) per Ray component, which approximates the memory usage per process. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process names (which contain method names) of running tasks/actors.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 7
+            },
+            "hiddenSeries": false,
+            "id": 34,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "(sum(ray_component_rss_mb{SessionName=~\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=~\"$SessionName\",}) by (Component))",
+                    "interval": "",
+                    "legendFormat": "{{Component}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_mem_shared_bytes{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "shared_memory",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node Memory by Component",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process names (which contain method names) of running tasks/actors.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 7
+            },
+            "hiddenSeries": false,
+            "id": 37,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_component_cpu_percentage{SessionName=~\"$SessionName\",}) by (Component) / 100",
+                    "interval": "",
+                    "legendFormat": "{{Component}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node CPU by Component",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "cores",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The physical (hardware) GPU memory usage for each node. The dashed line shows the total amount of GPU memory in the cluster.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 8
+            },
+            "hiddenSeries": false,
+            "id": 18,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * 1024 * 1024",
+                    "interval": "",
+                    "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "(sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 1024 * 1024",
+                    "interval": "",
+                    "legendFormat": "MAX",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node GPU Memory (GRAM)",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "bytes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Network speed per node",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 8
+            },
+            "hiddenSeries": false,
+            "id": 20,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "Recv: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
+                    "interval": "",
+                    "legendFormat": "Send: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node Network",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "Bps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The total number of active, failed, and pending nodes in the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or fails to set up (e.g., setup_commands have errors). \n\nPENDING: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 0,
+                "y": 9
+            },
+            "hiddenSeries": false,
+            "id": 24,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",}) by (NodeType)",
+                    "interval": "",
+                    "legendFormat": "Active Nodes: {{NodeType}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",}) by (NodeType)",
+                    "interval": "",
+                    "legendFormat": "Failed Nodes: {{NodeType}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",}) by (NodeType)",
+                    "interval": "",
+                    "legendFormat": "Pending Nodes: {{NodeType}}",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node Count",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "nodes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, etc.) across the cluster.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "h": 8,
+                "w": 12,
+                "x": 12,
+                "y": 9
+            },
+            "hiddenSeries": false,
+            "id": 41,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "avg(ray_node_cpu_utilization{SessionName=~\"$SessionName\",})",
+                    "interval": "",
+                    "legendFormat": "CPU (physical)",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_gpus_utilization{SessionName=~\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=~\"$SessionName\",}) or vector(0))",
+                    "interval": "",
+                    "legendFormat": "GPU (physical)",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_mem_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=~\"$SessionName\",})) * 100",
+                    "interval": "",
+                    "legendFormat": "Memory (RAM)",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_gram_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 100",
+                    "interval": "",
+                    "legendFormat": "GRAM",
+                    "queryType": "randomWalk",
+                    "refId": "D"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",}) * 100",
+                    "interval": "",
+                    "legendFormat": "Object Store Memory",
+                    "queryType": "randomWalk",
+                    "refId": "E"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_disk_usage{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})) * 100",
+                    "interval": "",
+                    "legendFormat": "Disk",
+                    "queryType": "randomWalk",
+                    "refId": "F"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Cluster Utilization",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "%",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        }
+    ],
+    "refresh": false,
+    "schemaVersion": 27,
+    "style": "dark",
+    "tags": [
+        "rayVersion:2.24.0"
+    ],
+    "templating": {
+        "list": [
+            {
+                "current": {
+                    "selected": false
+                },
+                "description": "Filter queries of a specific Prometheus type.",
+                "hide": 2,
+                "includeAll": false,
+                "multi": false,
+                "name": "datasource",
+                "options": [],
+                "query": "prometheus",
+                "refresh": 1,
+                "regex": "",
+                "skipUrlSync": false,
+                "type": "datasource"
+            },
+            {
+                "allValue": ".+",
+                "current": {
+                    "selected": false
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_node_network_receive_speed{}, SessionName)",
+                "description": "Filter queries to specific ray sessions.",
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": false,
+                "name": "SessionName",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_node_network_receive_speed{}, SessionName)",
+                    "refId": "StandardVariableQuery"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 2,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            },
+            {
+                "allValue": ".+",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": true,
+                "name": "Instance",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            }
+        ]
+    },
+    "time": {
+        "from": "now-30m",
+        "to": "now"
+    },
+    "timepicker": {},
+    "timezone": "",
+    "title": "Default Dashboard",
+    "uid": "rayDefaultDashboard",
+    "version": 4,
+    "rayMeta": [
+        "supportsGlobalFilterOverride"
+    ]
+}
diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json
new file mode 100644
index 000000000..8648e308a
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json
@@ -0,0 +1,2115 @@
+{
+    "annotations": {
+        "list": [
+            {
+                "builtIn": 1,
+                "datasource": "-- Grafana --",
+                "enable": true,
+                "hide": true,
+                "iconColor": "rgba(0, 211, 255, 1)",
+                "name": "Annotations & Alerts",
+                "type": "dashboard"
+            }
+        ]
+    },
+    "editable": true,
+    "gnetId": null,
+    "graphTooltip": 0,
+    "iteration": 1667344411089,
+    "links": [],
+    "panels": [
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of replicas per deployment. Ignores \"Route\" variable.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 0,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 1,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)",
+                    "interval": "",
+                    "legendFormat": "{{application}}, {{deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Replicas per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "replicas",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "QPS for each replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 0,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 2,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_deployment_request_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "QPS per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "qps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Error QPS for each replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 0,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 3,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_deployment_error_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Error QPS per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "qps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P50 latency per replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 1,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 4,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P50 latency per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P90 latency per replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 1,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 5,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P90 latency per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P99 latency per replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 1,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 6,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P99 latency per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of requests queued per deployment. Ignores \"Replica\" and \"Route\" variables.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 2,
+                "w": 12,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 7,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)",
+                    "interval": "",
+                    "legendFormat": "{{application}}, {{deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Queue size per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "requests",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Current running requests for each replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 12,
+                "y": 2,
+                "w": 12,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 8,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Running requests per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "requests",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of multiplexed models for each replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 3,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 9,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Multiplexed models per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "models",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of times multiplexed models were loaded for each replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 3,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 10,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Multiplexed model loads per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "times",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of times multiplexed models were unloaded for each replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 3,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 11,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_multiplexed_models_unload_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Multiplexed model unloads per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "times",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P99 latency of multiplexed model load per replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 4,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 12,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P99 latency of multiplexed model loads per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P99 latency of multiplexed model unload per replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 4,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 13,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P99 latency of multiplexed model unloads per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The ids of multiplexed models for each replica.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 4,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 14,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}",
+                    "interval": "",
+                    "legendFormat": "{{replica}}:{{model_id}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Multiplexed model ids per replica",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "model",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The cache hit rate of multiplexed models for the deployment.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 5,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 15,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])))",
+                    "interval": "",
+                    "legendFormat": "{{replica}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Multiplexed model cache hit rate",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "%",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        }
+    ],
+    "refresh": false,
+    "schemaVersion": 27,
+    "style": "dark",
+    "tags": [
+        "rayVersion:2.24.0"
+    ],
+    "templating": {
+        "list": [
+            {
+                "current": {
+                    "selected": false
+                },
+                "description": "Filter queries to a specific Prometheus data source.",
+                "hide": 2,
+                "includeAll": false,
+                "multi": false,
+                "name": "datasource",
+                "options": [],
+                "query": "prometheus",
+                "refresh": 1,
+                "regex": "",
+                "skipUrlSync": false,
+                "type": "datasource"
+            },
+            {
+                "allValue": ".*",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": true,
+                "name": "Application",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_serve_deployment_replica_healthy{}, application)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            },
+            {
+                "allValue": ".*",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": true,
+                "name": "Deployment",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            },
+            {
+                "allValue": ".*",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": true,
+                "name": "Replica",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            },
+            {
+                "allValue": ".*",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": true,
+                "name": "Route",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            }
+        ]
+    },
+    "rayMeta": [
+        "excludesSystemRoutes",
+        "supportsGlobalFilterOverride"
+    ],
+    "time": {
+        "from": "now-30m",
+        "to": "now"
+    },
+    "timepicker": {},
+    "timezone": "",
+    "title": "Serve Deployment Dashboard",
+    "uid": "rayServeDeploymentDashboard",
+    "version": 1
+}
diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json
new file mode 100644
index 000000000..4d1ec6e8e
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json
@@ -0,0 +1,3098 @@
+{
+    "annotations": {
+        "list": [
+            {
+                "builtIn": 1,
+                "datasource": "-- Grafana --",
+                "enable": true,
+                "hide": true,
+                "iconColor": "rgba(0, 211, 255, 1)",
+                "name": "Annotations & Alerts",
+                "type": "dashboard"
+            }
+        ]
+    },
+    "editable": true,
+    "gnetId": null,
+    "graphTooltip": 0,
+    "iteration": 1667344411089,
+    "links": [],
+    "panels": [
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, etc.) across the cluster. Ignores the application variable.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 0,
+                "w": 12,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 5,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "avg(ray_node_cpu_utilization{})",
+                    "interval": "",
+                    "legendFormat": "CPU (physical)",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_gpus_utilization{}) / on() (sum(autoscaler_cluster_resources{resource='GPU',}) or vector(0))",
+                    "interval": "",
+                    "legendFormat": "GPU (physical)",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_mem_used{}) / on() (sum(ray_node_mem_total{})) * 100",
+                    "interval": "",
+                    "legendFormat": "Memory (RAM)",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_gram_used{}) / on() (sum(ray_node_gram_available{}) + sum(ray_node_gram_used{})) * 100",
+                    "interval": "",
+                    "legendFormat": "GRAM",
+                    "queryType": "randomWalk",
+                    "refId": "D"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_object_store_memory{}) / on() sum(ray_resources{Name=\"object_store_memory\",}) * 100",
+                    "interval": "",
+                    "legendFormat": "Object Store Memory",
+                    "queryType": "randomWalk",
+                    "refId": "E"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_disk_usage{}) / on() (sum(ray_node_disk_free{}) + sum(ray_node_disk_usage{})) * 100",
+                    "interval": "",
+                    "legendFormat": "Disk",
+                    "queryType": "randomWalk",
+                    "refId": "F"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Cluster Utilization",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "%",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "QPS for each selected application.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 12,
+                "y": 0,
+                "w": 12,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 7,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_num_http_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)",
+                    "interval": "",
+                    "legendFormat": "{{application, route}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_num_grpc_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)",
+                    "interval": "",
+                    "legendFormat": "{{application, method}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "QPS per application",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "qps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Error QPS for each selected application.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 1,
+                "w": 12,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 8,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)",
+                    "interval": "",
+                    "legendFormat": "{{application, route}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)",
+                    "interval": "",
+                    "legendFormat": "{{application, method}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Error QPS per application",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "qps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Error QPS for each selected application, broken down by error code.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 12,
+                "y": 1,
+                "w": 12,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 17,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, error_code)",
+                    "interval": "",
+                    "legendFormat": "{{application, route, error_code}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, error_code)",
+                    "interval": "",
+                    "legendFormat": "{{application, method, error_code}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Error QPS per application per error code",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "qps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P50 latency for selected applications.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 2,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 12,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, route}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, method}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P50 latency per application",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P90 latency for selected applications.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 2,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 15,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, route}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, method}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P90 latency per application",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P99 latency for selected applications.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 2,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 16,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, route}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, method}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P99 latency per application",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of replicas per deployment. Ignores \"Application\" variable.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 3,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 2,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_deployment_replica_healthy{}) by (application, deployment)",
+                    "interval": "",
+                    "legendFormat": "{{application, deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Replicas per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "replicas",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "QPS for each deployment.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 3,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 13,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_deployment_request_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)",
+                    "interval": "",
+                    "legendFormat": "{{application, deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "QPS per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "qps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Error QPS for each deployment.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 3,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 14,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(rate(ray_serve_deployment_error_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)",
+                    "interval": "",
+                    "legendFormat": "{{application, deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Error QPS per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "qps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P50 latency per deployment.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 4,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 9,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P50 latency per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P90 latency per deployment.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 4,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 10,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P90 latency per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "P99 latency per deployment.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 4,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 11,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))",
+                    "interval": "",
+                    "legendFormat": "{{application, deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))",
+                    "interval": "",
+                    "legendFormat": "Total",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "P99 latency per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "ms",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of requests queued per deployment. Ignores \"Application\" variable.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 0,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 5,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 3,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_serve_deployment_queued_queries{}) by (application, deployment)",
+                    "interval": "",
+                    "legendFormat": "{{application, deployment}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Queue size per deployment",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "requests",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Number of nodes in this cluster. Ignores \"Application\" variable.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 5,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 4,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(autoscaler_active_nodes{}) by (NodeType)",
+                    "interval": "",
+                    "legendFormat": "Active Nodes: {{NodeType}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(autoscaler_recently_failed_nodes{}) by (NodeType)",
+                    "interval": "",
+                    "legendFormat": "Failed Nodes: {{NodeType}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(autoscaler_pending_nodes{}) by (NodeType)",
+                    "interval": "",
+                    "legendFormat": "Pending Nodes: {{NodeType}}",
+                    "queryType": "randomWalk",
+                    "refId": "C"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node count",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "nodes",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "Network speed per node. Ignores \"Application\" variable.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 1,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 5,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 6,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 2,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": false,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_network_receive_speed{}) by (instance)",
+                    "interval": "",
+                    "legendFormat": "Recv: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                },
+                {
+                    "exemplar": true,
+                    "expr": "sum(ray_node_network_send_speed{}) by (instance)",
+                    "interval": "",
+                    "legendFormat": "Send: {{instance}}",
+                    "queryType": "randomWalk",
+                    "refId": "B"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Node network",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "Bps",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of ongoing requests in the HTTP Proxy.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 6,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 20,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_serve_num_ongoing_http_requests{}",
+                    "interval": "",
+                    "legendFormat": "Ongoing HTTP Requests",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Ongoing HTTP Requests",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "requests",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of ongoing requests in the gRPC Proxy.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 6,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 21,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_serve_num_ongoing_grpc_requests{}",
+                    "interval": "",
+                    "legendFormat": "Ongoing gRPC Requests",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Ongoing gRPC Requests",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "requests",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of request scheduling tasks in the router.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 6,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 22,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_serve_num_scheduling_tasks{}",
+                    "interval": "",
+                    "legendFormat": "Scheduling Tasks",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Scheduling Tasks",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of request scheduling tasks in the router that are undergoing backoff.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 0,
+                "y": 7,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 23,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_serve_num_scheduling_tasks_in_backoff{}",
+                    "interval": "",
+                    "legendFormat": "Scheduling Tasks in Backoff",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Scheduling Tasks in Backoff",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "tasks",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The duration of the last control loop.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 8,
+                "y": 7,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 24,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_serve_controller_control_loop_duration_s{}",
+                    "interval": "",
+                    "legendFormat": "Control Loop Duration",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Controller Control Loop Duration",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "seconds",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        },
+        {
+            "aliasColors": {},
+            "bars": false,
+            "dashLength": 10,
+            "dashes": false,
+            "datasource": "${datasource}",
+            "description": "The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.",
+            "fieldConfig": {
+                "defaults": {},
+                "overrides": []
+            },
+            "fill": 10,
+            "fillGradient": 0,
+            "gridPos": {
+                "x": 16,
+                "y": 7,
+                "w": 8,
+                "h": 8
+            },
+            "hiddenSeries": false,
+            "id": 25,
+            "legend": {
+                "alignAsTable": true,
+                "avg": false,
+                "current": true,
+                "hideEmpty": false,
+                "hideZero": true,
+                "max": false,
+                "min": false,
+                "rightSide": false,
+                "show": true,
+                "sort": "current",
+                "sortDesc": true,
+                "total": false,
+                "values": true
+            },
+            "lines": true,
+            "linewidth": 1,
+            "nullPointMode": "null",
+            "options": {
+                "alertThreshold": true
+            },
+            "percentage": false,
+            "pluginVersion": "7.5.17",
+            "pointradius": 2,
+            "points": false,
+            "renderer": "flot",
+            "seriesOverrides": [
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX",
+                    "dashes": true,
+                    "color": "#1F60C4",
+                    "fill": 0,
+                    "stack": false
+                },
+                {
+                    "$$hashKey": "object:78",
+                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
+                    "hiddenSeries": true
+                },
+                {
+                    "$$hashKey": "object:2987",
+                    "alias": "MAX + PENDING",
+                    "dashes": true,
+                    "color": "#777777",
+                    "fill": 0,
+                    "stack": false
+                }
+            ],
+            "spaceLength": 10,
+            "stack": true,
+            "steppedLine": false,
+            "targets": [
+                {
+                    "exemplar": true,
+                    "expr": "ray_serve_controller_num_control_loops{}",
+                    "interval": "",
+                    "legendFormat": "Control Loops",
+                    "queryType": "randomWalk",
+                    "refId": "A"
+                }
+            ],
+            "thresholds": [],
+            "timeFrom": null,
+            "timeRegions": [],
+            "timeShift": null,
+            "title": "Number of Control Loops",
+            "tooltip": {
+                "shared": true,
+                "sort": 0,
+                "value_type": "individual"
+            },
+            "type": "graph",
+            "xaxis": {
+                "buckets": null,
+                "mode": "time",
+                "name": null,
+                "show": true,
+                "values": []
+            },
+            "yaxes": [
+                {
+                    "$$hashKey": "object:628",
+                    "format": "loops",
+                    "label": "",
+                    "logBase": 1,
+                    "max": null,
+                    "min": "0",
+                    "show": true
+                },
+                {
+                    "$$hashKey": "object:629",
+                    "format": "short",
+                    "label": null,
+                    "logBase": 1,
+                    "max": null,
+                    "min": null,
+                    "show": true
+                }
+            ],
+            "yaxis": {
+                "align": false,
+                "alignLevel": null
+            }
+        }
+    ],
+    "refresh": false,
+    "schemaVersion": 27,
+    "style": "dark",
+    "tags": [
+        "rayVersion:2.24.0"
+    ],
+    "templating": {
+        "list": [
+            {
+                "current": {
+                    "selected": false
+                },
+                "description": "Filter queries of a specific Prometheus type.",
+                "hide": 2,
+                "includeAll": false,
+                "multi": false,
+                "name": "datasource",
+                "options": [],
+                "query": "prometheus",
+                "refresh": 1,
+                "regex": "",
+                "skipUrlSync": false,
+                "type": "datasource"
+            },
+            {
+                "allValue": ".*",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": null,
+                "multi": true,
+                "name": "Application",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_serve_deployment_replica_healthy{}, application)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            },
+            {
+                "allValue": ".*",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_serve_num_http_requests_total{}, route)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": "HTTP Route",
+                "multi": true,
+                "name": "HTTP_Route",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_serve_num_http_requests_total{}, route)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            },
+            {
+                "allValue": ".*",
+                "current": {
+                    "selected": true,
+                    "text": [
+                        "All"
+                    ],
+                    "value": [
+                        "$__all"
+                    ]
+                },
+                "datasource": "${datasource}",
+                "definition": "label_values(ray_serve_num_grpc_requests{}, method)",
+                "description": null,
+                "error": null,
+                "hide": 0,
+                "includeAll": true,
+                "label": "gRPC Service Method",
+                "multi": true,
+                "name": "gRPC_Method",
+                "options": [],
+                "query": {
+                    "query": "label_values(ray_serve_num_grpc_requests{}, method)",
+                    "refId": "Prometheus-Instance-Variable-Query"
+                },
+                "refresh": 2,
+                "regex": "",
+                "skipUrlSync": false,
+                "sort": 0,
+                "tagValuesQuery": "",
+                "tags": [],
+                "tagsQuery": "",
+                "type": "query",
+                "useTags": false
+            }
+        ]
+    },
+    "rayMeta": [
+        "excludesSystemRoutes",
+        "supportsGlobalFilterOverride"
+    ],
+    "time": {
+        "from": "now-30m",
+        "to": "now"
+    },
+    "timepicker": {},
+    "timezone": "",
+    "title": "Serve Dashboard",
+    "uid": "rayServeDashboard",
+    "version": 1
+}
diff --git a/ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml
new file mode 100644
index 000000000..dbda70c40
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml
@@ -0,0 +1,25 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: ray-head-monitor
+  namespace: kube-prometheus-stack
+  labels:
+    # `release: $HELM_RELEASE`: Prometheus can only detect ServiceMonitor with this label.
+    release: kube-prometheus-stack
+spec:
+  jobLabel: ray-head
+  # Only select Kubernetes Services in the "rayserve-vllm" namespace.
+  namespaceSelector:
+    matchNames:
+      - rayserve-vllm
+  # Only select Kubernetes Services with "matchLabels".
+  selector:
+    matchLabels:
+      ray.io/node-type: head
+  # A list of endpoints allowed as part of this ServiceMonitor.
+  endpoints:
+    - port: metrics
+    - port: as-metrics  # autoscaler metrics
+    - port: dash-metrics  # dashboard metrics
+  targetLabels:
+    - ray.io/cluster
diff --git a/ai-ml/infrastructure/terraform/outputs.tf b/ai-ml/infrastructure/terraform/outputs.tf
new file mode 100644
index 000000000..5771ae141
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/outputs.tf
@@ -0,0 +1,9 @@
+output "configure_kubectl" {
+  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
+  value       = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}"
+}
+
+output "grafana_secret_name" {
+  description = "The name of the secret containing the Grafana admin password."
+  value       = aws_secretsmanager_secret.grafana.name
+}
diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf
new file mode 100644
index 000000000..465f9c152
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/variables.tf
@@ -0,0 +1,107 @@
+variable "name" {
+  description = "Name of the VPC and EKS Cluster"
+  default     = "ai-stack"
+  type        = string
+}
+
+variable "region" {
+  description = "region"
+  default     = "us-east-1"
+  type        = string
+}
+
+variable "eks_cluster_version" {
+  description = "EKS Cluster version"
+  default     = "1.30"
+  type        = string
+}
+
+# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs
+variable "vpc_cidr" {
+  description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range"
+  default     = "10.1.0.0/21"
+  type        = string
+}
+
+# RFC6598 range 100.64.0.0/10
+# Note: you can only attach a /16 range to the VPC. You can add multiple /16 ranges if required
+variable "secondary_cidr_blocks" {
+  description = "Secondary CIDR blocks to be attached to VPC"
+  default     = ["100.64.0.0/16"]
+  type        = list(string)
+}
+
+# Infrastructure Variables
+variable "enable_aws_cloudwatch_metrics" {
+  description = "Enable AWS Cloudwatch Metrics addon"
+  type        = bool
+  default     = true
+}
+variable "bottlerocket_data_disk_snapshot_id" {
+  description = "Bottlerocket Data Disk Snapshot ID"
+  type        = string
+  default     = ""
+}
+variable "enable_aws_efa_k8s_device_plugin" {
+  description = "Enable AWS EFA K8s Device Plugin"
+  type        = bool
+  default     = false
+}
+variable "enable_aws_fsx_csi_driver" {
+  description = "Whether or not to deploy the FSx CSI driver"
+  type        = bool
+  default     = false
+}
+variable "deploy_fsx_volume" {
+  description = "Whether or not to deploy the example FSx volume"
+  type        = bool
+  default     = false
+}
+
+# Addon Variables
+variable "enable_kube_prometheus_stack" {
+  description = "Enable Kube Prometheus addon"
+  type        = bool
+  default     = false
+}
+variable "enable_kubecost" {
+  description = "Enable Kubecost addon"
+  type        = bool
+  default     = false
+}
+variable "enable_argo_workflows" {
+  description = "Enable Argo Workflows addon"
+  type        = bool
+  default     = false
+}
+variable "enable_argo_events" {
+  description = "Enable Argo Events addon"
+  type        = bool
+  default     = false
+}
+variable "enable_mlflow_tracking" {
+  description = "Enable MLFlow Tracking"
+  type        = bool
+  default     = false
+}
+variable "enable_jupyterhub" {
+  description = "Enable JupyterHub"
+  type        = bool
+  default     = false
+}
+variable "enable_volcano" {
+  description = "Enable Volcano"
+  type        = bool
+  default     = false
+}
+variable "enable_kuberay_operator" {
+  description = "Enable KubeRay Operator"
+  type        = bool
+  default     = true
+}
+variable "huggingface_token" {
+  description = "Hugging Face Secret Token"
+  type        = string
+  default     = "DUMMY_TOKEN_REPLACE_ME"
+  sensitive   = true
+}
diff --git a/ai-ml/infrastructure/terraform/versions.tf b/ai-ml/infrastructure/terraform/versions.tf
new file mode 100644
index 000000000..e24e99c1f
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/versions.tf
@@ -0,0 +1,33 @@
+terraform {
+  required_version = ">= 1.0.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 3.72"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = ">= 2.10"
+    }
+    helm = {
+      source  = "hashicorp/helm"
+      version = ">= 2.4.1"
+    }
+    kubectl = {
+      source  = "gavinbunney/kubectl"
+      version = ">= 1.14"
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = ">= 3.6.0" # Replace with the appropriate version of the random provider
+    }
+  }
+
+  # ##  Used for end-to-end testing on project; update to suit your needs
+  # backend "s3" {
+  #   bucket = "doeks-github-actions-e2e-test-state"
+  #   region = "us-west-2"
+  #   key    = "e2e/jark/terraform.tfstate"
+  # }
+}
diff --git a/ai-ml/infrastructure/terraform/vpc.tf b/ai-ml/infrastructure/terraform/vpc.tf
new file mode 100644
index 000000000..b12f63d59
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/vpc.tf
@@ -0,0 +1,62 @@
+locals {
+  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
+  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
+  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
+  # Routable Public subnets with NAT Gateway and Internet Gateway
+  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
+  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
+
+  database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)]
+
+  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets (32768 IPs per subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
+  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
+  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
+}
+
+#---------------------------------------------------------------
+# VPC
+#---------------------------------------------------------------
+# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts.
+# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements
+
+module "vpc" {
+  source  = "terraform-aws-modules/vpc/aws"
+  version = "~> 5.0"
+
+  name = local.name
+  cidr = var.vpc_cidr
+  azs  = local.azs
+
+  # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
+  secondary_cidr_blocks = var.secondary_cidr_blocks
+
+  # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods
+  # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + EC2 Jumphost etc.
+  private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
+
+  # ------------------------------
+  # Private Subnets for MLflow backend store
+  database_subnets                   = local.database_private_subnets
+  create_database_subnet_group       = true
+  create_database_subnet_route_table = true
+
+  # ------------------------------
+  # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
+  # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
+  public_subnets     = local.public_subnets
+  enable_nat_gateway = true
+  single_nat_gateway = true
+  #-------------------------------
+
+  public_subnet_tags = {
+    "kubernetes.io/role/elb" = 1
+  }
+
+  private_subnet_tags = {
+    "kubernetes.io/role/internal-elb" = 1
+    # Tags subnets for Karpenter auto-discovery
+    "karpenter.sh/discovery" = local.name
+  }
+
+  tags = local.tags
+}

From a148ef574f1271dea1172a8748f3ae0e9380e1c5 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Tue, 28 Jan 2025 14:42:17 -0800
Subject: [PATCH 02/16] split dcgm and enable volcano to fix kuberay startup

---
 ai-ml/bionemo/addons.tf                       |   93 -
 ai-ml/bionemo/cleanup.sh                      |   45 -
 ai-ml/bionemo/eks.tf                          |  145 -
 ai-ml/bionemo/fsx-for-lustre.tf               |  136 -
 .../fsx-for-lustre/fsxlustre-static-pv.yaml   |   21 -
 .../fsx-for-lustre/fsxlustre-static-pvc.yaml  |   12 -
 .../fsxlustre-storage-class.yaml              |    9 -
 ai-ml/bionemo/main.tf                         |   53 -
 ai-ml/bionemo/outputs.tf                      |    9 -
 ai-ml/bionemo/versions.tf                     |   29 -
 ai-ml/bionemo/vpc.tf                          |   57 -
 ai-ml/infrastructure/terraform/addons.tf      |    4 +
 ai-ml/infrastructure/terraform/variables.tf   |    2 +-
 ai-ml/jark-stack/terraform/addons.tf          |  460 --
 ai-ml/jark-stack/terraform/eks.tf             |  212 -
 .../helm-values/argo-events-values.yaml       |    4 -
 .../helm-values/argo-workflows-values.yaml    |    5 -
 .../aws-cloudwatch-metrics-values.yaml        |   11 -
 .../aws-efa-k8s-device-plugin-values.yaml     |    5 -
 .../helm-values/ingress-nginx-values.yaml     |   11 -
 .../helm-values/jupyterhub-values.yaml        |   59 -
 .../helm-values/kube-prometheus.yaml          |   48 -
 .../helm-values/kubecost-values.yaml          |   69 -
 ai-ml/jark-stack/terraform/karpenter.tf       |    0
 ai-ml/jark-stack/terraform/main.tf            |   51 -
 .../terraform/monitoring/podMonitor.yaml      |   21 -
 .../data_grafana_dashboard.json               | 4535 -----------------
 .../default_grafana_dashboard.json            | 2836 -----------
 .../serve_deployment_grafana_dashboard.json   | 2115 --------
 .../serve_grafana_dashboard.json              | 3098 -----------
 .../terraform/monitoring/serviceMonitor.yaml  |   25 -
 ai-ml/jark-stack/terraform/outputs.tf         |    9 -
 ai-ml/jark-stack/terraform/versions.tf        |   33 -
 ai-ml/jark-stack/terraform/vpc.tf             |   53 -
 ai-ml/mlflow/addons.tf                        |  431 --
 ai-ml/mlflow/amp.tf                           |  136 -
 ai-ml/mlflow/eks.tf                           |  118 -
 .../helm-values/aws-for-fluentbit-values.yaml |  102 -
 .../cluster-autoscaler-values.yaml            |   25 -
 .../coredns-autoscaler-values.yaml            |   40 -
 .../helm-values/ingress-nginx-values.yaml     |   11 -
 .../kube-prometheus-amp-enable.yaml           |   65 -
 ai-ml/mlflow/helm-values/kube-prometheus.yaml |   36 -
 .../helm-values/metrics-server-values.yaml    |   52 -
 .../helm-values/mlflow-tracking-values.yaml   |   88 -
 ai-ml/mlflow/helm-values/nvidia-values.yaml   |   97 -
 .../00-karpenter-provisioner-cpu.yaml         |   57 -
 ai-ml/mlflow/main.tf                          |   65 -
 ai-ml/mlflow/mlflow-core.tf                   |  245 -
 ai-ml/mlflow/outputs.tf                       |   24 -
 ai-ml/mlflow/versions.tf                      |   33 -
 ai-ml/mlflow/vpc.tf                           |   59 -
 ai-ml/trainium-inferentia/addons.tf           |  536 --
 ai-ml/trainium-inferentia/eks.tf              |  410 --
 .../elastic-cache-redis.tf                    |   57 -
 ai-ml/trainium-inferentia/fsx-for-lustre.tf   |  118 -
 .../aws-cloudwatch-metrics-values.yaml        |   11 -
 .../helm-values/aws-for-fluentbit-values.yaml |  102 -
 .../cluster-autoscaler-values.yaml            |   15 -
 .../helm-values/ingress-nginx-values.yaml     |   11 -
 .../helm-values/jupyterhub-values.yaml        |  139 -
 .../helm-values/kube-prometheus.yaml          |   23 -
 .../helm-values/metrics-server-values.yaml    |   52 -
 ai-ml/trainium-inferentia/jupyterhub.tf       |  181 -
 ai-ml/trainium-inferentia/main.tf             |   75 -
 ai-ml/trainium-inferentia/outputs.tf          |    9 -
 ai-ml/trainium-inferentia/versions.tf         |   37 -
 ai-ml/trainium-inferentia/vpc.tf              |   53 -
 68 files changed, 5 insertions(+), 17783 deletions(-)
 delete mode 100644 ai-ml/bionemo/addons.tf
 delete mode 100755 ai-ml/bionemo/cleanup.sh
 delete mode 100644 ai-ml/bionemo/eks.tf
 delete mode 100644 ai-ml/bionemo/fsx-for-lustre.tf
 delete mode 100644 ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml
 delete mode 100644 ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml
 delete mode 100644 ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml
 delete mode 100644 ai-ml/bionemo/main.tf
 delete mode 100644 ai-ml/bionemo/outputs.tf
 delete mode 100644 ai-ml/bionemo/versions.tf
 delete mode 100644 ai-ml/bionemo/vpc.tf
 delete mode 100644 ai-ml/jark-stack/terraform/addons.tf
 delete mode 100644 ai-ml/jark-stack/terraform/eks.tf
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/karpenter.tf
 delete mode 100644 ai-ml/jark-stack/terraform/main.tf
 delete mode 100644 ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json
 delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json
 delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json
 delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json
 delete mode 100644 ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml
 delete mode 100644 ai-ml/jark-stack/terraform/outputs.tf
 delete mode 100644 ai-ml/jark-stack/terraform/versions.tf
 delete mode 100644 ai-ml/jark-stack/terraform/vpc.tf
 delete mode 100644 ai-ml/mlflow/addons.tf
 delete mode 100644 ai-ml/mlflow/amp.tf
 delete mode 100644 ai-ml/mlflow/eks.tf
 delete mode 100644 ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/ingress-nginx-values.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/kube-prometheus.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/metrics-server-values.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml
 delete mode 100644 ai-ml/mlflow/helm-values/nvidia-values.yaml
 delete mode 100644 ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml
 delete mode 100644 ai-ml/mlflow/main.tf
 delete mode 100644 ai-ml/mlflow/mlflow-core.tf
 delete mode 100644 ai-ml/mlflow/outputs.tf
 delete mode 100644 ai-ml/mlflow/versions.tf
 delete mode 100644 ai-ml/mlflow/vpc.tf
 delete mode 100644 ai-ml/trainium-inferentia/addons.tf
 delete mode 100644 ai-ml/trainium-inferentia/eks.tf
 delete mode 100644 ai-ml/trainium-inferentia/elastic-cache-redis.tf
 delete mode 100644 ai-ml/trainium-inferentia/fsx-for-lustre.tf
 delete mode 100644 ai-ml/trainium-inferentia/helm-values/aws-cloudwatch-metrics-values.yaml
 delete mode 100644 ai-ml/trainium-inferentia/helm-values/aws-for-fluentbit-values.yaml
 delete mode 100644 ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml
 delete mode 100644 ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml
 delete mode 100644 ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml
 delete mode 100644 ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml
 delete mode 100644 ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml
 delete mode 100644 ai-ml/trainium-inferentia/jupyterhub.tf
 delete mode 100755 ai-ml/trainium-inferentia/main.tf
 delete mode 100755 ai-ml/trainium-inferentia/outputs.tf
 delete mode 100755 ai-ml/trainium-inferentia/versions.tf
 delete mode 100755 ai-ml/trainium-inferentia/vpc.tf

diff --git a/ai-ml/bionemo/addons.tf b/ai-ml/bionemo/addons.tf
deleted file mode 100644
index 6b47a5ffe..000000000
--- a/ai-ml/bionemo/addons.tf
+++ /dev/null
@@ -1,93 +0,0 @@
-#---------------------------------------------------------------
-# EKS Blueprints Kubernetes Addons
-#---------------------------------------------------------------
-module "eks_blueprints_addons" {
-  source  = "aws-ia/eks-blueprints-addons/aws"
-  version = "~> 1.3"
-
-  cluster_name      = module.eks.cluster_name
-  cluster_endpoint  = module.eks.cluster_endpoint
-  cluster_version   = module.eks.cluster_version
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------
-  # Amazon EKS Managed Add-ons
-  #---------------------------------------
-  eks_addons = {
-    coredns = {
-      preserve = true
-    }
-    vpc-cni = {
-      preserve = true
-    }
-    kube-proxy = {
-      preserve = true
-    }
-    amazon-cloudwatch-observability = {
-      preserve                 = true
-      service_account_role_arn = aws_iam_role.cloudwatch_observability_role.arn
-    }
-  }
-
-  #---------------------------------------
-  # ALB Controller
-  #---------------------------------------
-  enable_aws_load_balancer_controller = true
-
-  #---------------------------------------
-  # Kubernetes Metrics Server
-  #---------------------------------------
-  enable_metrics_server = true
-
-
-  #---------------------------------------
-  # Enable FSx for Lustre CSI Driver
-  #---------------------------------------
-  enable_aws_fsx_csi_driver = true
-
-  tags = local.tags
-
-}
-
-#---------------------------------------------------------------
-# Data on EKS Kubernetes Addons
-#---------------------------------------------------------------
-module "eks_data_addons" {
-  source  = "aws-ia/eks-data-addons/aws"
-  version = "~> 1.30" # ensure to update this to the latest/desired version
-
-  oidc_provider_arn           = module.eks.oidc_provider_arn
-  enable_nvidia_device_plugin = true
-
-}
-
-#---------------------------------------------------------------
-# EKS Amazon CloudWatch Observability Role
-#---------------------------------------------------------------
-resource "aws_iam_role" "cloudwatch_observability_role" {
-  name = "eks-cloudwatch-agent-role"
-
-  assume_role_policy = jsonencode({
-    Version = "2012-10-17"
-    Statement = [
-      {
-        Action = "sts:AssumeRoleWithWebIdentity"
-        Effect = "Allow"
-        Principal = {
-          Federated = module.eks.oidc_provider_arn
-        }
-        Condition = {
-          StringEquals = {
-            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" : "system:serviceaccount:amazon-cloudwatch:cloudwatch-agent",
-            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" : "sts.amazonaws.com"
-          }
-        }
-      }
-    ]
-  })
-}
-
-resource "aws_iam_role_policy_attachment" "cloudwatch_observability_policy_attachment" {
-  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
-  role       = aws_iam_role.cloudwatch_observability_role.name
-}
diff --git a/ai-ml/bionemo/cleanup.sh b/ai-ml/bionemo/cleanup.sh
deleted file mode 100755
index da1fb7c16..000000000
--- a/ai-ml/bionemo/cleanup.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-set -o errexit
-set -o pipefail
-
-targets=(
-  "module.eks"
-  "module.vpc"
-)
-
-#-------------------------------------------
-# Helpful to delete the stuck in "Terminating" namespaces
-# Rerun the cleanup.sh script to detect and delete the stuck resources
-#-------------------------------------------
-terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name')
-
-# If there are no terminating namespaces, exit the script
-if [[ -z $terminating_namespaces ]]; then
-    echo "No terminating namespaces found"
-fi
-
-for ns in $terminating_namespaces; do
-    echo "Terminating namespace: $ns"
-    kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f -
-done
-
-for target in "${targets[@]}"
-do
-  terraform destroy -target="$target" -auto-approve
-  destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1)
-  if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-    echo "SUCCESS: Terraform destroy of $target completed successfully"
-  else
-    echo "FAILED: Terraform destroy of $target failed"
-    exit 1
-  fi
-done
-
-terraform destroy -auto-approve
-destroy_output=$(terraform destroy -auto-approve 2>&1)
-if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-  echo "SUCCESS: Terraform destroy of all targets completed successfully"
-else
-  echo "FAILED: Terraform destroy of all targets failed"
-  exit 1
-fi
diff --git a/ai-ml/bionemo/eks.tf b/ai-ml/bionemo/eks.tf
deleted file mode 100644
index e45e5a816..000000000
--- a/ai-ml/bionemo/eks.tf
+++ /dev/null
@@ -1,145 +0,0 @@
-#---------------------------------------------------------------
-# EKS Cluster
-#---------------------------------------------------------------
-module "eks" {
-  source  = "terraform-aws-modules/eks/aws"
-  version = "~> 19.15"
-
-  cluster_name                   = local.name
-  cluster_version                = var.eks_cluster_version
-  cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint.
-  vpc_id                         = module.vpc.vpc_id
-  subnet_ids                     = module.vpc.private_subnets
-  manage_aws_auth_configmap      = true
-
-  #---------------------------------------
-  # Note: This can further restricted to specific required for each Add-on and your application
-  #---------------------------------------
-  # Extend cluster security group rules
-  cluster_security_group_additional_rules = {
-    ingress_nodes_ephemeral_ports_tcp = {
-      description                = "Nodes on ephemeral ports"
-      protocol                   = "tcp"
-      from_port                  = 1025
-      to_port                    = 65535
-      type                       = "ingress"
-      source_node_security_group = true
-    }
-  }
-
-  # Extend node-to-node security group rules
-  node_security_group_additional_rules = {
-    ingress_self_all = {
-      description = "Node to node all ports/protocols"
-      protocol    = "-1"
-      from_port   = 0
-      to_port     = 0
-      type        = "ingress"
-      self        = true
-    }
-    # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
-    # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
-    # Change this according to your security requirements if needed
-    ingress_cluster_to_node_all_traffic = {
-      description                   = "Cluster API to Nodegroup all traffic"
-      protocol                      = "-1"
-      from_port                     = 0
-      to_port                       = 0
-      type                          = "ingress"
-      source_cluster_security_group = true
-    }
-  }
-
-  eks_managed_node_group_defaults = {
-    iam_role_additional_policies = {
-      # Not required, but used in the example to access the nodes to inspect mounted volumes
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-
-  eks_managed_node_groups = {
-    #  We recommend to have a MNG to place your critical workloads and add-ons
-    #  Then rely on Karpenter to scale your workloads
-    #  You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners
-
-    core_node_group = {
-      name        = "core-node-group"
-      description = "EKS Core node group for hosting critical add-ons"
-      # Filtering only Secondary CIDR private subnets starting with "100.".
-      # Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-      )
-
-      min_size     = 3
-      max_size     = 9
-      desired_size = 3
-
-      instance_types = ["m5.xlarge"]
-
-      ebs_optimized = true
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 100
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      labels = {
-        WorkerType    = "ON_DEMAND"
-        NodeGroupType = "core"
-      }
-
-      tags = merge(local.tags, {
-        Name                     = "core-node-grp",
-        "karpenter.sh/discovery" = local.name
-      })
-    }
-
-    gpu1 = {
-      name        = "gpu-node-grp"
-      description = "EKS Node Group to run GPU workloads"
-      # Filtering only Secondary CIDR private subnets starting with "100.".
-      # Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-      )
-
-      ami_type     = "AL2_x86_64_GPU"
-      min_size     = 2
-      max_size     = 3
-      desired_size = 2
-
-      instance_types = ["g5.12xlarge"]
-      ebs_optimized  = true
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 200
-            volume_type = "gp3"
-          }
-        }
-      }
-      taints = {
-        gpu = {
-          key      = "nvidia.com/gpu"
-          effect   = "NO_SCHEDULE"
-          operator = "EXISTS"
-        }
-      }
-      labels = {
-        WorkerType = "ON_DEMAND"
-        eks-node   = "gpu"
-      }
-
-      tags = merge(local.tags, {
-        Name                     = "gpu-node-grp",
-        "karpenter.sh/discovery" = local.name
-      })
-    }
-  }
-}
diff --git a/ai-ml/bionemo/fsx-for-lustre.tf b/ai-ml/bionemo/fsx-for-lustre.tf
deleted file mode 100644
index 2175461f0..000000000
--- a/ai-ml/bionemo/fsx-for-lustre.tf
+++ /dev/null
@@ -1,136 +0,0 @@
-#---------------------------------------------------------------
-# FSx for Lustre File system Static provisioning
-#    1> Create Fsx for Lustre filesystem (Lustre FS storage capacity must be 1200, 2400, or a multiple of 3600)
-#    2> Create Storage Class for Filesystem (Cluster scoped)
-#    3> Persistent Volume with  Hardcoded reference to Fsx for Lustre filesystem with filesystem_id and dns_name (Cluster scoped)
-#    4> Persistent Volume claim for this persistent volume will always use the same file system (Namespace scoped)
-#---------------------------------------------------------------
-# NOTE: FSx for Lustre file system creation can take up to 10 mins
-resource "aws_fsx_lustre_file_system" "this" {
-  deployment_type             = "PERSISTENT_2"
-  storage_type                = "SSD"
-  per_unit_storage_throughput = "500" # 125, 250, 500, 1000
-  storage_capacity            = 2400
-
-  subnet_ids         = [module.vpc.private_subnets[0]]
-  security_group_ids = [aws_security_group.fsx.id]
-  log_configuration {
-    level = "WARN_ERROR"
-  }
-  tags = merge({ "Name" : "${local.name}-static" }, local.tags)
-}
-
-# This process can take upto 7 mins
-resource "aws_fsx_data_repository_association" "this" {
-
-  file_system_id       = aws_fsx_lustre_file_system.this.id
-  data_repository_path = "s3://${module.fsx_s3_bucket.s3_bucket_id}"
-  file_system_path     = "/data" # This directory will be used in Spark podTemplates under volumeMounts as subPath
-
-  s3 {
-    auto_export_policy {
-      events = ["NEW", "CHANGED", "DELETED"]
-    }
-
-    auto_import_policy {
-      events = ["NEW", "CHANGED", "DELETED"]
-    }
-  }
-}
-
-#---------------------------------------------------------------
-# Sec group for FSx for Lustre
-#---------------------------------------------------------------
-resource "aws_security_group" "fsx" {
-
-  name        = "${local.name}-fsx"
-  description = "Allow inbound traffic from private subnets of the VPC to FSx filesystem"
-  vpc_id      = module.vpc.vpc_id
-
-  ingress {
-    description = "Allows Lustre traffic between Lustre clients"
-    cidr_blocks = module.vpc.private_subnets_cidr_blocks
-    from_port   = 1021
-    to_port     = 1023
-    protocol    = "tcp"
-  }
-  ingress {
-    description = "Allows Lustre traffic between Lustre clients"
-    cidr_blocks = module.vpc.private_subnets_cidr_blocks
-    from_port   = 988
-    to_port     = 988
-    protocol    = "tcp"
-  }
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# S3 bucket for DataSync between FSx for Lustre and S3 Bucket
-#---------------------------------------------------------------
-#tfsec:ignore:aws-s3-enable-bucket-logging tfsec:ignore:aws-s3-enable-versioning
-module "fsx_s3_bucket" {
-  source  = "terraform-aws-modules/s3-bucket/aws"
-  version = "~> 3.0"
-
-  create_bucket = true
-
-  bucket_prefix = "${local.name}-fsx-"
-  # For example only - please evaluate for your environment
-  force_destroy = true
-
-  server_side_encryption_configuration = {
-    rule = {
-      apply_server_side_encryption_by_default = {
-        sse_algorithm = "AES256"
-      }
-    }
-  }
-}
-
-#---------------------------------------------------------------
-# Storage Class - FSx for Lustre
-#---------------------------------------------------------------
-resource "kubectl_manifest" "storage_class" {
-
-  yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-storage-class.yaml", {
-    subnet_id         = module.vpc.private_subnets[0],
-    security_group_id = aws_security_group.fsx.id
-  })
-
-  depends_on = [
-    module.eks_blueprints_addons
-  ]
-}
-
-#---------------------------------------------------------------
-# FSx for Lustre Persistent Volume - Static provisioning
-#---------------------------------------------------------------
-resource "kubectl_manifest" "static_pv" {
-
-  yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pv.yaml", {
-    filesystem_id = aws_fsx_lustre_file_system.this.id,
-    dns_name      = aws_fsx_lustre_file_system.this.dns_name
-    mount_name    = aws_fsx_lustre_file_system.this.mount_name,
-  })
-
-  depends_on = [
-    module.eks_blueprints_addons,
-    kubectl_manifest.storage_class,
-    aws_fsx_lustre_file_system.this
-  ]
-}
-
-#---------------------------------------------------------------
-# FSx for Lustre Persistent Volume Claim
-#---------------------------------------------------------------
-resource "kubectl_manifest" "static_pvc" {
-
-  yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pvc.yaml", {})
-
-  depends_on = [
-    module.eks_blueprints_addons,
-    kubectl_manifest.storage_class,
-    kubectl_manifest.static_pv,
-    aws_fsx_lustre_file_system.this
-  ]
-}
diff --git a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml b/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml
deleted file mode 100644
index 857bdcf3a..000000000
--- a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
----
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: fsx-static-pv
-spec:
-  capacity:
-    storage: 1000Gi
-  volumeMode: Filesystem
-  storageClassName: fsx
-  accessModes:
-    - ReadWriteMany
-  mountOptions:
-    - flock
-  persistentVolumeReclaimPolicy: Recycle
-  csi:
-    driver: fsx.csi.aws.com
-    volumeHandle: ${filesystem_id}
-    volumeAttributes:
-      dnsname: ${dns_name}
-      mountname: ${mount_name}
diff --git a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml b/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml
deleted file mode 100644
index dddebd66c..000000000
--- a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: fsx-static-pvc
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: fsx
-  resources:
-    requests:
-      storage: 1000Gi
-  volumeName: fsx-static-pv
diff --git a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml b/ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml
deleted file mode 100644
index 125fb2478..000000000
--- a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
----
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  name: fsx
-provisioner: fsx.csi.aws.com
-parameters:
-  subnetId: ${subnet_id}
-  securityGroupIds: ${security_group_id}
diff --git a/ai-ml/bionemo/main.tf b/ai-ml/bionemo/main.tf
deleted file mode 100644
index dd7d220a0..000000000
--- a/ai-ml/bionemo/main.tf
+++ /dev/null
@@ -1,53 +0,0 @@
-provider "aws" {
-  region = local.region
-}
-
-provider "kubernetes" {
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-# ECR always authenticates with `us-east-1` region
-# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
-provider "aws" {
-  alias  = "ecr"
-  region = "us-east-1"
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = module.eks.cluster_endpoint
-    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-    token                  = data.aws_eks_cluster_auth.this.token
-  }
-}
-
-provider "kubectl" {
-  apply_retry_count      = 10
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  load_config_file       = false
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-data "aws_availability_zones" "available" {}
-
-data "aws_eks_cluster_auth" "this" {
-  name = module.eks.cluster_name
-}
-
-#---------------------------------------------------------------
-# Local variables
-#---------------------------------------------------------------
-locals {
-  name     = var.name
-  region   = var.region
-  vpc_cidr = var.vpc_cidr
-  azs      = slice(data.aws_availability_zones.available.names, 0, 2)
-
-  tags = {
-    Blueprint  = local.name
-    GithubRepo = "github.com/awslabs/data-on-eks"
-  }
-}
diff --git a/ai-ml/bionemo/outputs.tf b/ai-ml/bionemo/outputs.tf
deleted file mode 100644
index 0f7edf2c1..000000000
--- a/ai-ml/bionemo/outputs.tf
+++ /dev/null
@@ -1,9 +0,0 @@
-output "configure_kubectl" {
-  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
-  value       = "aws eks --region ${local.region} update-kubeconfig --alias ${module.eks.cluster_name} --name ${module.eks.cluster_name}"
-}
-
-output "eks_api_server_url" {
-  description = "Your eks API server endpoint"
-  value       = module.eks.cluster_endpoint
-}
diff --git a/ai-ml/bionemo/versions.tf b/ai-ml/bionemo/versions.tf
deleted file mode 100644
index a62c02b66..000000000
--- a/ai-ml/bionemo/versions.tf
+++ /dev/null
@@ -1,29 +0,0 @@
-terraform {
-  required_version = ">= 1.0.0"
-
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = ">= 3.72"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = ">= 2.10"
-    }
-    helm = {
-      source  = "hashicorp/helm"
-      version = ">= 2.4.1"
-    }
-    kubectl = {
-      source  = "gavinbunney/kubectl"
-      version = ">= 1.14"
-    }
-  }
-
-  # ##  Used for end-to-end testing on project; update to suit your needs
-  # backend "s3" {
-  #   bucket = "doeks-github-actions-e2e-test-state"
-  #   region = "us-west-2"
-  #   key    = "e2e/bionemo/terraform.tfstate"
-  # }
-}
diff --git a/ai-ml/bionemo/vpc.tf b/ai-ml/bionemo/vpc.tf
deleted file mode 100644
index f63ccbe0c..000000000
--- a/ai-ml/bionemo/vpc.tf
+++ /dev/null
@@ -1,57 +0,0 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-
-  database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
-#---------------------------------------------------------------
-# VPC
-#---------------------------------------------------------------
-
-module "vpc" {
-  source  = "terraform-aws-modules/vpc/aws"
-  version = "~> 5.0"
-
-  name = local.name
-  cidr = local.vpc_cidr
-  azs  = local.azs
-
-  # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
-  secondary_cidr_blocks = var.secondary_cidr_blocks
-
-  # Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB
-  private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
-
-  # ------------------------------
-  # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
-  # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
-  public_subnets = local.public_subnets
-
-  # ------------------------------
-  # Private Subnets for MLflow backend store
-  database_subnets                   = local.database_private_subnets
-  create_database_subnet_group       = true
-  create_database_subnet_route_table = true
-
-  enable_nat_gateway   = true
-  single_nat_gateway   = true
-  enable_dns_hostnames = true
-
-  public_subnet_tags = {
-    "kubernetes.io/role/elb" = 1
-  }
-
-  private_subnet_tags = {
-    "kubernetes.io/role/internal-elb" = 1
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf
index f8ae300cc..93cc471cb 100644
--- a/ai-ml/infrastructure/terraform/addons.tf
+++ b/ai-ml/infrastructure/terraform/addons.tf
@@ -640,6 +640,10 @@ resource "kubectl_manifest" "dcgm" {
   yaml_body = file("${path.module}/monitoring/dcgm.yaml")
 }
 
+resource "kubectl_manifest" "dcgm_service" { # must not reuse the name "dcgm": that resource is already declared just above
+  yaml_body = file("${path.module}/monitoring/dcgm-service.yaml")
+}
+
 data "aws_iam_policy_document" "karpenter_controller_policy" {
   statement {
     actions = [
diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf
index 465f9c152..f0606cd0e 100644
--- a/ai-ml/infrastructure/terraform/variables.tf
+++ b/ai-ml/infrastructure/terraform/variables.tf
@@ -92,7 +92,7 @@ variable "enable_jupyterhub" {
 variable "enable_volcano" {
   description = "Enable Volcano"
   type        = bool
-  default     = false
+  default     = true
 }
 variable "enable_kuberay_operator" {
   description = "Enable KubeRay Operator"
diff --git a/ai-ml/jark-stack/terraform/addons.tf b/ai-ml/jark-stack/terraform/addons.tf
deleted file mode 100644
index 4ec1171cf..000000000
--- a/ai-ml/jark-stack/terraform/addons.tf
+++ /dev/null
@@ -1,460 +0,0 @@
-#---------------------------------------------------------------
-# GP3 Encrypted Storage Class
-#---------------------------------------------------------------
-resource "kubernetes_annotations" "disable_gp2" {
-  annotations = {
-    "storageclass.kubernetes.io/is-default-class" : "false"
-  }
-  api_version = "storage.k8s.io/v1"
-  kind        = "StorageClass"
-  metadata {
-    name = "gp2"
-  }
-  force = true
-
-  depends_on = [module.eks.eks_cluster_id]
-}
-
-resource "kubernetes_storage_class" "default_gp3" {
-  metadata {
-    name = "gp3"
-    annotations = {
-      "storageclass.kubernetes.io/is-default-class" : "true"
-    }
-  }
-
-  storage_provisioner    = "ebs.csi.aws.com"
-  reclaim_policy         = "Delete"
-  allow_volume_expansion = true
-  volume_binding_mode    = "WaitForFirstConsumer"
-  parameters = {
-    fsType    = "ext4"
-    encrypted = true
-    type      = "gp3"
-  }
-
-  depends_on = [kubernetes_annotations.disable_gp2]
-}
-
-#---------------------------------------------------------------
-# IRSA for EBS CSI Driver
-#---------------------------------------------------------------
-module "ebs_csi_driver_irsa" {
-  source                = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
-  version               = "~> 5.20"
-  role_name_prefix      = format("%s-%s-", local.name, "ebs-csi-driver")
-  attach_ebs_csi_policy = true
-  oidc_providers = {
-    main = {
-      provider_arn               = module.eks.oidc_provider_arn
-      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
-    }
-  }
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# EKS Blueprints Addons
-#---------------------------------------------------------------
-module "eks_blueprints_addons" {
-  source  = "aws-ia/eks-blueprints-addons/aws"
-  version = "~> 1.2"
-
-  cluster_name      = module.eks.cluster_name
-  cluster_endpoint  = module.eks.cluster_endpoint
-  cluster_version   = module.eks.cluster_version
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------
-  # Amazon EKS Managed Add-ons
-  #---------------------------------------
-  eks_addons = {
-    aws-ebs-csi-driver = {
-      service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
-    }
-    coredns = {
-      preserve = true
-    }
-    kube-proxy = {
-      preserve = true
-    }
-    # VPC CNI uses worker node IAM role policies
-    vpc-cni = {
-      preserve = true
-    }
-  }
-
-  #---------------------------------------
-  # AWS Load Balancer Controller Add-on
-  #---------------------------------------
-  enable_aws_load_balancer_controller = true
-  # turn off the mutating webhook for services because we are using
-  # service.beta.kubernetes.io/aws-load-balancer-type: external
-  aws_load_balancer_controller = {
-    set = [{
-      name  = "enableServiceMutatorWebhook"
-      value = "false"
-    }]
-  }
-
-  #---------------------------------------
-  # Ingress Nginx Add-on
-  #---------------------------------------
-  enable_ingress_nginx = true
-  ingress_nginx = {
-    values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Karpenter Autoscaler for EKS Cluster
-  #---------------------------------------
-  enable_karpenter                  = true
-  karpenter_enable_spot_termination = true
-  karpenter_node = {
-    iam_role_additional_policies = {
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-  karpenter = {
-    chart_version       = "0.37.0"
-    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-    repository_password = data.aws_ecrpublic_authorization_token.token.password
-    source_policy_documents = [
-      data.aws_iam_policy_document.karpenter_controller_policy.json
-    ]
-  }
-
-  #---------------------------------------
-  # Argo Workflows & Argo Events
-  #---------------------------------------
-  enable_argo_workflows = true
-  argo_workflows = {
-    name       = "argo-workflows"
-    namespace  = "argo-workflows"
-    repository = "https://argoproj.github.io/argo-helm"
-    values     = [templatefile("${path.module}/helm-values/argo-workflows-values.yaml", {})]
-  }
-
-  enable_argo_events = true
-  argo_events = {
-    name       = "argo-events"
-    namespace  = "argo-events"
-    repository = "https://argoproj.github.io/argo-helm"
-    values     = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Prommetheus and Grafana stack
-  #---------------------------------------
-  #---------------------------------------------------------------
-  # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
-  # 2- Grafana Admin user: admin
-  # 3- Get sexret name from Terrafrom output: `terraform output grafana_secret_name`
-  # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id <REPLACE_WIRTH_SECRET_ID> --region $AWS_REGION --query "SecretString" --output text`
-  #---------------------------------------------------------------
-  enable_kube_prometheus_stack = true
-  kube_prometheus_stack = {
-    values = [
-      templatefile("${path.module}/helm-values/kube-prometheus.yaml", {
-        storage_class_type = kubernetes_storage_class.default_gp3.id
-      })
-    ]
-    chart_version = "48.1.1"
-    set_sensitive = [
-      {
-        name  = "grafana.adminPassword"
-        value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
-      }
-    ],
-  }
-
-  #---------------------------------------
-  # CloudWatch metrics for EKS
-  #---------------------------------------
-  enable_aws_cloudwatch_metrics = true
-  aws_cloudwatch_metrics = {
-    values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
-  }
-
-}
-
-#---------------------------------------------------------------
-# Data on EKS Kubernetes Addons
-#---------------------------------------------------------------
-
-module "data_addons" {
-  source  = "aws-ia/eks-data-addons/aws"
-  version = "1.33.0"
-
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------------------------------
-  # JupyterHub Add-on
-  #---------------------------------------------------------------
-  enable_jupyterhub = true
-  jupyterhub_helm_config = {
-    namespace        = kubernetes_namespace_v1.jupyterhub.id
-    create_namespace = false
-    values           = [file("${path.module}/helm-values/jupyterhub-values.yaml")]
-  }
-
-  enable_volcano = true
-  #---------------------------------------
-  # Kuberay Operator
-  #---------------------------------------
-  enable_kuberay_operator = true
-  kuberay_operator_helm_config = {
-    version = "1.1.1"
-    # Enabling Volcano as Batch scheduler for KubeRay Operator
-    values = [
-      <<-EOT
-      batchScheduler:
-        enabled: true
-    EOT
-    ]
-  }
-
-  #---------------------------------------------------------------
-  # NVIDIA Device Plugin Add-on
-  #---------------------------------------------------------------
-  enable_nvidia_device_plugin = true
-  nvidia_device_plugin_helm_config = {
-    version = "v0.16.1"
-    name    = "nvidia-device-plugin"
-    values = [
-      <<-EOT
-        gfd:
-          enabled: true
-        nfd:
-          worker:
-            tolerations:
-              - key: nvidia.com/gpu
-                operator: Exists
-                effect: NoSchedule
-              - operator: "Exists"
-      EOT
-    ]
-  }
-
-  #---------------------------------------
-  # EFA Device Plugin Add-on
-  #---------------------------------------
-  # IMPORTANT: Enable EFA only on nodes with EFA devices attached.
-  # Otherwise, you'll encounter the "No devices found..." error. Restart the pod after attaching an EFA device, or use a node selector to prevent incompatible scheduling.
-  enable_aws_efa_k8s_device_plugin = var.enable_aws_efa_k8s_device_plugin
-  aws_efa_k8s_device_plugin_helm_config = {
-    values = [file("${path.module}/helm-values/aws-efa-k8s-device-plugin-values.yaml")]
-  }
-
-  #---------------------------------------------------------------
-  # Kubecost Add-on
-  #---------------------------------------------------------------
-  enable_kubecost = var.enable_kubecost
-  kubecost_helm_config = {
-    values              = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})]
-    version             = "2.2.2"
-    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-    repository_password = data.aws_ecrpublic_authorization_token.token.password
-  }
-
-  #---------------------------------------------------------------
-  # Karpenter Resources Add-on
-  #---------------------------------------------------------------
-  enable_karpenter_resources = true
-  karpenter_resources_helm_config = {
-
-    g5-gpu-karpenter = {
-      values = [
-        <<-EOT
-      name: g5-gpu-karpenter
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        amiFamily: Bottlerocket
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-        blockDeviceMappings:
-          # Root device
-          - deviceName: /dev/xvda
-            ebs:
-              volumeSize: 50Gi
-              volumeType: gp3
-              encrypted: true
-          # Data device: Container resources such as images and logs
-          - deviceName: /dev/xvdb
-            ebs:
-              volumeSize: 300Gi
-              volumeType: gp3
-              encrypted: true
-              ${var.bottlerocket_data_disk_snpashot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snpashot_id}" : ""}
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodeGroupType: g5-gpu-karpenter
-        taints:
-          - key: nvidia.com/gpu
-            value: "Exists"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["g5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: [ "2xlarge", "4xlarge", "8xlarge" ]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["spot", "on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 300s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    x86-cpu-karpenter = {
-      values = [
-        <<-EOT
-      name: x86-cpu-karpenter
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        amiFamily: Bottlerocket
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[3]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        # instanceStorePolicy: RAID0
-        blockDeviceMappings:
-          # Root device
-          - deviceName: /dev/xvda
-            ebs:
-              volumeSize: 100Gi
-              volumeType: gp3
-              encrypted: true
-          # Data device: Container resources such as images and logs
-          - deviceName: /dev/xvdb
-            ebs:
-              volumeSize: 300Gi
-              volumeType: gp3
-              encrypted: true
-              ${var.bottlerocket_data_disk_snpashot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snpashot_id}" : ""}
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodeGroupType: x86-cpu-karpenter
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["m5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["spot", "on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 300s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-  }
-
-  depends_on = [
-    kubernetes_secret_v1.huggingface_token,
-    kubernetes_config_map_v1.notebook
-  ]
-}
-
-
-#---------------------------------------------------------------
-# Additional Resources
-#---------------------------------------------------------------
-
-resource "kubernetes_namespace_v1" "jupyterhub" {
-  metadata {
-    name = "jupyterhub"
-  }
-}
-
-
-resource "kubernetes_secret_v1" "huggingface_token" {
-  metadata {
-    name      = "hf-token"
-    namespace = kubernetes_namespace_v1.jupyterhub.id
-  }
-
-  data = {
-    token = var.huggingface_token
-  }
-}
-
-resource "kubernetes_config_map_v1" "notebook" {
-  metadata {
-    name      = "notebook"
-    namespace = kubernetes_namespace_v1.jupyterhub.id
-  }
-
-  data = {
-    "dogbooth.ipynb" = file("${path.module}/src/notebook/dogbooth.ipynb")
-  }
-}
-
-#---------------------------------------------------------------
-# Grafana Admin credentials resources
-# Login to AWS secrets manager with the same role as Terraform to extract the Grafana admin password with the secret name as "grafana"
-#---------------------------------------------------------------
-data "aws_secretsmanager_secret_version" "admin_password_version" {
-  secret_id  = aws_secretsmanager_secret.grafana.id
-  depends_on = [aws_secretsmanager_secret_version.grafana]
-}
-
-resource "random_password" "grafana" {
-  length           = 16
-  special          = true
-  override_special = "@_"
-}
-
-#tfsec:ignore:aws-ssm-secret-use-customer-key
-resource "aws_secretsmanager_secret" "grafana" {
-  name_prefix             = "${local.name}-oss-grafana"
-  recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
-}
-
-resource "aws_secretsmanager_secret_version" "grafana" {
-  secret_id     = aws_secretsmanager_secret.grafana.id
-  secret_string = random_password.grafana.result
-}
-
-data "aws_iam_policy_document" "karpenter_controller_policy" {
-  statement {
-    actions = [
-      "ec2:RunInstances",
-      "ec2:CreateLaunchTemplate",
-    ]
-    resources = ["*"]
-    effect    = "Allow"
-    sid       = "KarpenterControllerAdditionalPolicy"
-  }
-}
diff --git a/ai-ml/jark-stack/terraform/eks.tf b/ai-ml/jark-stack/terraform/eks.tf
deleted file mode 100644
index aaf11a9e7..000000000
--- a/ai-ml/jark-stack/terraform/eks.tf
+++ /dev/null
@@ -1,212 +0,0 @@
-#---------------------------------------------------------------
-# EKS Cluster
-#---------------------------------------------------------------
-module "eks" {
-  source  = "terraform-aws-modules/eks/aws"
-  version = "~> 19.15"
-
-  cluster_name    = local.name
-  cluster_version = var.eks_cluster_version
-
-  # if true, Your cluster API server is accessible from the internet.
-  # You can, optionally, limit the CIDR blocks that can access the public endpoint.
-  #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing.
-  cluster_endpoint_public_access = true
-
-  vpc_id = module.vpc.vpc_id
-  # Filtering only Secondary CIDR private subnets starting with "100.".
-  # Subnet IDs where the EKS Control Plane ENIs will be created
-  subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-  substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
-
-  manage_aws_auth_configmap = true
-  aws_auth_roles = [
-    # We need to add in the Karpenter node IAM role for nodes launched by Karpenter
-    {
-      rolearn  = module.eks_blueprints_addons.karpenter.node_iam_role_arn
-      username = "system:node:{{EC2PrivateDNSName}}"
-      groups = [
-        "system:bootstrappers",
-        "system:nodes",
-      ]
-    }
-  ]
-  #---------------------------------------
-  # Note: This can further restricted to specific required for each Add-on and your application
-  #---------------------------------------
-  # Extend cluster security group rules
-  cluster_security_group_additional_rules = {
-    ingress_nodes_ephemeral_ports_tcp = {
-      description                = "Nodes on ephemeral ports"
-      protocol                   = "tcp"
-      from_port                  = 0
-      to_port                    = 65535
-      type                       = "ingress"
-      source_node_security_group = true
-    }
-  }
-
-  node_security_group_additional_rules = {
-    # Allows Control Plane Nodes to talk to Worker nodes on all ports.
-    # Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
-    # This can be restricted further to specific port based on the requirement for each Add-on
-    # e.g., coreDNS 53, metrics-server 4443.
-    # Update this according to your security requirements if needed
-    ingress_cluster_to_node_all_traffic = {
-      description                   = "Cluster API to Nodegroup all traffic"
-      protocol                      = "-1"
-      from_port                     = 0
-      to_port                       = 0
-      type                          = "ingress"
-      source_cluster_security_group = true
-    }
-  }
-
-  eks_managed_node_group_defaults = {
-    iam_role_additional_policies = {
-      # Not required, but used in the example to access the nodes to inspect mounted volumes
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-
-    ebs_optimized = true
-    # This block device is used only for root volume. Adjust volume according to your size.
-    # NOTE: Don't use this volume for ML workloads
-    block_device_mappings = {
-      xvda = {
-        device_name = "/dev/xvda"
-        ebs = {
-          volume_size = 100
-          volume_type = "gp3"
-        }
-      }
-    }
-  }
-
-  eks_managed_node_groups = {
-    #  It's recommended to have a Managed Node group for hosting critical add-ons
-    #  It's recommended to use Karpenter to place your workloads instead of using Managed Node groups
-    #  You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
-    core_node_group = {
-      name        = "core-node-group"
-      description = "EKS Core node group for hosting system add-ons"
-      # Filtering only Secondary CIDR private subnets starting with "100.".
-      # Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-      )
-
-      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2
-      ami_type     = "AL2_x86_64" # Use this for Graviton AL2_ARM_64
-      min_size     = 2
-      max_size     = 8
-      desired_size = 2
-
-      instance_types = ["m5.xlarge"]
-
-      labels = {
-        WorkerType    = "ON_DEMAND"
-        NodeGroupType = "core"
-      }
-
-      tags = merge(local.tags, {
-        Name = "core-node-grp"
-      })
-    }
-
-    # GPU Nodegroup for JupyterHub Notebook and Ray Service
-    gpu1 = {
-      name        = "gpu-node-grp"
-      description = "EKS Node Group to run GPU workloads"
-      # Filtering only Secondary CIDR private subnets starting with "100.".
-      # Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-      )
-
-      ami_type     = "AL2_x86_64_GPU"
-      min_size     = 0
-      max_size     = 1
-      desired_size = 0
-
-      instance_types = ["g5.12xlarge"]
-
-      labels = {
-        WorkerType    = "ON_DEMAND"
-        NodeGroupType = "gpu"
-      }
-
-      taints = {
-        gpu = {
-          key      = "nvidia.com/gpu"
-          effect   = "NO_SCHEDULE"
-          operator = "EXISTS"
-        }
-      }
-
-      tags = merge(local.tags, {
-        Name = "gpu-node-grp"
-      })
-    }
-
-    # # This nodegroup can be used for P4/P5 instances with, or without, a Capacity Reservation.
-    # #
-    # gpu_p5_node_group = {
-    #   name        = "p5-gpu-node-grp"
-    #   description = "EKS Node Group to run GPU workloads"
-
-    #   ami_type     = "AL2_x86_64_GPU"
-
-    #   instance_types = ["p5.48xlarge"]
-    #   capacity_type = "ON_DEMAND"
-
-    #   # Filtering only Secondary CIDR private subnets starting with "100.".
-    #   # Subnet IDs where the nodes/node groups will be provisioned
-    #   subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-    #     substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-    #   )
-
-    #   # If you are using a Capacity Reservation, the Subnet for the instances must match AZ for the reservation.
-    #   # subnet_ids = ["subnet-01234567890fds"]
-    #   # capacity_reservation_specification = {
-    #   #   capacity_reservation_target = {
-    #   #     capacity_reservation_id = "cr-01234567890fds"
-    #   #   }
-    #   # }
-
-    #   min_size     = 1
-    #   max_size     = 1
-    #   desired_size = 1
-
-    #   # The P Series can leverage EFA devices, below we attach EFA interfaces to all of the available slots to the instance
-    #   # we assign the host interface device_index=0, and all other interfaces device_index=1
-    #   #   p5.48xlarge has 32 network card indexes so the range should be 31, we'll create net interfaces 0-31
-    #   #   p4 instances have 4 network card indexes so the range should be 4, we'll create Net interfaces 0-3
-    #   network_interfaces = [
-    #     for i in range(32) : {
-    #       associate_public_ip_address = false
-    #       delete_on_termination       = true
-    #       device_index                = i == 0 ? 0 : 1
-    #       network_card_index          = i
-    #       interface_type              = "efa"
-    #     }
-    #   ]
-
-    #   # add `--local-disks raid0` to use the NVMe devices underneath the Pods, kubelet, containerd, and logs: https://github.com/awslabs/amazon-eks-ami/pull/1171
-    #   bootstrap_extra_args = "--local-disks raid0"
-    #   taints = {
-    #     gpu = {
-    #       key      = "nvidia.com/gpu"
-    #       effect   = "NO_SCHEDULE"
-    #       operator = "EXISTS"
-    #     }
-    #   }
-    #   labels = {
-    #     WorkerType    = "ON_DEMAND"
-    #     NodeGroupType = "gpu"
-    #   }
-    #   tags = merge(local.tags, {
-    #     Name = "p5-gpu-node-grp"
-    #   })
-    # }
-  }
-}
diff --git a/ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml b/ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml
deleted file mode 100644
index de495c16a..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-## Argo Events admission webhook
-webhook:
-  # -- Enable admission webhook. Applies only for cluster-wide installation
-  enabled: true
diff --git a/ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml b/ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml
deleted file mode 100644
index 2f6c9e729..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-server:
-  autoscaling:
-    enabled: true
-    minReplicas: 1
-  serviceType: LoadBalancer
diff --git a/ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml
deleted file mode 100644
index ae3c41d44..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-resources:
-  limits:
-    cpu: 500m
-    memory: 2Gi
-  requests:
-    cpu: 200m
-    memory: 1Gi
-
-# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints.
-tolerations:
-  - operator: Exists
diff --git a/ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml b/ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml
deleted file mode 100644
index c214e10ba..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-tolerations:
-  - key: nvidia.com/gpu
-    operator: Exists
-    effect: NoSchedule
-  - operator: "Exists"
diff --git a/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml b/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml
deleted file mode 100644
index c8b1a5d74..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-controller:
-  service:
-    externalTrafficPolicy: "Local"
-    annotations:
-      service.beta.kubernetes.io/aws-load-balancer-type: external
-      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
-      service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC
-    targetPorts:
-      http: http
-      https: http
diff --git a/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml b/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml
deleted file mode 100644
index fcad06b62..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-hub:
-  config:
-    Authenticator:
-      admin_users:
-        - admin1
-      allowed_users:
-        - user1
-    # testing only - do not do this for production
-    DummyAuthenticator:
-      password: never-do-this
-    JupyterHub:
-      authenticator_class: dummy
-proxy:
-  service:
-    annotations:
-      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-      service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC
-      service.beta.kubernetes.io/aws-load-balancer-type: external
-      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
-      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
-singleuser:
-  image:
-    name: public.ecr.aws/h3o5n2r0/gpu-jupyter
-    tag: v1.5_cuda-11.6_ubuntu-20.04_python-only
-    pullPolicy: Always
-  cmd: null
-  startTimeout: 600
-  memory:
-    guarantee: 24G
-  extraResource:
-    limits:
-      nvidia.com/gpu: "1"
-  extraEnv:
-    HUGGING_FACE_HUB_TOKEN:
-      valueFrom:
-        secretKeyRef:
-          name: hf-token
-          key: token
-  storage:
-    capacity: 100Gi
-    extraVolumes:
-      - name: shm-volume
-        emptyDir:
-          medium: Memory
-      - name: notebook
-        configMap:
-          name: notebook
-    extraVolumeMounts:
-      - name: shm-volume
-        mountPath: /dev/shm
-      - name: notebook
-        mountPath: /home/jovyan/dogbooth
-  extraTolerations:
-    - key: nvidia.com/gpu
-      operator: Exists
-      effect: NoSchedule
-scheduling:
-  userScheduler:
-    enabled: false
diff --git a/ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml b/ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml
deleted file mode 100644
index 47e090743..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-prometheus:
-  prometheusSpec:
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: ${storage_class_type}
-          accessModes:
-          - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
-prometheus:
-  prometheusSpec:
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: ${storage_class_type}
-          accessModes:
-          - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
diff --git a/ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml b/ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml
deleted file mode 100644
index 178eb68cf..000000000
--- a/ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-
-# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090
-
-global:
-  # pricingCsv:
-  #   enabled: false
-  #   location:
-  #     provider: "AWS"
-  #     region: "us-east-1"
-  #     URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI
-  #     csvAccessCredentials: pricing-schema-access-secret
-
-  prometheus:
-    enabled: true  # Kubecost depends on Prometheus data, it is not optional. When enabled: false, Prometheus will not be installed and you must configure your own Prometheus to scrape kubecost as well as provide the fqdn below. -- Warning: Before changing this setting, please read to understand the risks https://docs.kubecost.com/install-and-configure/install/custom-prom
-    fqdn: http://cost-analyzer-prometheus-server.default.svc  # example address of a prometheus to connect to. Include protocol (http:// or https://) Ignored if enabled: true
-
-  grafana:
-    enabled: true  # If false, Grafana will not be installed
-    domainName: cost-analyzer-grafana.default.svc  # example grafana domain Ignored if enabled: true
-    scheme: "http"  # http or https, for the domain name above.
-    proxy: true  # If true, the kubecost frontend will route to your grafana through its service endpoint
-
-kubecostFrontend:
-  image: public.ecr.aws/kubecost/frontend
-  resources:
-    requests:
-      cpu: "200m"
-      memory: "512Mi"
-
-kubecostMetrics:
-  emitPodAnnotations: true
-  emitNamespaceAnnotations: true
-
-kubecostModel:
-  image: public.ecr.aws/kubecost/cost-model
-  resources:
-    requests:
-      cpu: "500m"
-      memory: "512Mi"
-
-forecasting:
-  fullImageName: public.ecr.aws/kubecost/kubecost-modeling:v0.1.6
-
-networkCosts:
-  image:
-    repository: public.ecr.aws/kubecost/kubecost-network-costs
-
-clusterController:
-  image:
-    repository: public.ecr.aws/kubecost/cluster-controller
-
-prometheus:
-  server:
-    image:
-      repository: public.ecr.aws/kubecost/prometheus
-
-  configmapReload:
-    prometheus:
-      image:
-        repository: public.ecr.aws/kubecost/prometheus-config-reloader
-
-reporting:
-  productAnalytics: false
-
-# Define persistence volume for cost-analyzer
-persistentVolume:
-  size: 32Gi
-  dbSize: 32.0Gi
-  enabled: true # Note that setting this to false means configurations will be wiped out on pod restart.
diff --git a/ai-ml/jark-stack/terraform/karpenter.tf b/ai-ml/jark-stack/terraform/karpenter.tf
deleted file mode 100644
index e69de29bb..000000000
diff --git a/ai-ml/jark-stack/terraform/main.tf b/ai-ml/jark-stack/terraform/main.tf
deleted file mode 100644
index f93511951..000000000
--- a/ai-ml/jark-stack/terraform/main.tf
+++ /dev/null
@@ -1,51 +0,0 @@
-provider "aws" {
-  region = local.region
-}
-
-# ECR always authenticates with `us-east-1` region
-# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
-provider "aws" {
-  alias  = "ecr"
-  region = "us-east-1"
-}
-
-provider "kubernetes" {
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = module.eks.cluster_endpoint
-    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-    token                  = data.aws_eks_cluster_auth.this.token
-  }
-}
-provider "kubectl" {
-  apply_retry_count      = 30
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  token                  = data.aws_eks_cluster_auth.this.token
-  load_config_file       = false
-}
-
-data "aws_eks_cluster_auth" "this" {
-  name = module.eks.cluster_name
-}
-
-data "aws_availability_zones" "available" {}
-
-data "aws_ecrpublic_authorization_token" "token" {
-  provider = aws.ecr
-}
-
-locals {
-  name   = var.name
-  region = var.region
-  azs    = slice(data.aws_availability_zones.available.names, 0, 2)
-  tags = {
-    Blueprint  = local.name
-    GithubRepo = "github.com/awslabs/data-on-eks"
-  }
-}
diff --git a/ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml b/ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml
deleted file mode 100644
index 8ade99739..000000000
--- a/ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml
+++ /dev/null
@@ -1,21 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: PodMonitor
-metadata:
-  name: ray-workers-monitor
-  namespace: kube-prometheus-stack
-  labels:
-    # `release: $HELM_RELEASE`: Prometheus can only detect PodMonitor with this label.
-    release: kube-prometheus-stack
-spec:
-  jobLabel: ray-workers
-  # Only select Kubernetes Pods in the "default" namespace.
-  namespaceSelector:
-    matchNames:
-      - rayserve-vllm
-  # Only select Kubernetes Pods with "matchLabels".
-  selector:
-    matchLabels:
-      ray.io/node-type: worker
-  # A list of endpoints allowed as part of this PodMonitor.
-  podMetricsEndpoints:
-  - port: metrics
diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json
deleted file mode 100644
index 26d11b3f1..000000000
--- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json
+++ /dev/null
@@ -1,4535 +0,0 @@
-{
-    "annotations": {
-        "list": [
-            {
-                "builtIn": 1,
-                "datasource": "-- Grafana --",
-                "enable": true,
-                "hide": true,
-                "iconColor": "rgba(0, 211, 255, 1)",
-                "name": "Annotations & Alerts",
-                "type": "dashboard"
-            }
-        ]
-    },
-    "editable": true,
-    "gnetId": null,
-    "graphTooltip": 0,
-    "iteration": 1667344411089,
-    "links": [],
-    "panels": [
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 0
-            },
-            "hiddenSeries": false,
-            "id": 1,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Bytes Spilled",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Amount allocated by dataset operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 0
-            },
-            "hiddenSeries": false,
-            "id": 2,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Bytes Allocated",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Amount freed by dataset operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 1
-            },
-            "hiddenSeries": false,
-            "id": 3,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Bytes Freed",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Amount of memory store used by dataset operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 1
-            },
-            "hiddenSeries": false,
-            "id": 4,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Current Usage: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Object Store Memory",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Logical CPUs allocated to dataset operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 2
-            },
-            "hiddenSeries": false,
-            "id": 5,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "CPU Usage: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "CPUs (logical slots)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "cores",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Logical GPUs allocated to dataset operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 2
-            },
-            "hiddenSeries": false,
-            "id": 6,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "GPU Usage: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "GPUs (logical slots)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "cores",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Total bytes outputted by dataset operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 3
-            },
-            "hiddenSeries": false,
-            "id": 7,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Outputted: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Bytes Outputted",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Total rows outputted by dataset operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 3
-            },
-            "hiddenSeries": false,
-            "id": 11,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Rows Outputted: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Rows Outputted",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "rows",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of input blocks received by operator.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 4
-            },
-            "hiddenSeries": false,
-            "id": 17,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Blocks Received: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Input Blocks Received by Operator",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "blocks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of input blocks received by operator.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 4
-            },
-            "hiddenSeries": false,
-            "id": 18,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Received: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Input Blocks Received by Operator",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of input blocks that operator's tasks have finished processing.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 5
-            },
-            "hiddenSeries": false,
-            "id": 19,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Blocks Processed: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Input Blocks Processed by Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "blocks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of input blocks that operator's tasks have finished processing.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 5
-            },
-            "hiddenSeries": false,
-            "id": 20,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Processed: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Input Bytes Processed by Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of input blocks passed to submitted tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 6
-            },
-            "hiddenSeries": false,
-            "id": 21,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Submitted: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Input Bytes Submitted to Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of output blocks generated by tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 6
-            },
-            "hiddenSeries": false,
-            "id": 22,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Blocks Generated: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Blocks Generated by Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "blocks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of output blocks generated by tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 7
-            },
-            "hiddenSeries": false,
-            "id": 23,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Generated: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Bytes Generated by Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of rows in generated output blocks from finished tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 7
-            },
-            "hiddenSeries": false,
-            "id": 24,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Rows Generated: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Rows Generated by Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "rows",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of output blocks that are already taken by downstream operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 8
-            },
-            "hiddenSeries": false,
-            "id": 25,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Blocks Taken: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Output Blocks Taken by Downstream Operators",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "blocks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of output blocks that are already taken by downstream operators.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 8
-            },
-            "hiddenSeries": false,
-            "id": 26,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Taken: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Output Bytes Taken by Downstream Operators",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of submitted tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 9
-            },
-            "hiddenSeries": false,
-            "id": 29,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Submitted Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of running tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 9
-            },
-            "hiddenSeries": false,
-            "id": 30,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Running Tasks: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Running Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of tasks that already have output.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 10
-            },
-            "hiddenSeries": false,
-            "id": 31,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Tasks with output blocks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of finished tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 10
-            },
-            "hiddenSeries": false,
-            "id": 32,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Finished Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of failed tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 11
-            },
-            "hiddenSeries": false,
-            "id": 33,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Failed Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Time spent generating blocks in tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 11
-            },
-            "hiddenSeries": false,
-            "id": 8,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Block Generation Time",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "seconds",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Time spent in task submission backpressure.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 12
-            },
-            "hiddenSeries": false,
-            "id": 37,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Task Submission Backpressure Time",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "seconds",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of blocks in operator's internal input queue",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 12
-            },
-            "hiddenSeries": false,
-            "id": 13,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Operator Internal Inqueue Size (Blocks)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "blocks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of input blocks in the operator's internal input queue.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 13
-            },
-            "hiddenSeries": false,
-            "id": 14,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Operator Internal Inqueue Size (Bytes)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of blocks in operator's internal output queue",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 13
-            },
-            "hiddenSeries": false,
-            "id": 15,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Operator Internal Outqueue Size (Blocks)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "blocks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of output blocks in the operator's internal output queue.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 14
-            },
-            "hiddenSeries": false,
-            "id": 16,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Operator Internal Outqueue Size (Bytes)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of input blocks used by pending tasks.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 14
-            },
-            "hiddenSeries": false,
-            "id": 34,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Size of Blocks used in Pending Tasks (Bytes)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of freed memory in object store.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 15
-            },
-            "hiddenSeries": false,
-            "id": 35,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Freed Memory in Object Store (Bytes)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Byte size of spilled memory in object store.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 15
-            },
-            "hiddenSeries": false,
-            "id": 36,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)",
-                    "interval": "",
-                    "legendFormat": "Bytes Size: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Spilled Memory in Object Store (Bytes)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Seconds spent in iterator initialization code",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 16
-            },
-            "hiddenSeries": false,
-            "id": 12,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)",
-                    "interval": "",
-                    "legendFormat": "Seconds: {{dataset}}, {{operator}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Iteration Initialization Time",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "seconds",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Seconds user thread is blocked by iter_batches()",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 16
-            },
-            "hiddenSeries": false,
-            "id": 9,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)",
-                    "interval": "",
-                    "legendFormat": "Seconds: {{dataset}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Iteration Blocked Time",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "seconds",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Seconds spent in user code",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 17
-            },
-            "hiddenSeries": false,
-            "id": 10,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)",
-                    "interval": "",
-                    "legendFormat": "Seconds: {{dataset}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Iteration User Time",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "seconds",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        }
-    ],
-    "refresh": false,
-    "schemaVersion": 27,
-    "style": "dark",
-    "tags": [
-        "rayVersion:2.24.0"
-    ],
-    "templating": {
-        "list": [
-            {
-                "current": {
-                    "selected": false
-                },
-                "description": "Filter queries of a specific Prometheus type.",
-                "hide": 2,
-                "includeAll": false,
-                "multi": false,
-                "name": "datasource",
-                "options": [],
-                "query": "prometheus",
-                "refresh": 1,
-                "regex": "",
-                "skipUrlSync": false,
-                "type": "datasource"
-            },
-            {
-                "allValue": ".+",
-                "current": {
-                    "selected": false
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_data_allocated_bytes{}, SessionName)",
-                "description": "Filter queries to specific ray sessions.",
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": false,
-                "name": "SessionName",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_data_allocated_bytes{}, SessionName)",
-                    "refId": "StandardVariableQuery"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 2,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            },
-            {
-                "allValue": ".+",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_data_allocated_bytes{}, dataset)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": true,
-                "name": "DatasetID",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_data_allocated_bytes{}, dataset)",
-                    "refId": "Prometheus-Dataset-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            }
-        ]
-    },
-    "rayMeta": [
-        "excludesSystemRoutes",
-        "supportsGlobalFilterOverride"
-    ],
-    "time": {
-        "from": "now-30m",
-        "to": "now"
-    },
-    "timepicker": {},
-    "timezone": "",
-    "title": "Data Dashboard",
-    "uid": "rayDataDashboard",
-    "version": 1
-}
diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json
deleted file mode 100644
index 7814395f5..000000000
--- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json
+++ /dev/null
@@ -1,2836 +0,0 @@
-{
-    "annotations": {
-        "list": [
-            {
-                "builtIn": 1,
-                "datasource": "-- Grafana --",
-                "enable": true,
-                "hide": true,
-                "iconColor": "rgba(0, 211, 255, 1)",
-                "name": "Annotations & Alerts",
-                "type": "dashboard"
-            }
-        ]
-    },
-    "editable": true,
-    "gnetId": null,
-    "graphTooltip": 0,
-    "iteration": 1667344411089,
-    "links": [],
-    "panels": [
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 0
-            },
-            "hiddenSeries": false,
-            "id": 26,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)",
-                    "interval": "",
-                    "legendFormat": "{{State}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)",
-                    "interval": "",
-                    "legendFormat": "{{State}} (retry)",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Scheduler Task State",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 0
-            },
-            "hiddenSeries": false,
-            "id": 35,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)",
-                    "interval": "",
-                    "legendFormat": "{{Name}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)",
-                    "interval": "",
-                    "legendFormat": "{{Name}} (retry)",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Active Tasks by Name",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 1
-            },
-            "hiddenSeries": false,
-            "id": 33,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_actors{SessionName=~\"$SessionName\",}) by (State)",
-                    "interval": "",
-                    "legendFormat": "{{State}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Scheduler Actor State",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "actors",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Current number of (live) actors with a particular name.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 1
-            },
-            "hiddenSeries": false,
-            "id": 36,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=~\"$SessionName\",}) by (Name)",
-                    "interval": "",
-                    "legendFormat": "{{Name}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Active Actors by Name",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "actors",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 2
-            },
-            "hiddenSeries": false,
-            "id": 27,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=~\"$SessionName\",}) by (instance)",
-                    "interval": "",
-                    "legendFormat": "CPU Usage: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_resources{Name=\"CPU\",SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)))",
-                    "interval": "",
-                    "legendFormat": "MAX + PENDING",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Scheduler CPUs (logical slots)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "cores",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 2
-            },
-            "hiddenSeries": false,
-            "id": 29,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) by (Location)",
-                    "interval": "",
-                    "legendFormat": "{{Location}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Object Store Memory",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 3
-            },
-            "hiddenSeries": false,
-            "id": 28,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "GPU Usage: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_resources{Name=\"GPU\",SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)))",
-                    "interval": "",
-                    "legendFormat": "MAX + PENDING",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Scheduler GPUs (logical slots)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "GPUs",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 3
-            },
-            "hiddenSeries": false,
-            "id": 40,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",}) by (State)",
-                    "interval": "",
-                    "legendFormat": "{{State}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Scheduler Placement Groups",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "placement groups",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 4
-            },
-            "hiddenSeries": false,
-            "id": 2,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100",
-                    "interval": "",
-                    "legendFormat": "CPU Usage: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node CPU (hardware utilization)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "cores",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 4
-            },
-            "hiddenSeries": false,
-            "id": 8,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100",
-                    "interval": "",
-                    "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_gpus_available{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node GPU (hardware utilization)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "GPUs",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 5
-            },
-            "hiddenSeries": false,
-            "id": 6,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "Disk Used: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node Disk",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Disk IO per node.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 5
-            },
-            "hiddenSeries": false,
-            "id": 32,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "Write: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "Read: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node Disk IO Speed",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "Bps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 6
-            },
-            "hiddenSeries": false,
-            "id": 4,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "Memory Used: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node Memory (heap + object store)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 6
-            },
-            "hiddenSeries": false,
-            "id": 44,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "OOM Killed: {{Name}}, {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node Out of Memory Failures by Name",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "failures",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 7
-            },
-            "hiddenSeries": false,
-            "id": 34,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "(sum(ray_component_rss_mb{SessionName=~\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=~\"$SessionName\",}) by (Component))",
-                    "interval": "",
-                    "legendFormat": "{{Component}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_mem_shared_bytes{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "shared_memory",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node Memory by Component",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 7
-            },
-            "hiddenSeries": false,
-            "id": 37,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_component_cpu_percentage{SessionName=~\"$SessionName\",}) by (Component) / 100",
-                    "interval": "",
-                    "legendFormat": "{{Component}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node CPU by Component",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "cores",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 8
-            },
-            "hiddenSeries": false,
-            "id": 18,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * 1024 * 1024",
-                    "interval": "",
-                    "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "(sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 1024 * 1024",
-                    "interval": "",
-                    "legendFormat": "MAX",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node GPU Memory (GRAM)",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "bytes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Network speed per node",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 8
-            },
-            "hiddenSeries": false,
-            "id": 20,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "Recv: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}",
-                    "interval": "",
-                    "legendFormat": "Send: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node Network",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "Bps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 0,
-                "y": 9
-            },
-            "hiddenSeries": false,
-            "id": 24,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",}) by (NodeType)",
-                    "interval": "",
-                    "legendFormat": "Active Nodes: {{NodeType}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",}) by (NodeType)",
-                    "interval": "",
-                    "legendFormat": "Failed Nodes: {{NodeType}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",}) by (NodeType)",
-                    "interval": "",
-                    "legendFormat": "Pending Nodes: {{NodeType}}",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node Count",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "nodes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "h": 8,
-                "w": 12,
-                "x": 12,
-                "y": 9
-            },
-            "hiddenSeries": false,
-            "id": 41,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "avg(ray_node_cpu_utilization{SessionName=~\"$SessionName\",})",
-                    "interval": "",
-                    "legendFormat": "CPU (physical)",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_gpus_utilization{SessionName=~\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=~\"$SessionName\",}) or vector(0))",
-                    "interval": "",
-                    "legendFormat": "GPU (physical)",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_mem_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=~\"$SessionName\",})) * 100",
-                    "interval": "",
-                    "legendFormat": "Memory (RAM)",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_gram_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 100",
-                    "interval": "",
-                    "legendFormat": "GRAM",
-                    "queryType": "randomWalk",
-                    "refId": "D"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",}) * 100",
-                    "interval": "",
-                    "legendFormat": "Object Store Memory",
-                    "queryType": "randomWalk",
-                    "refId": "E"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_disk_usage{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})) * 100",
-                    "interval": "",
-                    "legendFormat": "Disk",
-                    "queryType": "randomWalk",
-                    "refId": "F"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Cluster Utilization",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "%",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        }
-    ],
-    "refresh": false,
-    "schemaVersion": 27,
-    "style": "dark",
-    "tags": [
-        "rayVersion:2.24.0"
-    ],
-    "templating": {
-        "list": [
-            {
-                "current": {
-                    "selected": false
-                },
-                "description": "Filter queries of a specific Prometheus type.",
-                "hide": 2,
-                "includeAll": false,
-                "multi": false,
-                "name": "datasource",
-                "options": [],
-                "query": "prometheus",
-                "refresh": 1,
-                "regex": "",
-                "skipUrlSync": false,
-                "type": "datasource"
-            },
-            {
-                "allValue": ".+",
-                "current": {
-                    "selected": false
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_node_network_receive_speed{}, SessionName)",
-                "description": "Filter queries to specific ray sessions.",
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": false,
-                "name": "SessionName",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_node_network_receive_speed{}, SessionName)",
-                    "refId": "StandardVariableQuery"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 2,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            },
-            {
-                "allValue": ".+",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": true,
-                "name": "Instance",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            }
-        ]
-    },
-    "time": {
-        "from": "now-30m",
-        "to": "now"
-    },
-    "timepicker": {},
-    "timezone": "",
-    "title": "Default Dashboard",
-    "uid": "rayDefaultDashboard",
-    "version": 4,
-    "rayMeta": [
-        "supportsGlobalFilterOverride"
-    ]
-}
diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json
deleted file mode 100644
index 8648e308a..000000000
--- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json
+++ /dev/null
@@ -1,2115 +0,0 @@
-{
-    "annotations": {
-        "list": [
-            {
-                "builtIn": 1,
-                "datasource": "-- Grafana --",
-                "enable": true,
-                "hide": true,
-                "iconColor": "rgba(0, 211, 255, 1)",
-                "name": "Annotations & Alerts",
-                "type": "dashboard"
-            }
-        ]
-    },
-    "editable": true,
-    "gnetId": null,
-    "graphTooltip": 0,
-    "iteration": 1667344411089,
-    "links": [],
-    "panels": [
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of replicas per deployment. Ignores \"Route\" variable.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 0,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 1,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Replicas per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "replicas",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "QPS for each replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 0,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 2,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_deployment_request_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "QPS per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "qps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Error QPS for each replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 0,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 3,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_deployment_error_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Error QPS per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "qps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P50 latency per replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 1,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 4,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P50 latency per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P90 latency per replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 1,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 5,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P90 latency per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P99 latency per replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 1,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 6,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P99 latency per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of requests queued per deployment. Ignores \"Replica\" and \"Route\" variable.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 2,
-                "w": 12,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 7,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Queue size per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "requests",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Current running requests for each replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 12,
-                "y": 2,
-                "w": 12,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 8,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Running requests per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "requests",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of multiplexed models for each replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 3,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 9,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Multiplexed models per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "models",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of times of multiplexed models loaded for each replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 3,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 10,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Multiplexed model loads per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "times",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of times of multiplexed models unloaded for each replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 3,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 11,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_multiplexed_models_unload_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Multiplexed model unloads per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "times",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P99 latency of mutliplexed model load per replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 4,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 12,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P99 latency of multiplexed model loads per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P99 latency of mutliplexed model unload per replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 4,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 13,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P99 latency of multiplexed model unloads per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The ids of multiplexed models for each replica.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 4,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 14,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}",
-                    "interval": "",
-                    "legendFormat": "{{replica}}:{{model_id}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Multiplexed model ids per replica",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "model",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The cache hit rate of multiplexed models for the deployment.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 5,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 15,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])))",
-                    "interval": "",
-                    "legendFormat": "{{replica}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Multiplexed model cache hit rate",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "%",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        }
-    ],
-    "refresh": false,
-    "schemaVersion": 27,
-    "style": "dark",
-    "tags": [
-        "rayVersion:2.24.0"
-    ],
-    "templating": {
-        "list": [
-            {
-                "current": {
-                    "selected": false
-                },
-                "description": "Filter queries to specific prometheus type.",
-                "hide": 2,
-                "includeAll": false,
-                "multi": false,
-                "name": "datasource",
-                "options": [],
-                "query": "prometheus",
-                "refresh": 1,
-                "regex": "",
-                "skipUrlSync": false,
-                "type": "datasource"
-            },
-            {
-                "allValue": ".*",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": true,
-                "name": "Application",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_serve_deployment_replica_healthy{}, application)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            },
-            {
-                "allValue": ".*",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": true,
-                "name": "Deployment",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            },
-            {
-                "allValue": ".*",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": true,
-                "name": "Replica",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            },
-            {
-                "allValue": ".*",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": true,
-                "name": "Route",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            }
-        ]
-    },
-    "rayMeta": [
-        "excludesSystemRoutes",
-        "supportsGlobalFilterOverride"
-    ],
-    "time": {
-        "from": "now-30m",
-        "to": "now"
-    },
-    "timepicker": {},
-    "timezone": "",
-    "title": "Serve Deployment Dashboard",
-    "uid": "rayServeDeploymentDashboard",
-    "version": 1
-}
diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json
deleted file mode 100644
index 4d1ec6e8e..000000000
--- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json
+++ /dev/null
@@ -1,3098 +0,0 @@
-{
-    "annotations": {
-        "list": [
-            {
-                "builtIn": 1,
-                "datasource": "-- Grafana --",
-                "enable": true,
-                "hide": true,
-                "iconColor": "rgba(0, 211, 255, 1)",
-                "name": "Annotations & Alerts",
-                "type": "dashboard"
-            }
-        ]
-    },
-    "editable": true,
-    "gnetId": null,
-    "graphTooltip": 0,
-    "iteration": 1667344411089,
-    "links": [],
-    "panels": [
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster. Ignores application variable.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 0,
-                "w": 12,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 5,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "avg(ray_node_cpu_utilization{})",
-                    "interval": "",
-                    "legendFormat": "CPU (physical)",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_gpus_utilization{}) / on() (sum(autoscaler_cluster_resources{resource='GPU',}) or vector(0))",
-                    "interval": "",
-                    "legendFormat": "GPU (physical)",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_mem_used{}) / on() (sum(ray_node_mem_total{})) * 100",
-                    "interval": "",
-                    "legendFormat": "Memory (RAM)",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_gram_used{}) / on() (sum(ray_node_gram_available{}) + sum(ray_node_gram_used{})) * 100",
-                    "interval": "",
-                    "legendFormat": "GRAM",
-                    "queryType": "randomWalk",
-                    "refId": "D"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_object_store_memory{}) / on() sum(ray_resources{Name=\"object_store_memory\",}) * 100",
-                    "interval": "",
-                    "legendFormat": "Object Store Memory",
-                    "queryType": "randomWalk",
-                    "refId": "E"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_disk_usage{}) / on() (sum(ray_node_disk_free{}) + sum(ray_node_disk_usage{})) * 100",
-                    "interval": "",
-                    "legendFormat": "Disk",
-                    "queryType": "randomWalk",
-                    "refId": "F"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Cluster Utilization",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "%",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "QPS for each selected application.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 12,
-                "y": 0,
-                "w": 12,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 7,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_num_http_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)",
-                    "interval": "",
-                    "legendFormat": "{{application, route}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_num_grpc_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)",
-                    "interval": "",
-                    "legendFormat": "{{application, method}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "QPS per application",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "qps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Error QPS for each selected application.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 1,
-                "w": 12,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 8,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)",
-                    "interval": "",
-                    "legendFormat": "{{application, route}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)",
-                    "interval": "",
-                    "legendFormat": "{{application, method}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Error QPS per application",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "qps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Error QPS for each selected application.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 12,
-                "y": 1,
-                "w": 12,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 17,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, error_code)",
-                    "interval": "",
-                    "legendFormat": "{{application, route, error_code}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, error_code)",
-                    "interval": "",
-                    "legendFormat": "{{application, method, error_code}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Error QPS per application per error code",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "qps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P50 latency for selected applications.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 2,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 12,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, route}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, method}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P50 latency per application",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P90 latency for selected applications.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 2,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 15,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, route}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, method}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P90 latency per application",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P99 latency for selected applications.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 2,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 16,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, route}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, method}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P99 latency per application",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of replicas per deployment. Ignores \"Application\" variable.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 3,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 2,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_deployment_replica_healthy{}) by (application, deployment)",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Replicas per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "replicas",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "QPS for each deployment.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 3,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 13,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_deployment_request_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "QPS per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "qps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Error QPS for each deployment.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 3,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 14,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(rate(ray_serve_deployment_error_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Error QPS per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "qps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P50 latency per deployment.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 4,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 9,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P50 latency per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P90 latency per deployment.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 4,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 10,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P90 latency per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "P99 latency per deployment.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 4,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 11,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))",
-                    "interval": "",
-                    "legendFormat": "Total",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "P99 latency per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "ms",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of requests queued per deployment. Ignores \"Application\" variable.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 0,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 5,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 3,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_serve_deployment_queued_queries{}) by (application, deployment)",
-                    "interval": "",
-                    "legendFormat": "{{application, deployment}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Queue size per deployment",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "requests",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Number of nodes in this cluster. Ignores \"Application\" variable.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 5,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 4,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(autoscaler_active_nodes{}) by (NodeType)",
-                    "interval": "",
-                    "legendFormat": "Active Nodes: {{NodeType}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(autoscaler_recently_failed_nodes{}) by (NodeType)",
-                    "interval": "",
-                    "legendFormat": "Failed Nodes: {{NodeType}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(autoscaler_pending_nodes{}) by (NodeType)",
-                    "interval": "",
-                    "legendFormat": "Pending Nodes: {{NodeType}}",
-                    "queryType": "randomWalk",
-                    "refId": "C"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node count",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "nodes",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "Network speed per node. Ignores \"Application\" variable.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 1,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 5,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 6,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 2,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": false,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_network_receive_speed{}) by (instance)",
-                    "interval": "",
-                    "legendFormat": "Recv: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                },
-                {
-                    "exemplar": true,
-                    "expr": "sum(ray_node_network_send_speed{}) by (instance)",
-                    "interval": "",
-                    "legendFormat": "Send: {{instance}}",
-                    "queryType": "randomWalk",
-                    "refId": "B"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Node network",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "Bps",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of ongoing requests in the HTTP Proxy.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 6,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 20,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_serve_num_ongoing_http_requests{}",
-                    "interval": "",
-                    "legendFormat": "Ongoing HTTP Requests",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Ongoing HTTP Requests",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "requests",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of ongoing requests in the gRPC Proxy.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 6,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 21,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_serve_num_ongoing_grpc_requests{}",
-                    "interval": "",
-                    "legendFormat": "Ongoing gRPC Requests",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Ongoing gRPC Requests",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "requests",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of request scheduling tasks in the router.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 6,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 22,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_serve_num_scheduling_tasks{}",
-                    "interval": "",
-                    "legendFormat": "Scheduling Tasks",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Scheduling Tasks",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of request scheduling tasks in the router that are undergoing backoff.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 0,
-                "y": 7,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 23,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_serve_num_scheduling_tasks_in_backoff{}",
-                    "interval": "",
-                    "legendFormat": "Scheduling Tasks in Backoff",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Scheduling Tasks in Backoff",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "tasks",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The duration of the last control loop.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 8,
-                "y": 7,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 24,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_serve_controller_control_loop_duration_s{}",
-                    "interval": "",
-                    "legendFormat": "Control Loop Duration",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Controller Control Loop Duration",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "seconds",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        },
-        {
-            "aliasColors": {},
-            "bars": false,
-            "dashLength": 10,
-            "dashes": false,
-            "datasource": "${datasource}",
-            "description": "The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.",
-            "fieldConfig": {
-                "defaults": {},
-                "overrides": []
-            },
-            "fill": 10,
-            "fillGradient": 0,
-            "gridPos": {
-                "x": 16,
-                "y": 7,
-                "w": 8,
-                "h": 8
-            },
-            "hiddenSeries": false,
-            "id": 25,
-            "legend": {
-                "alignAsTable": true,
-                "avg": false,
-                "current": true,
-                "hideEmpty": false,
-                "hideZero": true,
-                "max": false,
-                "min": false,
-                "rightSide": false,
-                "show": true,
-                "sort": "current",
-                "sortDesc": true,
-                "total": false,
-                "values": true
-            },
-            "lines": true,
-            "linewidth": 1,
-            "nullPointMode": "null",
-            "options": {
-                "alertThreshold": true
-            },
-            "percentage": false,
-            "pluginVersion": "7.5.17",
-            "pointradius": 2,
-            "points": false,
-            "renderer": "flot",
-            "seriesOverrides": [
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX",
-                    "dashes": true,
-                    "color": "#1F60C4",
-                    "fill": 0,
-                    "stack": false
-                },
-                {
-                    "$$hashKey": "object:78",
-                    "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
-                    "hiddenSeries": true
-                },
-                {
-                    "$$hashKey": "object:2987",
-                    "alias": "MAX + PENDING",
-                    "dashes": true,
-                    "color": "#777777",
-                    "fill": 0,
-                    "stack": false
-                }
-            ],
-            "spaceLength": 10,
-            "stack": true,
-            "steppedLine": false,
-            "targets": [
-                {
-                    "exemplar": true,
-                    "expr": "ray_serve_controller_num_control_loops{}",
-                    "interval": "",
-                    "legendFormat": "Control Loops",
-                    "queryType": "randomWalk",
-                    "refId": "A"
-                }
-            ],
-            "thresholds": [],
-            "timeFrom": null,
-            "timeRegions": [],
-            "timeShift": null,
-            "title": "Number of Control Loops",
-            "tooltip": {
-                "shared": true,
-                "sort": 0,
-                "value_type": "individual"
-            },
-            "type": "graph",
-            "xaxis": {
-                "buckets": null,
-                "mode": "time",
-                "name": null,
-                "show": true,
-                "values": []
-            },
-            "yaxes": [
-                {
-                    "$$hashKey": "object:628",
-                    "format": "loops",
-                    "label": "",
-                    "logBase": 1,
-                    "max": null,
-                    "min": "0",
-                    "show": true
-                },
-                {
-                    "$$hashKey": "object:629",
-                    "format": "short",
-                    "label": null,
-                    "logBase": 1,
-                    "max": null,
-                    "min": null,
-                    "show": true
-                }
-            ],
-            "yaxis": {
-                "align": false,
-                "alignLevel": null
-            }
-        }
-    ],
-    "refresh": false,
-    "schemaVersion": 27,
-    "style": "dark",
-    "tags": [
-        "rayVersion:2.24.0"
-    ],
-    "templating": {
-        "list": [
-            {
-                "current": {
-                    "selected": false
-                },
-                "description": "Filter queries of a specific Prometheus type.",
-                "hide": 2,
-                "includeAll": false,
-                "multi": false,
-                "name": "datasource",
-                "options": [],
-                "query": "prometheus",
-                "refresh": 1,
-                "regex": "",
-                "skipUrlSync": false,
-                "type": "datasource"
-            },
-            {
-                "allValue": ".*",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": null,
-                "multi": true,
-                "name": "Application",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_serve_deployment_replica_healthy{}, application)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            },
-            {
-                "allValue": ".*",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_serve_num_http_requests_total{}, route)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": "HTTP Route",
-                "multi": true,
-                "name": "HTTP_Route",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_serve_num_http_requests_total{}, route)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            },
-            {
-                "allValue": ".*",
-                "current": {
-                    "selected": true,
-                    "text": [
-                        "All"
-                    ],
-                    "value": [
-                        "$__all"
-                    ]
-                },
-                "datasource": "${datasource}",
-                "definition": "label_values(ray_serve_num_grpc_requests{}, method)",
-                "description": null,
-                "error": null,
-                "hide": 0,
-                "includeAll": true,
-                "label": "gRPC Service Method",
-                "multi": true,
-                "name": "gRPC_Method",
-                "options": [],
-                "query": {
-                    "query": "label_values(ray_serve_num_grpc_requests{}, method)",
-                    "refId": "Prometheus-Instance-Variable-Query"
-                },
-                "refresh": 2,
-                "regex": "",
-                "skipUrlSync": false,
-                "sort": 0,
-                "tagValuesQuery": "",
-                "tags": [],
-                "tagsQuery": "",
-                "type": "query",
-                "useTags": false
-            }
-        ]
-    },
-    "rayMeta": [
-        "excludesSystemRoutes",
-        "supportsGlobalFilterOverride"
-    ],
-    "time": {
-        "from": "now-30m",
-        "to": "now"
-    },
-    "timepicker": {},
-    "timezone": "",
-    "title": "Serve Dashboard",
-    "uid": "rayServeDashboard",
-    "version": 1
-}
diff --git a/ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml b/ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml
deleted file mode 100644
index dbda70c40..000000000
--- a/ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
-metadata:
-  name: ray-head-monitor
-  namespace: kube-prometheus-stack
-  labels:
-    # `release: $HELM_RELEASE`: Prometheus can only detect ServiceMonitor with this label.
-    release: kube-prometheus-stack
-spec:
-  jobLabel: ray-head
-  # Only select Kubernetes Services in the "default" namespace.
-  namespaceSelector:
-    matchNames:
-      - rayserve-vllm
-  # Only select Kubernetes Services with "matchLabels".
-  selector:
-    matchLabels:
-      ray.io/node-type: head
-  # A list of endpoints allowed as part of this ServiceMonitor.
-  endpoints:
-    - port: metrics
-    - port: as-metrics # autoscaler metrics
-    - port: dash-metrics # dashboard metrics
-  targetLabels:
-  - ray.io/cluster
diff --git a/ai-ml/jark-stack/terraform/outputs.tf b/ai-ml/jark-stack/terraform/outputs.tf
deleted file mode 100644
index 5771ae141..000000000
--- a/ai-ml/jark-stack/terraform/outputs.tf
+++ /dev/null
@@ -1,9 +0,0 @@
-output "configure_kubectl" {
-  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
-  value       = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}"
-}
-
-output "grafana_secret_name" {
-  description = "The name of the secret containing the Grafana admin password."
-  value       = aws_secretsmanager_secret.grafana.name
-}
diff --git a/ai-ml/jark-stack/terraform/versions.tf b/ai-ml/jark-stack/terraform/versions.tf
deleted file mode 100644
index e24e99c1f..000000000
--- a/ai-ml/jark-stack/terraform/versions.tf
+++ /dev/null
@@ -1,33 +0,0 @@
-terraform {
-  required_version = ">= 1.0.0"
-
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = ">= 3.72"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = ">= 2.10"
-    }
-    helm = {
-      source  = "hashicorp/helm"
-      version = ">= 2.4.1"
-    }
-    kubectl = {
-      source  = "gavinbunney/kubectl"
-      version = ">= 1.14"
-    }
-    random = {
-      source  = "hashicorp/random"
-      version = ">= 3.6.0" # Replace with the appropriate version of the random provider
-    }
-  }
-
-  # ##  Used for end-to-end testing on project; update to suit your needs
-  # backend "s3" {
-  #   bucket = "doeks-github-actions-e2e-test-state"
-  #   region = "us-west-2"
-  #   key    = "e2e/jark/terraform.tfstate"
-  # }
-}
diff --git a/ai-ml/jark-stack/terraform/vpc.tf b/ai-ml/jark-stack/terraform/vpc.tf
deleted file mode 100644
index 59c3da89c..000000000
--- a/ai-ml/jark-stack/terraform/vpc.tf
+++ /dev/null
@@ -1,53 +0,0 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
-#---------------------------------------------------------------
-# VPC
-#---------------------------------------------------------------
-# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts.
-# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements
-
-module "vpc" {
-  source  = "terraform-aws-modules/vpc/aws"
-  version = "~> 5.0"
-
-  name = local.name
-  cidr = var.vpc_cidr
-  azs  = local.azs
-
-  # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
-  secondary_cidr_blocks = var.secondary_cidr_blocks
-
-  # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc.
-  private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
-
-  # ------------------------------
-  # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
-  # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
-  public_subnets     = local.public_subnets
-  enable_nat_gateway = true
-  single_nat_gateway = true
-  #-------------------------------
-
-  public_subnet_tags = {
-    "kubernetes.io/role/elb" = 1
-  }
-
-  private_subnet_tags = {
-    "kubernetes.io/role/internal-elb" = 1
-    # Tags subnets for Karpenter auto-discovery
-    "karpenter.sh/discovery" = local.name
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf
deleted file mode 100644
index 5e3a7beb1..000000000
--- a/ai-ml/mlflow/addons.tf
+++ /dev/null
@@ -1,431 +0,0 @@
-#---------------------------------------------------------------
-# IRSA for EBS CSI Driver
-#---------------------------------------------------------------
-module "ebs_csi_driver_irsa" {
-  source                = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
-  version               = "~> 5.20"
-  role_name_prefix      = format("%s-%s-", local.name, "ebs-csi-driver")
-  attach_ebs_csi_policy = true
-  oidc_providers = {
-    main = {
-      provider_arn               = module.eks.oidc_provider_arn
-      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
-    }
-  }
-  tags = local.tags
-}
-#---------------------------------------------------------------
-# EKS Blueprints Kubernetes Addons
-#---------------------------------------------------------------
-module "eks_blueprints_addons" {
-  # Short commit hash from 8th May using git rev-parse --short HEAD
-  source  = "aws-ia/eks-blueprints-addons/aws"
-  version = "~> 1.3"
-
-  cluster_name      = module.eks.cluster_name
-  cluster_endpoint  = module.eks.cluster_endpoint
-  cluster_version   = module.eks.cluster_version
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------
-  # Amazon EKS Managed Add-ons
-  #---------------------------------------
-  eks_addons = {
-    aws-ebs-csi-driver = {
-      service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
-    }
-    coredns = {
-      preserve = true
-    }
-    vpc-cni = {
-      preserve = true
-    }
-    kube-proxy = {
-      preserve = true
-    }
-  }
-
-  #---------------------------------------------------------------
-  # CoreDNS Autoscaler helps to scale for large EKS Clusters
-  #   Further tuning for CoreDNS is to leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/
-  #---------------------------------------------------------------
-  enable_cluster_proportional_autoscaler = true
-  cluster_proportional_autoscaler = {
-    values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", {
-      target = "deployment/coredns"
-    })]
-    description = "Cluster Proportional Autoscaler for CoreDNS Service"
-  }
-
-  #---------------------------------------
-  # Metrics Server
-  #---------------------------------------
-  enable_metrics_server = true
-  metrics_server = {
-    values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Cluster Autoscaler
-  #---------------------------------------
-  enable_cluster_autoscaler = true
-  cluster_autoscaler = {
-    timeout = "300"
-    values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", {
-      aws_region     = var.region,
-      eks_cluster_id = module.eks.cluster_name
-    })]
-  }
-
-  #---------------------------------------
-  # AWS for FluentBit - DaemonSet
-  #---------------------------------------
-  enable_aws_for_fluentbit = true
-  aws_for_fluentbit_cw_log_group = {
-    use_name_prefix   = false
-    name              = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group
-    retention_in_days = 30
-  }
-  aws_for_fluentbit = {
-    s3_bucket_arns = [
-      module.fluentbit_s3_bucket.s3_bucket_arn,
-      "${module.fluentbit_s3_bucket.s3_bucket_arn}/*"
-    ]
-    values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", {
-      region               = local.region,
-      cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs"
-      s3_bucket_name       = module.fluentbit_s3_bucket.s3_bucket_id
-      cluster_name         = module.eks.cluster_name
-    })]
-  }
-
-  #---------------------------------------
-  # Karpenter Autoscaler for EKS Cluster
-  #---------------------------------------
-  enable_karpenter                  = true
-  karpenter_enable_spot_termination = true
-  karpenter_node = {
-    iam_role_additional_policies = {
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-  karpenter = {
-    chart_version       = "v0.34.0"
-    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-    repository_password = data.aws_ecrpublic_authorization_token.token.password
-  }
-
-  #---------------------------------------
-  # AWS Load Balancer  Controller
-  #---------------------------------------
-  enable_aws_load_balancer_controller = true
-  aws_load_balancer_controller = {
-    set = [{
-      name  = "enableServiceMutatorWebhook"
-      value = "false"
-    }]
-  }
-
-  #---------------------------------------
-  # Ingress Nginx Add-on
-  #---------------------------------------
-  enable_ingress_nginx = true
-  ingress_nginx = {
-    values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Prommetheus and Grafana stack
-  #---------------------------------------
-  #---------------------------------------------------------------
-  # Install Kafka Monitoring Stack with Prometheus and Grafana
-  # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
-  # 2- Grafana Admin user: admin
-  # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id <output.grafana_secret_name> --region $AWS_REGION --query "SecretString" --output text`
-  #---------------------------------------------------------------
-  enable_kube_prometheus_stack = true
-  kube_prometheus_stack = {
-    values = [
-      var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", {
-        region              = local.region
-        amp_sa              = local.amp_ingest_service_account
-        amp_irsa            = module.amp_ingest_irsa[0].iam_role_arn
-        amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write"
-        amp_url             = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}"
-        storage_class_type  = kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class.id
-      }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {})
-    ]
-    chart_version = "48.1.1"
-    set_sensitive = [
-      {
-        name  = "grafana.adminPassword"
-        value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
-      }
-    ],
-  }
-
-  tags = local.tags
-
-}
-
-
-#---------------------------------------------------------------
-# Data on EKS Kubernetes Addons
-#---------------------------------------------------------------
-module "eks_data_addons" {
-  source  = "aws-ia/eks-data-addons/aws"
-  version = "1.33.0" # ensure to update this to the latest/desired version
-
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------------------------------
-  # MLflow Tracking Add-on
-  #---------------------------------------------------------------
-
-  enable_mlflow_tracking = true
-  mlflow_tracking_helm_config = {
-    mlflow_namespace = try(kubernetes_namespace_v1.mlflow[0].metadata[0].name, local.mlflow_namespace)
-
-    values = [templatefile("${path.module}/helm-values/mlflow-tracking-values.yaml", {
-      mlflow_sa   = local.mlflow_service_account
-      mlflow_irsa = module.mlflow_irsa[0].iam_role_arn
-      # MLflow Postgres RDS Config
-      mlflow_db_username = local.mlflow_name
-      mlflow_db_password = try(sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string), "")
-      mlflow_db_name     = try(module.db[0].db_instance_name, "")
-      mlflow_db_host     = try(element(split(":", module.db[0].db_instance_endpoint), 0), "")
-      # S3 bucket config for artifacts
-      s3_bucket_name = try(module.mlflow_s3_bucket[0].s3_bucket_id, "")
-    })]
-  }
-
-  #---------------------------------------------------------------
-  # NVIDIA GPU Operator Add-on
-  #---------------------------------------------------------------
-  enable_nvidia_gpu_operator = true
-  nvidia_gpu_operator_helm_config = {
-    values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Deploying Karpenter resources(Nodepool and NodeClass) with Helm Chart
-  #---------------------------------------
-  enable_karpenter_resources = true
-  # We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
-  #   module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
-  karpenter_resources_helm_config = {
-    gpu-g5 = {
-      values = [
-        <<-EOT
-      name: gpu-g5
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        blockDevice:
-          deviceName: /dev/xvda
-          volumeSize: 500Gi
-          volumeType: gp3
-          encrypted: true
-          deleteOnTermination: true
-      nodePool:
-        labels:
-          - instanceType: gp5
-          - provisionerType: Karpenter
-        taints:
-          - key: nvidia.com/gpu
-            operator: "Exists"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["g5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["on-demand"]
-        limits:
-          cpu: 1000
-        amiFamily: Ubuntu
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 30s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    default = {
-      values = [
-        <<-EOT
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-          blockDevice:
-            deviceName: /dev/xvda
-            volumeSize: 200Gi
-            volumeType: gp3
-            encrypted: true
-            deleteOnTermination: true
-      nodePool:
-        labels:
-          - instanceType: mixed-x86
-          - provisionerType: Karpenter
-          - workload: mlflow
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["c5", "m5", "r5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 30s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-  }
-}
-
-#---------------------------------------------------------------
-# Ingress Nginx external security groups
-#---------------------------------------------------------------
-resource "aws_security_group" "ingress_nginx_external" {
-  name        = "ingress-nginx-external"
-  description = "Allow public HTTP and HTTPS traffic"
-  vpc_id      = module.vpc.vpc_id
-
-  ingress {
-    from_port   = 80
-    to_port     = 80
-    protocol    = "tcp"
-    cidr_blocks = ["0.0.0.0/0"] # modify to your requirements
-  }
-
-  ingress {
-    from_port   = 443
-    to_port     = 443
-    protocol    = "tcp"
-    cidr_blocks = ["0.0.0.0/0"] # modify to your requirements
-  }
-
-  egress {
-    from_port   = 0
-    to_port     = 0
-    protocol    = "-1"
-    cidr_blocks = ["0.0.0.0/0"]
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# Grafana Admin credentials resources
-#---------------------------------------------------------------
-data "aws_secretsmanager_secret_version" "admin_password_version" {
-  secret_id  = aws_secretsmanager_secret.grafana.id
-  depends_on = [aws_secretsmanager_secret_version.grafana]
-}
-
-resource "random_password" "grafana" {
-  length           = 16
-  special          = true
-  override_special = "@_"
-}
-
-#tfsec:ignore:aws-ssm-secret-use-customer-key
-resource "aws_secretsmanager_secret" "grafana" {
-  name                    = "${local.name}-grafana"
-  recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
-}
-
-resource "aws_secretsmanager_secret_version" "grafana" {
-  secret_id     = aws_secretsmanager_secret.grafana.id
-  secret_string = random_password.grafana.result
-}
-
-#---------------------------------------------------------------
-# S3 log bucket for FluentBit
-#---------------------------------------------------------------
-#tfsec:ignore:*
-module "fluentbit_s3_bucket" {
-  source  = "terraform-aws-modules/s3-bucket/aws"
-  version = "~> 3.0"
-
-  bucket_prefix = "${local.name}-fluentbit-logs-"
-  # For example only - please evaluate for your environment
-  force_destroy = true
-  server_side_encryption_configuration = {
-    rule = {
-      apply_server_side_encryption_by_default = {
-        sse_algorithm = "AES256"
-      }
-    }
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# GP3 Encrypted Storage Class
-#---------------------------------------------------------------
-
-resource "kubernetes_annotations" "gp2_default" {
-  annotations = {
-    "storageclass.kubernetes.io/is-default-class" : "false"
-  }
-  api_version = "storage.k8s.io/v1"
-  kind        = "StorageClass"
-  metadata {
-    name = "gp2"
-  }
-  force = true
-
-  depends_on = [module.eks]
-}
-
-resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" {
-  metadata {
-    name = "gp3"
-    annotations = {
-      "storageclass.kubernetes.io/is-default-class" : "true"
-    }
-  }
-
-  storage_provisioner    = "ebs.csi.aws.com"
-  reclaim_policy         = "Delete"
-  allow_volume_expansion = true
-  volume_binding_mode    = "WaitForFirstConsumer"
-  parameters = {
-    fsType    = "xfs"
-    encrypted = true
-    type      = "gp3"
-  }
-
-  depends_on = [kubernetes_annotations.gp2_default]
-}
diff --git a/ai-ml/mlflow/amp.tf b/ai-ml/mlflow/amp.tf
deleted file mode 100644
index 14b47ba4c..000000000
--- a/ai-ml/mlflow/amp.tf
+++ /dev/null
@@ -1,136 +0,0 @@
-#------------------------------------------
-# Amazon Prometheus
-#------------------------------------------
-locals {
-  amp_ingest_service_account = "amp-iamproxy-ingest-service-account"
-  amp_namespace              = "kube-prometheus-stack"
-}
-
-resource "aws_prometheus_workspace" "amp" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  alias = format("%s-%s", "amp-ws", local.name)
-  tags  = local.tags
-}
-#IAM Policy for Amazon Prometheus & Grafana
-resource "aws_iam_policy" "grafana" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  description = "IAM policy for Grafana Pod"
-  name_prefix = format("%s-%s-", local.name, "grafana")
-  path        = "/"
-  policy      = data.aws_iam_policy_document.grafana[0].json
-}
-
-data "aws_iam_policy_document" "grafana" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  statement {
-    sid       = "AllowReadingMetricsFromCloudWatch"
-    effect    = "Allow"
-    resources = ["*"]
-
-    actions = [
-      "cloudwatch:DescribeAlarmsForMetric",
-      "cloudwatch:ListMetrics",
-      "cloudwatch:GetMetricData",
-      "cloudwatch:GetMetricStatistics"
-    ]
-  }
-
-  statement {
-    sid       = "AllowGetInsightsCloudWatch"
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"]
-
-    actions = [
-      "cloudwatch:GetInsightRuleReport",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingAlarmHistoryFromCloudWatch"
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"]
-
-    actions = [
-      "cloudwatch:DescribeAlarmHistory",
-      "cloudwatch:DescribeAlarms",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingLogsFromCloudWatch"
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"]
-
-    actions = [
-      "logs:DescribeLogGroups",
-      "logs:GetLogGroupFields",
-      "logs:StartQuery",
-      "logs:StopQuery",
-      "logs:GetQueryResults",
-      "logs:GetLogEvents",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingTagsInstancesRegionsFromEC2"
-    effect    = "Allow"
-    resources = ["*"]
-
-    actions = [
-      "ec2:DescribeTags",
-      "ec2:DescribeInstances",
-      "ec2:DescribeRegions",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingResourcesForTags"
-    effect    = "Allow"
-    resources = ["*"]
-    actions   = ["tag:GetResources"]
-  }
-
-  statement {
-    sid    = "AllowListApsWorkspaces"
-    effect = "Allow"
-    resources = [
-      "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*",
-      "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*",
-      "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*",
-    ]
-    actions = [
-      "aps:ListWorkspaces",
-      "aps:DescribeWorkspace",
-      "aps:GetMetricMetadata",
-      "aps:GetSeries",
-      "aps:QueryMetrics",
-      "aps:RemoteWrite",
-      "aps:GetLabels"
-    ]
-  }
-}
-
-module "amp_ingest_irsa" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  source         = "aws-ia/eks-blueprints-addon/aws"
-  version        = "~> 1.0"
-  create_release = false
-  create_role    = true
-  create_policy  = false
-  role_name      = format("%s-%s", local.name, "amp-ingest")
-  role_policies  = { amp_policy = aws_iam_policy.grafana[0].arn }
-
-  oidc_providers = {
-    this = {
-      provider_arn    = module.eks.oidc_provider_arn
-      namespace       = local.amp_namespace
-      service_account = local.amp_ingest_service_account
-    }
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/mlflow/eks.tf b/ai-ml/mlflow/eks.tf
deleted file mode 100644
index 15fa077d1..000000000
--- a/ai-ml/mlflow/eks.tf
+++ /dev/null
@@ -1,118 +0,0 @@
-#---------------------------------------------------------------
-# EKS Cluster
-#---------------------------------------------------------------
-module "eks" {
-  source  = "terraform-aws-modules/eks/aws"
-  version = "~> 19.15"
-
-  cluster_name    = local.name
-  cluster_version = var.eks_cluster_version
-
-  cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint.
-
-  vpc_id = module.vpc.vpc_id
-
-  subnet_ids = module.vpc.private_subnets
-
-  manage_aws_auth_configmap = true
-  aws_auth_roles = [
-    # We need to add in the Karpenter node IAM role for nodes launched by Karpenter
-    {
-      rolearn  = module.eks_blueprints_addons.karpenter.node_iam_role_arn
-      username = "system:node:{{EC2PrivateDNSName}}"
-      groups = [
-        "system:bootstrappers",
-        "system:nodes",
-      ]
-    }
-  ]
-
-  #---------------------------------------
-  # Note: This can further restricted to specific required for each Add-on and your application
-  #---------------------------------------
-  # Extend cluster security group rules
-  cluster_security_group_additional_rules = {
-    ingress_nodes_ephemeral_ports_tcp = {
-      description                = "Nodes on ephemeral ports"
-      protocol                   = "tcp"
-      from_port                  = 1025
-      to_port                    = 65535
-      type                       = "ingress"
-      source_node_security_group = true
-    }
-  }
-
-  # Extend node-to-node security group rules
-  node_security_group_additional_rules = {
-    ingress_self_all = {
-      description = "Node to node all ports/protocols"
-      protocol    = "-1"
-      from_port   = 0
-      to_port     = 0
-      type        = "ingress"
-      self        = true
-    }
-    # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
-    # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
-    # Change this according to your security requirements if needed
-    ingress_cluster_to_node_all_traffic = {
-      description                   = "Cluster API to Nodegroup all traffic"
-      protocol                      = "-1"
-      from_port                     = 0
-      to_port                       = 0
-      type                          = "ingress"
-      source_cluster_security_group = true
-    }
-  }
-
-  eks_managed_node_group_defaults = {
-    iam_role_additional_policies = {
-      # Not required, but used in the example to access the nodes to inspect mounted volumes
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-
-  eks_managed_node_groups = {
-    #  We recommend to have a MNG to place your critical workloads and add-ons
-    #  Then rely on Karpenter to scale your workloads
-    #  You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners
-    core_node_group = {
-      name        = "core-node-group"
-      description = "EKS Core node group for hosting critical add-ons"
-      # Filtering only Secondary CIDR private subnets starting with "100.".
-      # Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-      )
-
-      min_size     = 3
-      max_size     = 9
-      desired_size = 3
-
-      instance_types = ["m5.xlarge"]
-
-      ebs_optimized = true
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 100
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      labels = {
-        Environment   = "preprod"
-        Zone          = "test"
-        WorkerType    = "ON_DEMAND"
-        NodeGroupType = "core"
-      }
-
-      tags = merge(local.tags, {
-        Name                     = "core-node-grp",
-        "karpenter.sh/discovery" = local.name
-      })
-    }
-  }
-}
diff --git a/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml b/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml
deleted file mode 100644
index 82a654554..000000000
--- a/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml
+++ /dev/null
@@ -1,102 +0,0 @@
-global:
-
-#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server
-# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata
-hostNetwork: true
-dnsPolicy: ClusterFirstWithHostNet
-
-service:
-  parsersFiles:
-    - /fluent-bit/parsers/parsers.conf
-  extraParsers: |
-    [PARSER]
-        Name    kubernetes
-        Format  regex
-        Regex   ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$
-
-input:
-  name: "tail"
-  enabled: true
-  tag: "systempods.<namespace_name>.<container_name>.<pod_name>.<docker_id>-"
-  path: "/var/log/containers/*.log"
-  db: "/var/log/flb_kube.db"
-  memBufLimit: 5MB
-  skipLongLines: "On"
-  refreshInterval: 10
-  extraInputs: |
-    multiline.parser  docker, cri
-    Tag_Regex         (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
-
-
-# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters
-filter:
-  name: "kubernetes"
-  match: "systempods.*"
-  kubeURL: "https://kubernetes.default.svc.cluster.local:443"
-  mergeLog: "On"
-  mergeLogKey: "log_processed"
-  keepLog: "On"
-  k8sLoggingParser: "On"
-  k8sLoggingExclude: "Off"
-  bufferSize: "0"
-  extraFilters: |
-    Kube_Tag_Prefix     systempods.
-    Regex_Parser        kubernetes
-    Labels              On
-    Annotations         Off
-    Use_Kubelet         true
-    Kubelet_Port        10250
-    Kube_CA_File        /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-    Kube_Token_File     /var/run/secrets/kubernetes.io/serviceaccount/token
-
-# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance.
-# cloudWatch:
-#   enabled: false
-
-# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
-cloudWatchLogs:
-  enabled: true
-  match: "systempods.*"
-  region: ${region}
-  logGroupName: ${cloudwatch_log_group}
-  autoCreateGroup: false
-  extraOutputs: |
-    log_key               log
-
-#----------------------------------------------------------#
-# OUTPUT logs to S3
-#----------------------------------------------------------#
-
-# This is an example for writing logs to S3 bucket.
-# This example writes system pod logs and spark logs into dedicated prefix.
-# This second output is using the rewrite_tag filter commented above
-
-additionalOutputs: |
-  [OUTPUT]
-      Name                            s3
-      Match                           systempods.*
-      region                          ${region}
-      bucket                          ${s3_bucket_name}
-      total_file_size                 100M
-      s3_key_format                   /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log
-      s3_key_format_tag_delimiters    ..
-      store_dir                       /home/ec2-user/buffer
-      upload_timeout                  10m
-      log_key                         log
-
-
-# Resource config for large clusters
-resources:
-  limits:
-    cpu: 1000m
-    memory: 1500Mi
-  requests:
-    cpu: 500m
-    memory: 500Mi
-
-## Assign a PriorityClassName to pods if set
-priorityClassName: system-node-critical
-
-# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints.
-tolerations:
-  - operator: Exists
diff --git a/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml
deleted file mode 100644
index 5a42794f2..000000000
--- a/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-autoDiscovery:
-  clusterName: ${eks_cluster_id}
-
-awsRegion: ${aws_region}
-
-cloudProvider: aws
-
-extraArgs:
-  aws-use-static-instance-list: true
-
-# Best practice to update the resource requests and limits for each add-on
-resources:
-   limits:
-     cpu: 1000m
-     memory: 1G
-   requests:
-     cpu: 200m
-     memory: 512Mi
-
-# Best practice to updateStrategy for each add-on
-updateStrategy:
-  type: RollingUpdate
-  rollingUpdate:
-    maxSurge: 0
-    maxUnavailable: 1
diff --git a/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml
deleted file mode 100644
index 64cb540bf..000000000
--- a/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-nameOverride: kube-dns-autoscaler
-
-# Formula for controlling the replicas. Adjust according to your needs
-#  replicas = max( ceil( cores * 1/coresPerReplica ) , ceil( nodes * 1/nodesPerReplica ) )
-#  replicas = min(replicas, max)
-#  replicas = max(replicas, min)
-config:
-  linear:
-    coresPerReplica: 256
-    nodesPerReplica: 16
-    min: 1
-    max: 100
-    preventSinglePointFailure: true
-    includeUnschedulableNodes: true
-
-# Target to scale. In format: deployment/*, replicationcontroller/* or replicaset/* (not case sensitive).
-options:
-  target: ${target}
-
-serviceAccount:
-  create: true
-  name: kube-dns-autoscaler
-
-podSecurityContext:
-  seccompProfile:
-    type: RuntimeDefault
-  supplementalGroups: [ 65534 ]
-  fsGroup: 65534
-
-resources:
-  limits:
-    cpu: 100m
-    memory: 128Mi
-  requests:
-    cpu: 100m
-    memory: 128Mi
-
-tolerations:
-  - key: "CriticalAddonsOnly"
-    operator: "Exists"
diff --git a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml
deleted file mode 100644
index c8b1a5d74..000000000
--- a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-controller:
-  service:
-    externalTrafficPolicy: "Local"
-    annotations:
-      service.beta.kubernetes.io/aws-load-balancer-type: external
-      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
-      service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC
-    targetPorts:
-      http: http
-      https: http
diff --git a/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml
deleted file mode 100644
index cc7687163..000000000
--- a/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-prometheus:
-  serviceAccount:
-    create: true
-    name: ${amp_sa}
-    annotations:
-      eks.amazonaws.com/role-arn: ${amp_irsa}
-  prometheusSpec:
-    remoteWrite:
-      - url: ${amp_remotewrite_url}
-        sigv4:
-          region: ${region}
-        queueConfig:
-          maxSamplesPerSend: 1000
-          maxShards: 200
-          capacity: 2500
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: ${storage_class_type}
-          accessModes:
-            - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-    # Scrape metrics for Yunikorn add-on
-    additionalScrapeConfigs:
-      - job_name: yunikorn
-        honor_labels: true
-        scrape_interval: 1m
-        scrape_timeout: 10s
-        metrics_path: /ws/v1//metrics
-        scheme: http
-        dns_sd_configs:
-          - names:
-              - yunikorn-service.yunikorn.svc
-            type: 'A'
-            port: 9080
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
-# Adding AMP datasource to Grafana config
-  serviceAccount:
-    create: false
-    name: ${amp_sa}
-  grafana.ini:
-    auth:
-      sigv4_auth_enabled: true
-  additionalDataSources:
-    - name: AMP
-      editable: true
-      jsonData:
-        sigV4Auth: true
-        sigV4Region: ${region}
-      type: prometheus
-      isDefault: false
-      url: ${amp_url}
diff --git a/ai-ml/mlflow/helm-values/kube-prometheus.yaml b/ai-ml/mlflow/helm-values/kube-prometheus.yaml
deleted file mode 100644
index dedff553b..000000000
--- a/ai-ml/mlflow/helm-values/kube-prometheus.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-prometheus:
-  prometheusSpec:
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: ${storage_class_type}
-          accessModes:
-            - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-    # Scrape metrics for Yunikorn add-on
-    additionalScrapeConfigs:
-      - job_name: yunikorn
-        honor_labels: true
-        scrape_interval: 1m
-        scrape_timeout: 10s
-        metrics_path: /ws/v1//metrics
-        scheme: http
-        dns_sd_configs:
-          - names:
-              - yunikorn-service.yunikorn.svc
-            type: 'A'
-            port: 9080
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
diff --git a/ai-ml/mlflow/helm-values/metrics-server-values.yaml b/ai-ml/mlflow/helm-values/metrics-server-values.yaml
deleted file mode 100644
index bc806ced6..000000000
--- a/ai-ml/mlflow/helm-values/metrics-server-values.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# HA config for metrics-server
-image:
-  repository: registry.k8s.io/metrics-server/metrics-server
-  pullPolicy: IfNotPresent
-
-serviceAccount:
-  create: true
-  name: metrics-server
-
-rbac:
-  create: true
-  pspEnabled: false
-
-apiService:
-  create: true
-
-podLabels:
-  k8s-app: metrics-server
-
-# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true
-replicas: 2
-
-updateStrategy:
-  type: RollingUpdate
-  rollingUpdate:
-    maxSurge: 0
-    maxUnavailable: 1
-
-podDisruptionBudget:
-  enabled: true
-  minAvailable: 1
-
-defaultArgs:
-  - --cert-dir=/tmp
-  - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
-  - --kubelet-use-node-status-port
-  - --metric-resolution=15s
-
-resources:
-  requests:
-    cpu: 200m
-    memory: 512Mi
-
-affinity:
-  podAntiAffinity:
-    requiredDuringSchedulingIgnoredDuringExecution:
-      - labelSelector:
-          matchLabels:
-            k8s-app: metrics-server
-        namespaces:
-          - kube-system
-        topologyKey: kubernetes.io/hostname
diff --git a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml
deleted file mode 100644
index 1f604f610..000000000
--- a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml
+++ /dev/null
@@ -1,88 +0,0 @@
-# Default values for mlflow-tracking-server.
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-image:
-  repository: public.ecr.aws/data-on-eks/mlflow
-  pullPolicy: Always
-  tag: 2.7.1
-
-imagePullSecrets: []
-
-nameOverride: mlflow-tracking-server
-
-fullnameOverride: mlflow-tracking-server
-
-podAnnotations: {}
-
-replicaCount: 1
-
-service:
-  type: ClusterIP
-  port: 5000
-
-serviceAccount:
-  # Specifies whether a service account should be created
-  create: false
-  # Annotations to add to the service account
-  annotations:
-    eks.amazonaws.com/role-arn: ${mlflow_irsa}
-  labels: {}
-  # The name of the service account to use.
-  # If not set and create is true, a name is generated using the fullname template
-  name: ${mlflow_sa}
-
-ingress:
-  enabled: true
-  className: nginx
-  annotations:
-    kubernetes.io/ingress.class: nginx
-    nginx.ingress.kubernetes.io/use-regex: "true"
-  hosts:
-    - host:
-      paths:
-        - path: /
-          pathType: Prefix
-  tls: []
-  #  - secretName: chart-example-tls
-  #    hosts:
-  #      - chart-example.local
-
-mlflow:
-  artifacts:
-    bucketName: ${s3_bucket_name}
-  database:
-    name: ${mlflow_db_name}
-    username: ${mlflow_db_username}
-    password: ${mlflow_db_password}
-    host: ${mlflow_db_host}
-    port: 5432
-
-podSecurityContext: {}
-  # fsGroup: 2000
-
-securityContext: {}
-  # capabilities:
-  #   drop:
-  #   - ALL
-  # readOnlyRootFilesystem: true
-  # runAsNonRoot: true
-  # runAsUser: 1000
-
-resources: {}
-  # We usually recommend not to specify default resources and to leave this as a conscious
-  # choice for the user. This also increases chances charts run on environments with little
-  # resources, such as Minikube. If you do want to specify resources, uncomment the following
-  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
-  # limits:
-  #   cpu: 100m
-  #   memory: 128Mi
-  # requests:
-  #   cpu: 100m
-  #   memory: 128Mi
-
-nodeSelector: {}
-
-tolerations: []
-
-affinity: {}
diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml
deleted file mode 100644
index 60078daa6..000000000
--- a/ai-ml/mlflow/helm-values/nvidia-values.yaml
+++ /dev/null
@@ -1,97 +0,0 @@
-# Default values for gpu-operator.
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-daemonsets:
-  labels: {}
-  annotations: {}
-  priorityClassName: system-node-critical
-  tolerations:
-    - key: nvidia.com/gpu
-      operator: Exists
-      effect: NoSchedule
-    - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes
-
-validator:
-  repository: nvcr.io/nvidia/cloud-native
-  image: gpu-operator-validator
-
-operator:
-  repository: nvcr.io/nvidia
-  priorityClassName: system-node-critical
-  defaultRuntime: containerd
-  image: gpu-operator
-  cleanupCRD: false # This option doesn't do anything even if you change this to true. NVIDIA recommends to use the manual approach of upgrading the CRDs
-  upgradeCRD: false
-  resources:
-    limits:
-      cpu: 500m
-      memory: 350Mi
-    requests:
-      cpu: 200m
-      memory: 100Mi
-
-mig:
-  strategy: single
-
-# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/install-precompiled-signed-drivers.html
-# Currently NVIDIA Operator takes more than 5 mins to make the node GPU ready with all the required drivers.
-# With pre-compiled NVIDIA Drivers this process can be faster hence we are using the config values as driver.version: "515-signed"
-driver:
-  enabled: true
-  repository: nvcr.io/nvidia
-  image: driver
-  # Commented this as latest Ubuntu AMIs are failing with this option enabled
-  # version: "515-signed" # supported DRIVER_BRANCH value currently are 470, 510 and 515 which will install latest drivers available on that branch for current running kernel version.
-  manager:
-    image: k8s-driver-manager
-    repository: nvcr.io/nvidia/cloud-native
-
-toolkit:
-  enabled: true
-
-devicePlugin:
-  enabled: true
-
-dcgm:
-  enabled: false
-
-dcgmExporter:
-  enabled: true
-
-gfd:
-  enabled: true
-
-migManager:
-  enabled: true
-
-nodeStatusExporter:
-  enabled: false
-
-gds:
-  enabled: false
-
-vgpuManager:
-  enabled: false
-
-vgpuDeviceManager:
-  enabled: true
-
-vfioManager:
-  enabled: true
-
-sandboxDevicePlugin:
-  enabled: true
-
-node-feature-discovery:
-  enableNodeFeatureApi: true
-  worker:
-    tolerations:
-      - key: "node-role.kubernetes.io/master"
-        operator: "Equal"
-        value: ""
-        effect: "NoSchedule"
-      - key: nvidia.com/gpu
-        operator: Exists
-        effect: NoSchedule
-      - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes
diff --git a/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml b/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml
deleted file mode 100644
index 73e3802df..000000000
--- a/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
----
-apiVersion: karpenter.sh/v1alpha5
-kind: Provisioner
-metadata:
-  name: default
-spec:
-  # Which AWS Node Template to pick
-  providerRef:
-    name: default
-
-  # ttlSecondsAfterEmpty: 30
-
-  # Requirements that constrain the parameters of provisioned nodes.
-  # These requirements are combined with pod.spec.affinity.nodeAffinity rules.
-  # Operators { In, not in } are supported to enable including or excluding values
-  requirements:
-    - key: "karpenter.k8s.aws/instance-category"
-      operator: In
-      values: ["c", "m", "r"]
-    - key: "karpenter.k8s.aws/instance-cpu"
-      operator: In
-      values: ["4", "8", "16", "32"]
-    - key: "kubernetes.io/arch"
-      operator: In
-      values: ["amd64"]
-    - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand
-      operator: In
-      values: ["on-demand", "spot"]
-  limits:
-    resources:
-      cpu: 20 # CPU Cores across all instances
-      memory: 2000Gi
-
-  # Enables consolidation which attempts to reduce cluster cost by both removing un-needed nodes and down-sizing those
-  # that can't be removed.  Mutually exclusive with the ttlSecondsAfterEmpty parameter.
-  consolidation:
-    enabled: true
----
-apiVersion: karpenter.k8s.aws/v1alpha1
-kind: AWSNodeTemplate
-metadata:
-  name: default
-spec:
-  subnetSelector:
-    Name: ${cluster_name}-private*     # Name of the Subnets to spin up the nodes
-  securityGroupSelector:                      # required, when not using launchTemplate
-    Name: ${cluster_name}-node*     # name of the SecurityGroup to be used with Nodes
-  blockDeviceMappings:
-    - deviceName: /dev/xvda
-      ebs:
-        volumeSize: 100Gi
-        volumeType: gp3
-        encrypted: true
-  tags:
-    managed-by: "karpenter"
-    intent: "apps"
-    Name: "karpenter-node-default"
diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf
deleted file mode 100644
index a5e4360ea..000000000
--- a/ai-ml/mlflow/main.tf
+++ /dev/null
@@ -1,65 +0,0 @@
-provider "aws" {
-  region = local.region
-}
-
-provider "kubernetes" {
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-# ECR always authenticates with `us-east-1` region
-# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
-provider "aws" {
-  alias  = "ecr"
-  region = "us-east-1"
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = module.eks.cluster_endpoint
-    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-    token                  = data.aws_eks_cluster_auth.this.token
-  }
-}
-
-provider "kubectl" {
-  apply_retry_count      = 10
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  load_config_file       = false
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-data "aws_availability_zones" "available" {}
-data "aws_caller_identity" "current" {}
-data "aws_partition" "current" {}
-
-data "aws_eks_cluster_auth" "this" {
-  name = module.eks.cluster_name
-}
-
-data "aws_ecrpublic_authorization_token" "token" {
-  provider = aws.ecr
-}
-
-#---------------------------------------------------------------
-# Local variables
-#---------------------------------------------------------------
-locals {
-  name       = var.name
-  region     = var.region
-  vpc_cidr   = var.vpc_cidr
-  azs        = slice(data.aws_availability_zones.available.names, 0, 2)
-  account_id = data.aws_caller_identity.current.account_id
-  partition  = data.aws_partition.current.partition
-
-  mlflow_name            = "mlflow"
-  mlflow_namespace       = "mlflow"
-  mlflow_service_account = "mlflow"
-
-  tags = {
-    Blueprint  = local.name
-    GithubRepo = "github.com/awslabs/data-on-eks"
-  }
-}
diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf
deleted file mode 100644
index c3ca28e76..000000000
--- a/ai-ml/mlflow/mlflow-core.tf
+++ /dev/null
@@ -1,245 +0,0 @@
-#---------------------------------------------------------------
-# RDS Postgres Database for MLflow Backend
-#---------------------------------------------------------------
-module "db" {
-  count   = var.enable_mlflow_tracking ? 1 : 0
-  source  = "terraform-aws-modules/rds/aws"
-  version = "~> 5.0"
-
-  identifier = local.mlflow_name
-
-  engine               = "postgres"
-  engine_version       = "14.3"
-  family               = "postgres14"
-  major_engine_version = "14"
-  instance_class       = "db.m6i.xlarge"
-
-  storage_type      = "io1"
-  allocated_storage = 100
-  iops              = 3000
-
-  db_name                = local.mlflow_name
-  username               = local.mlflow_name
-  create_random_password = false
-  password               = sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string)
-  port                   = 5432
-
-  multi_az               = true
-  db_subnet_group_name   = module.vpc.database_subnet_group
-  vpc_security_group_ids = [module.security_group[0].security_group_id]
-
-  maintenance_window              = "Mon:00:00-Mon:03:00"
-  backup_window                   = "03:00-06:00"
-  enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"]
-  create_cloudwatch_log_group     = true
-
-  backup_retention_period = 5
-  skip_final_snapshot     = true
-  deletion_protection     = false
-
-  performance_insights_enabled          = true
-  performance_insights_retention_period = 7
-  create_monitoring_role                = true
-  monitoring_interval                   = 60
-  monitoring_role_name                  = "mlflow-backend"
-  monitoring_role_use_name_prefix       = true
-  monitoring_role_description           = "MLflow Postgres Backend for monitoring role"
-
-  parameters = [
-    {
-      name  = "autovacuum"
-      value = 1
-    },
-    {
-      name  = "client_encoding"
-      value = "utf8"
-    }
-  ]
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# MLflow Postgres Backend DB Master password
-#---------------------------------------------------------------
-resource "random_password" "postgres" {
-  count   = var.enable_mlflow_tracking ? 1 : 0
-  length  = 16
-  special = false
-}
-#tfsec:ignore:aws-ssm-secret-use-customer-key
-resource "aws_secretsmanager_secret" "postgres" {
-  count                   = var.enable_mlflow_tracking ? 1 : 0
-  name                    = local.mlflow_name
-  recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
-}
-
-resource "aws_secretsmanager_secret_version" "postgres" {
-  count         = var.enable_mlflow_tracking ? 1 : 0
-  secret_id     = aws_secretsmanager_secret.postgres[0].id
-  secret_string = random_password.postgres[0].result
-}
-
-#---------------------------------------------------------------
-# PostgreSQL RDS security group
-#---------------------------------------------------------------
-module "security_group" {
-  count   = var.enable_mlflow_tracking ? 1 : 0
-  source  = "terraform-aws-modules/security-group/aws"
-  version = "~> 5.0"
-
-  name        = local.name
-  description = "Complete PostgreSQL example security group"
-  vpc_id      = module.vpc.vpc_id
-
-  # ingress
-  ingress_with_cidr_blocks = [
-    {
-      from_port   = 5432
-      to_port     = 5432
-      protocol    = "tcp"
-      description = "PostgreSQL access from within VPC"
-      cidr_blocks = "${module.vpc.vpc_cidr_block},${module.vpc.vpc_secondary_cidr_blocks[0]}"
-    },
-  ]
-
-  tags = local.tags
-}
-
-
-#---------------------------------------------------------------
-# S3 bucket for MLflow artifacts
-#---------------------------------------------------------------
-
-#tfsec:ignore:*
-module "mlflow_s3_bucket" {
-  count   = var.enable_mlflow_tracking ? 1 : 0
-  source  = "terraform-aws-modules/s3-bucket/aws"
-  version = "~> 3.0"
-
-  bucket_prefix = "${local.name}-artifacts-"
-
-  # For example only - please evaluate for your environment
-  force_destroy = true
-
-  server_side_encryption_configuration = {
-    rule = {
-      apply_server_side_encryption_by_default = {
-        sse_algorithm = "AES256"
-      }
-    }
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# MLflow Namespace
-#---------------------------------------------------------------
-resource "kubernetes_namespace_v1" "mlflow" {
-  count = var.enable_mlflow_tracking ? 1 : 0
-  metadata {
-    name = local.mlflow_namespace
-  }
-  timeouts {
-    delete = "15m"
-  }
-}
-
-resource "kubernetes_service_account_v1" "mlflow" {
-  count = var.enable_mlflow_tracking ? 1 : 0
-  metadata {
-    name        = local.mlflow_service_account
-    namespace   = kubernetes_namespace_v1.mlflow[0].metadata[0].name
-    annotations = { "eks.amazonaws.com/role-arn" : module.mlflow_irsa[0].iam_role_arn }
-  }
-
-  automount_service_account_token = true
-}
-
-resource "kubernetes_secret_v1" "mlflow" {
-  count = var.enable_mlflow_tracking ? 1 : 0
-  metadata {
-    name      = "${local.mlflow_service_account}-secret"
-    namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name
-    annotations = {
-      "kubernetes.io/service-account.name"      = kubernetes_service_account_v1.mlflow[0].metadata[0].name
-      "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.mlflow[0].metadata[0].name
-    }
-  }
-
-  type = "kubernetes.io/service-account-token"
-}
-
-# Create IAM Role for Service Account (IRSA) Only if MLflow is enabled
-module "mlflow_irsa" {
-  count = var.enable_mlflow_tracking ? 1 : 0
-
-  source  = "aws-ia/eks-blueprints-addon/aws"
-  version = "~> 1.0" #ensure to update this to the latest/desired version
-
-  # Disable helm release
-  create_release = false
-
-  # IAM role for service account (IRSA)
-  create_role   = true
-  create_policy = false # Policy is created in the next resource
-
-  role_name     = local.mlflow_service_account
-  role_policies = { mlflow_policy = aws_iam_policy.mlflow[0].arn }
-
-  oidc_providers = {
-    this = {
-      provider_arn    = module.eks.oidc_provider_arn
-      namespace       = kubernetes_namespace_v1.mlflow[0].metadata[0].name
-      service_account = local.mlflow_service_account
-    }
-  }
-
-  tags = local.tags
-}
-
-#--------------------------------------------------------------------------
-# IAM policy for MLflow for accessing S3 artifacts and RDS Postgres backend
-#--------------------------------------------------------------------------
-resource "aws_iam_policy" "mlflow" {
-  count = var.enable_mlflow_tracking ? 1 : 0
-
-  description = "IAM policy for MLflow"
-  name_prefix = format("%s-%s-", local.name, "mlflow")
-  path        = "/"
-  policy      = data.aws_iam_policy_document.mlflow[0].json
-}
-
-data "aws_iam_policy_document" "mlflow" {
-  count = var.enable_mlflow_tracking ? 1 : 0
-  statement {
-    sid       = ""
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}"]
-
-    actions = [
-      "s3:ListBucket"
-    ]
-  }
-  statement {
-    sid       = ""
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}/*"]
-
-    actions = [
-      "s3:GetObject",
-      "s3:PutObject",
-      "s3:DeleteObject"
-    ]
-  }
-  statement {
-    sid       = ""
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"]
-
-    actions = [
-      "rds-db:connect",
-    ]
-  }
-}
diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf
deleted file mode 100644
index b5db71900..000000000
--- a/ai-ml/mlflow/outputs.tf
+++ /dev/null
@@ -1,24 +0,0 @@
-output "configure_kubectl" {
-  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
-  value       = "aws eks --region ${local.region} update-kubeconfig --alias ${module.eks.cluster_name} --name ${module.eks.cluster_name}"
-}
-
-output "eks_api_server_url" {
-  description = "Your eks API server endpoint"
-  value       = module.eks.cluster_endpoint
-}
-
-output "grafana_secret_name" {
-  description = "Grafana password secret name"
-  value       = aws_secretsmanager_secret.grafana.name
-}
-
-output "mlflow_s3_artifacts" {
-  description = "S3 bucket for MLflow artifacts"
-  value       = module.mlflow_s3_bucket[0].s3_bucket_id
-}
-
-output "mlflow_db_backend" {
-  description = "Amazon RDS Postgres database for MLflow backend"
-  value       = module.db[0].db_instance_endpoint
-}
diff --git a/ai-ml/mlflow/versions.tf b/ai-ml/mlflow/versions.tf
deleted file mode 100644
index 156fc1e49..000000000
--- a/ai-ml/mlflow/versions.tf
+++ /dev/null
@@ -1,33 +0,0 @@
-terraform {
-  required_version = ">= 1.0.0"
-
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = ">= 3.72"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = ">= 2.10"
-    }
-    helm = {
-      source  = "hashicorp/helm"
-      version = ">= 2.4.1"
-    }
-    random = {
-      source  = "hashicorp/random"
-      version = "3.3.2"
-    }
-    kubectl = {
-      source  = "gavinbunney/kubectl"
-      version = ">= 1.14"
-    }
-  }
-
-  # ##  Used for end-to-end testing on project; update to suit your needs
-  # backend "s3" {
-  #   bucket = "doeks-github-actions-e2e-test-state"
-  #   region = "us-west-2"
-  #   key    = "e2e/mlflow/terraform.tfstate"
-  # }
-}
diff --git a/ai-ml/mlflow/vpc.tf b/ai-ml/mlflow/vpc.tf
deleted file mode 100644
index 0aa8b7aab..000000000
--- a/ai-ml/mlflow/vpc.tf
+++ /dev/null
@@ -1,59 +0,0 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-
-  database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
-#---------------------------------------------------------------
-# VPC
-#---------------------------------------------------------------
-
-module "vpc" {
-  source  = "terraform-aws-modules/vpc/aws"
-  version = "~> 5.0"
-
-  name = local.name
-  cidr = local.vpc_cidr
-  azs  = local.azs
-
-  # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
-  secondary_cidr_blocks = var.secondary_cidr_blocks
-
-  # Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB
-  private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
-
-  # ------------------------------
-  # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
-  # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
-  public_subnets = local.public_subnets
-
-  # ------------------------------
-  # Private Subnets for MLflow backend store
-  database_subnets                   = local.database_private_subnets
-  create_database_subnet_group       = true
-  create_database_subnet_route_table = true
-
-  enable_nat_gateway   = true
-  single_nat_gateway   = true
-  enable_dns_hostnames = true
-
-  public_subnet_tags = {
-    "kubernetes.io/role/elb" = 1
-  }
-
-  private_subnet_tags = {
-    "kubernetes.io/role/internal-elb" = 1
-    # Tags subnets for Karpenter auto-discovery
-    "karpenter.sh/discovery" = local.name
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf
deleted file mode 100644
index 8d15a83bb..000000000
--- a/ai-ml/trainium-inferentia/addons.tf
+++ /dev/null
@@ -1,536 +0,0 @@
-#---------------------------------------------------------------
-# GP3 Encrypted Storage Class
-#---------------------------------------------------------------
-resource "kubernetes_annotations" "disable_gp2" {
-  annotations = {
-    "storageclass.kubernetes.io/is-default-class" : "false"
-  }
-  api_version = "storage.k8s.io/v1"
-  kind        = "StorageClass"
-  metadata {
-    name = "gp2"
-  }
-  force = true
-
-  depends_on = [module.eks.eks_cluster_id]
-}
-
-resource "kubernetes_storage_class_v1" "default_gp3" {
-  metadata {
-    name = "gp3"
-    annotations = {
-      "storageclass.kubernetes.io/is-default-class" : "true"
-    }
-  }
-
-  storage_provisioner    = "ebs.csi.aws.com"
-  reclaim_policy         = "Delete"
-  allow_volume_expansion = true
-  volume_binding_mode    = "WaitForFirstConsumer"
-  parameters = {
-    fsType    = "xfs"
-    encrypted = true
-    type      = "gp3"
-  }
-
-  depends_on = [kubernetes_annotations.disable_gp2]
-}
-
-#---------------------------------------------------------------
-# EKS Pod Identity association
-#---------------------------------------------------------------
-
-module "aws_ebs_csi_pod_identity" {
-  source  = "terraform-aws-modules/eks-pod-identity/aws"
-  version = "~> 1.4.0"
-
-  name                      = "aws-ebs-csi"
-  attach_aws_ebs_csi_policy = true
-
-  # Pod Identity Associations
-  associations = {
-    ebs-csi-controller = {
-      namespace       = "kube-system"
-      service_account = "ebs-csi-controller-sa"
-      cluster_name    = module.eks.cluster_name
-    }
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# EKS Blueprints Addons
-#---------------------------------------------------------------
-module "eks_blueprints_addons" {
-  source  = "aws-ia/eks-blueprints-addons/aws"
-  version = "~> 1.16"
-
-  cluster_name      = module.eks.cluster_name
-  cluster_endpoint  = module.eks.cluster_endpoint
-  cluster_version   = module.eks.cluster_version
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------
-  # Amazon EKS Managed Add-ons
-  #---------------------------------------
-  eks_addons = {
-    aws-ebs-csi-driver     = {}
-    coredns                = {}
-    eks-pod-identity-agent = {}
-    kube-proxy             = {}
-    vpc-cni                = {}
-    amazon-cloudwatch-observability = {
-      preserve                 = true
-      service_account_role_arn = aws_iam_role.cloudwatch_observability_role.arn
-    }
-  }
-
-  #---------------------------------------
-  # Kubernetes Add-ons
-  #---------------------------------------
-
-  #---------------------------------------
-  # Metrics Server
-  #---------------------------------------
-  enable_metrics_server = true
-  metrics_server = {
-    values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Cluster Autoscaler
-  #---------------------------------------
-  enable_cluster_autoscaler = true
-  cluster_autoscaler = {
-    values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Enable FSx for Lustre CSI Driver
-  #---------------------------------------
-  enable_aws_fsx_csi_driver = var.enable_fsx_for_lustre
-  aws_fsx_csi_driver = {
-    # INFO: fsx node daemonset won't be placed on Karpenter nodes with taints without the following toleration
-    values = [
-      <<-EOT
-        node:
-          tolerations:
-            - operator: Exists
-      EOT
-    ]
-  }
-
-  #---------------------------------------
-  # AWS for FluentBit - DaemonSet
-  #---------------------------------------
-  enable_aws_for_fluentbit = true
-  aws_for_fluentbit_cw_log_group = {
-    use_name_prefix   = false
-    name              = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group
-    retention_in_days = 30
-  }
-  aws_for_fluentbit = {
-    s3_bucket_arns = [
-      module.s3_bucket.s3_bucket_arn,
-      "${module.s3_bucket.s3_bucket_arn}/*"
-    ]
-    values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", {
-      region               = local.region,
-      cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs"
-      s3_bucket_name       = module.s3_bucket.s3_bucket_id
-      cluster_name         = module.eks.cluster_name
-    })]
-  }
-
-  #---------------------------------------
-  # Prometheus and Grafana stack
-  #---------------------------------------
-  #---------------------------------------------------------------
-  # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
-  # 2- Grafana Admin user: admin
-  # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id <grafana-secret-name> --region $AWS_REGION --query "SecretString" --output text` (the secret name starts with "${local.name}-oss-grafana")
-  #---------------------------------------------------------------
-  enable_kube_prometheus_stack = true
-  kube_prometheus_stack = {
-    values = [templatefile("${path.module}/helm-values/kube-prometheus.yaml", {
-      storage_class_type = kubernetes_storage_class_v1.default_gp3.id
-      })
-    ]
-    chart_version = "48.1.1"
-    set_sensitive = [
-      {
-        name  = "grafana.adminPassword"
-        value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
-      }
-    ],
-  }
-
-  #---------------------------------------
-  # AWS Load Balancer Controller Add-on
-  #---------------------------------------
-  enable_aws_load_balancer_controller = true
-  # turn off the mutating webhook for services because we are using
-  # service.beta.kubernetes.io/aws-load-balancer-type: external
-  aws_load_balancer_controller = {
-    set = [{
-      name  = "enableServiceMutatorWebhook"
-      value = "false"
-    }]
-  }
-
-  #---------------------------------------
-  # Ingress Nginx Add-on
-  #---------------------------------------
-  enable_ingress_nginx = true
-  ingress_nginx = {
-    values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})]
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# Data on EKS Kubernetes Addons
-#---------------------------------------------------------------
-module "eks_data_addons" {
-  source  = "aws-ia/eks-data-addons/aws"
-  version = "1.35.0" # ensure to update this to the latest/desired version
-
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  enable_aws_neuron_device_plugin = true
-
-  aws_neuron_device_plugin_helm_config = {
-    # Enable default scheduler
-    values = [
-      <<-EOT
-      devicePlugin:
-        tolerations:
-        - key: CriticalAddonsOnly
-          operator: Exists
-        - key: aws.amazon.com/neuron
-          operator: Exists
-          effect: NoSchedule
-        - key: hub.jupyter.org/dedicated
-          operator: Exists
-          effect: NoSchedule
-      scheduler:
-        enabled: true
-      npd:
-        enabled: false
-      EOT
-    ]
-  }
-
-  enable_aws_efa_k8s_device_plugin = true
-
-  aws_efa_k8s_device_plugin_helm_config = {
-    version = "v0.5.3"
-  }
-
-  #---------------------------------------
-  # Volcano Scheduler for TorchX used in BERT-Large distributed training example
-  # Volcano is also a default scheduler for KubeRay Operator
-  #---------------------------------------
-  enable_volcano = var.enable_volcano
-
-  #---------------------------------------
-  # Kuberay Operator
-  #---------------------------------------
-  enable_kuberay_operator = var.enable_kuberay_operator
-  kuberay_operator_helm_config = {
-    version = "1.1.1"
-    # Enabling Volcano as Batch scheduler for KubeRay Operator
-    values = [
-      <<-EOT
-      batchScheduler:
-        enabled: ${var.enable_volcano}
-    EOT
-    ]
-  }
-
-  #---------------------------------------
-  # JupyterHub Addon
-  #---------------------------------------
-  enable_jupyterhub = var.enable_jupyterhub
-  jupyterhub_helm_config = {
-    values = [
-      templatefile("${path.module}/helm-values/jupyterhub-values.yaml", {
-        jupyter_single_user_sa_name = "${module.eks.cluster_name}-jupyterhub-single-user"
-      })
-    ]
-  }
-
-  #---------------------------------------
-  # Deploying Karpenter resources(Nodepool and NodeClass) with Helm Chart
-  #---------------------------------------
-  enable_karpenter_resources = true
-  # We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
-  #   module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
-  karpenter_resources_helm_config = {
-    trainium-trn1 = {
-      values = [
-        <<-EOT
-      name: trainium-trn1
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${module.karpenter.node_iam_role_name}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          id: ${module.eks.node_security_group_id}
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        blockDevice:
-          deviceName: /dev/xvda
-          volumeSize: 500Gi
-          volumeType: gp3
-          encrypted: true
-          deleteOnTermination: true
-        amiSelectorTerms:
-          - alias: al2023@v20241024
-      nodePool:
-        labels:
-          - instanceType: trainium-trn1
-          - provisionerType: Karpenter
-          - hub.jupyter.org/node-purpose: user
-          - karpenterVersion: ${resource.helm_release.karpenter.version}
-        taints:
-          - key: aws.amazon.com/neuron
-            value: "true"
-            effect: "NoSchedule"
-          - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["trn1"]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 300s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    inferentia-inf2 = {
-      values = [
-        <<-EOT
-      name: inferentia-inf2
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${module.karpenter.node_iam_role_name}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          id: ${module.eks.node_security_group_id}
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        blockDevice:
-          deviceName: /dev/xvda
-          volumeSize: 500Gi
-          volumeType: gp3
-          encrypted: true
-          deleteOnTermination: true
-        amiSelectorTerms:
-          - alias: al2023@v20241024
-      nodePool:
-        labels:
-          - instanceType: inferentia-inf2
-          - provisionerType: Karpenter
-          - hub.jupyter.org/node-purpose: user
-          - karpenterVersion: ${resource.helm_release.karpenter.version}
-        taints:
-          - key: aws.amazon.com/neuron
-            value: "true"
-            effect: "NoSchedule"
-          - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["inf2"]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: [ "on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 300s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    default = {
-      values = [
-        <<-EOT
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${module.karpenter.node_iam_role_name}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          id: ${module.eks.node_security_group_id}
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        blockDevice:
-          deviceName: /dev/xvda
-          volumeSize: 200Gi
-          volumeType: gp3
-          encrypted: true
-          deleteOnTermination: true
-        amiSelectorTerms:
-          - alias: al2023@v20241024
-      nodePool:
-        labels:
-          - instanceType: mixed-x86
-          - provisionerType: Karpenter
-          - workload: rayhead
-          - karpenterVersion: ${resource.helm_release.karpenter.version}
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["c5", "m5", "r5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["spot", "on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 300s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-  }
-}
-
-#---------------------------------------------------------------
-# IAM Role for Amazon CloudWatch Observability
-#---------------------------------------------------------------
-resource "aws_iam_role" "cloudwatch_observability_role" {
-  name_prefix = format("%s-%s", local.name, "cloudwatch-agent")
-  description = "The IAM role for amazon-cloudwatch-observability addon"
-
-  assume_role_policy = jsonencode({
-    Version = "2012-10-17"
-    Statement = [
-      {
-        Action = "sts:AssumeRoleWithWebIdentity"
-        Effect = "Allow"
-        Principal = {
-          Federated = module.eks.oidc_provider_arn
-        }
-        Condition = {
-          StringEquals = {
-            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" : "system:serviceaccount:amazon-cloudwatch:cloudwatch-agent",
-            "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" : "sts.amazonaws.com"
-          }
-        }
-      }
-    ]
-  })
-}
-
-resource "aws_iam_role_policy_attachment" "cloudwatch_observability_policy_attachment" {
-  policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
-  role       = aws_iam_role.cloudwatch_observability_role.name
-}
-
-#---------------------------------------------------------------
-# ETCD for TorchX
-#---------------------------------------------------------------
-data "http" "torchx_etcd_yaml" {
-  url = "https://raw.githubusercontent.com/pytorch/torchx/main/resources/etcd.yaml"
-}
-
-data "kubectl_file_documents" "torchx_etcd_yaml" {
-  content = data.http.torchx_etcd_yaml.response_body
-}
-
-resource "kubectl_manifest" "torchx_etcd" {
-  for_each   = var.enable_torchx_etcd ? data.kubectl_file_documents.torchx_etcd_yaml.manifests : {}
-  yaml_body  = each.value
-  depends_on = [module.eks.eks_cluster_id]
-}
-
-#---------------------------------------------------------------
-# Grafana Admin credentials resources
-# Log in to AWS Secrets Manager with the same role as Terraform to extract the Grafana admin password from the secret whose name starts with "${local.name}-oss-grafana"
-#---------------------------------------------------------------
-data "aws_secretsmanager_secret_version" "admin_password_version" {
-  secret_id  = aws_secretsmanager_secret.grafana.id
-  depends_on = [aws_secretsmanager_secret_version.grafana]
-}
-
-resource "random_password" "grafana" {
-  length           = 16
-  special          = true
-  override_special = "@_"
-}
-
-#tfsec:ignore:aws-ssm-secret-use-customer-key
-resource "aws_secretsmanager_secret" "grafana" {
-  name_prefix             = "${local.name}-oss-grafana"
-  recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
-}
-
-resource "aws_secretsmanager_secret_version" "grafana" {
-  secret_id     = aws_secretsmanager_secret.grafana.id
-  secret_string = random_password.grafana.result
-}
-
-#tfsec:ignore:*
-module "s3_bucket" {
-  source  = "terraform-aws-modules/s3-bucket/aws"
-  version = "~> 3.0"
-
-  bucket_prefix = "${local.name}-logs-"
-  # For example only - please evaluate for your environment
-  force_destroy = true
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# MPI Operator for distributed training on Trainium
-#---------------------------------------------------------------
-data "http" "mpi_operator_yaml" {
-  url = "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.4.0/deploy/v2beta1/mpi-operator.yaml"
-}
-
-data "kubectl_file_documents" "mpi_operator_yaml" {
-  content = data.http.mpi_operator_yaml.response_body
-}
-
-resource "kubectl_manifest" "mpi_operator" {
-  for_each   = var.enable_mpi_operator ? data.kubectl_file_documents.mpi_operator_yaml.manifests : {}
-  yaml_body  = each.value
-  depends_on = [module.eks.eks_cluster_id]
-}
diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf
deleted file mode 100644
index 642fd472b..000000000
--- a/ai-ml/trainium-inferentia/eks.tf
+++ /dev/null
@@ -1,410 +0,0 @@
-#---------------------------------------------------------------
-# EKS Cluster
-#---------------------------------------------------------------
-module "eks" {
-  source  = "terraform-aws-modules/eks/aws"
-  version = "~> 20.17"
-
-  cluster_name    = local.name
-  cluster_version = var.eks_cluster_version
-
-  cluster_endpoint_public_access = true
-
-  enable_efa_support = true
-
-  # Gives Terraform identity admin access to cluster which will
-  # allow deploying resources (Karpenter) into the cluster
-  enable_cluster_creator_admin_permissions = true
-
-  access_entries = var.access_entries
-
-  vpc_id = module.vpc.vpc_id
-  # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created
-  subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-  substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
-
-  # Combine root account, current user/role and additional roles to be able to access the cluster KMS key - required for terraform updates
-  kms_key_administrators = distinct(concat([
-    "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"],
-    var.kms_key_admin_roles,
-    [data.aws_iam_session_context.current.issuer_arn]
-  ))
-
-  #---------------------------------------
-  # Note: This can be further restricted to the specific ports required by each add-on and your application
-  #---------------------------------------
-  # Extend cluster security group rules
-  cluster_security_group_additional_rules = {
-    ingress_nodes_ephemeral_ports_tcp = {
-      description                = "Nodes on ephemeral ports"
-      protocol                   = "tcp"
-      from_port                  = 0
-      to_port                    = 65535
-      type                       = "ingress"
-      source_node_security_group = true
-    }
-  }
-
-  # Extend node security group rules
-  node_security_group_additional_rules = {
-    # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
-    # This can be restricted further to specific port based on the requirement for each Add-on e.g., coreDNS 53, metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
-    # Update this according to your security requirements if needed
-    ingress_cluster_to_node_all_traffic = {
-      description                   = "Cluster API to Nodegroup all traffic"
-      protocol                      = "-1"
-      from_port                     = 0
-      to_port                       = 0
-      type                          = "ingress"
-      source_cluster_security_group = true
-    }
-  }
-
-  eks_managed_node_group_defaults = {
-    iam_role_additional_policies = {
-      # Not required, but used in the example to access the nodes to inspect mounted volumes
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-
-    ebs_optimized = true
-    # This block device is used only for root volume. Adjust volume according to your size.
-    # NOTE: Don't use this volume for ML workloads
-    block_device_mappings = {
-      xvda = {
-        device_name = "/dev/xvda"
-        ebs = {
-          volume_size = 100
-          volume_type = "gp3"
-        }
-      }
-    }
-  }
-
-  eks_managed_node_groups = {
-    #  It's recommended to have a Managed Node group for hosting critical add-ons
-    #  It's recommended to use Karpenter to place your workloads instead of using Managed Node groups
-    #  You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
-    core_node_group = {
-      name        = "core-node-group"
-      description = "EKS Core node group for hosting system add-ons"
-      # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-        substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-      )
-
-      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2
-      ami_type     = "AL2_x86_64" # Use this for Graviton AL2_ARM_64
-      min_size     = 3
-      max_size     = 8
-      desired_size = 3
-
-      instance_types = ["m5.2xlarge"]
-
-      labels = {
-        WorkerType    = "ON_DEMAND"
-        NodeGroupType = "core"
-        workload      = "rayhead"
-      }
-
-      tags = merge(local.tags, {
-        Name = "core-node-grp"
-      })
-    }
-
-    #--------------------------------------------------
-    # Trainium node group for Trn1.32xlarge
-    #--------------------------------------------------
-    # Trainium node group creation can take up to 6 mins
-    trn1-32xl-ng1 = {
-      name        = "trn1-32xl-ng1"
-      description = "Tran1 32xlarge node group for hosting ML workloads"
-      # All trn1 instances should be launched into the same subnet in the preferred trn1 AZ
-      # The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf.
-      # We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
-      #   module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
-      subnet_ids = [module.vpc.private_subnets[2]]
-      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2
-      # ami_id   = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type
-      ami_type       = "AL2_x86_64_GPU" # Contains Neuron driver
-      instance_types = ["trn1.32xlarge"]
-
-      pre_bootstrap_user_data = <<-EOT
-        # Mount instance store volumes in RAID-0 for kubelet and containerd
-        # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
-        /bin/setup-local-disks raid0
-
-        # Install Neuron monitoring tools
-        yum install aws-neuronx-tools-2.* -y
-        export PATH=/opt/aws/neuron/bin:$PATH
-
-        # Install latest version of aws cli
-        mkdir /awscli \
-        && wget https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O /awscli/awscliv2.zip  \
-        && unzip /awscli/awscliv2.zip -d /awscli/ \
-        && /awscli/aws/install --bin-dir /usr/local/bin --install-dir /usr/local/aws-cli --update \
-        && rm -rf /awscli
-      EOT
-
-      min_size     = var.trn1_32xl_min_size
-      max_size     = 4
-      desired_size = var.trn1_32xl_desired_size
-
-      # This will:
-      # 1. Create a placement group to place the instances close to one another
-      # 2. Ignore subnets that reside in AZs that do not support the instance type
-      # 3. Expose all of the available EFA interfaces on the launch template
-      enable_efa_support = true
-
-      labels = {
-        "vpc.amazonaws.com/efa.present" = "true"
-        instance-type                   = "trn1-32xl"
-        provisioner                     = "cluster-autoscaler"
-      }
-
-      taints = [
-        {
-          key    = "aws.amazon.com/neuron",
-          value  = true,
-          effect = "NO_SCHEDULE"
-        }
-      ]
-
-      tags = merge(local.tags, {
-        Name = "trn1-32xl-ng1",
-      })
-    }
-
-    #--------------------------------------------------
-    # Trainium node group for Trn1n.32xlarge
-    #--------------------------------------------------
-    trn1n-32xl-ng = {
-      name        = "trn1n-32xl-ng"
-      description = "trn1n 32xlarge node group for hosting ML workloads"
-      # All trn1 instances should be launched into the same subnet in the preferred trn1 AZ
-      # The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf.
-      # We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
-      #   module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
-      subnet_ids = [module.vpc.private_subnets[2]]
-      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2
-      # ami_id   = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type
-      ami_type       = "AL2_x86_64_GPU" # Contains Neuron driver
-      instance_types = ["trn1n.32xlarge"]
-
-      pre_bootstrap_user_data = <<-EOT
-        # Mount instance store volumes in RAID-0 for kubelet and containerd
-        # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
-        /bin/setup-local-disks raid0
-
-        # Install Neuron monitoring tools
-        yum install aws-neuronx-tools-2.* -y
-        export PATH=/opt/aws/neuron/bin:$PATH
-      EOT
-
-      min_size     = var.trn1n_32xl_min_size
-      max_size     = 2
-      desired_size = var.trn1n_32xl_desired_size
-
-      # This will:
-      # 1. Create a placement group to place the instances close to one another
-      # 2. Ignore subnets that reside in AZs that do not support the instance type
-      # 3. Expose all of the available EFA interfaces on the launch template
-      enable_efa_support = true
-
-      labels = {
-        instance-type                   = "trn1n-32xl"
-        provisioner                     = "cluster-autoscaler"
-        "vpc.amazonaws.com/efa.present" = "true"
-      }
-
-      taints = [
-        {
-          key    = "aws.amazon.com/neuron",
-          value  = true,
-          effect = "NO_SCHEDULE"
-        }
-      ]
-
-      tags = merge(local.tags, {
-        Name = "trn1n-32xl-ng1",
-      })
-    }
-
-    #--------------------------------------------------
-    # Inferentia2 Spot node group
-    #--------------------------------------------------
-    inf2-24xl-ng = {
-      name        = "inf2-24xl-ng"
-      description = "inf2 24xl node group for ML inference workloads"
-      # We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
-      #   module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
-      subnet_ids = [module.vpc.private_subnets[2]]
-
-      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2
-      # ami_id   = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type
-      ami_type       = "AL2_x86_64_GPU"
-      capacity_type  = "ON_DEMAND" # Use SPOT for Spot instances
-      instance_types = ["inf2.24xlarge"]
-
-      pre_bootstrap_user_data = <<-EOT
-        # Mount instance store volumes in RAID-0 for kubelet and containerd
-        # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
-        /bin/setup-local-disks raid0
-
-        # Install Neuron monitoring tools
-        yum install aws-neuronx-tools-2.* -y
-        export PATH=/opt/aws/neuron/bin:$PATH
-      EOT
-
-      min_size     = var.inf2_24xl_min_size
-      max_size     = 2
-      desired_size = var.inf2_24xl_desired_size
-
-      labels = {
-        instanceType    = "inf2-24xl"
-        provisionerType = "cluster-autoscaler"
-      }
-
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 500
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      taints = [
-        {
-          key    = "aws.amazon.com/neuron",
-          value  = "true",
-          effect = "NO_SCHEDULE"
-        }
-      ]
-
-      tags = merge(local.tags, {
-        Name                     = "inf2-24xl-ng",
-        "karpenter.sh/discovery" = local.name
-      })
-    }
-
-    inf2-48xl-ng = {
-      name        = "inf2-48xl-ng"
-      description = "inf2 48x large node group for ML inference workloads"
-      # We use index 2 to select the subnet in AZ1 with the 100.x CIDR:
-      #   module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
-      subnet_ids = [module.vpc.private_subnets[2]]
-
-      # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2
-      # ami_id   = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type
-      ami_type       = "AL2_x86_64_GPU"
-      capacity_type  = "ON_DEMAND" # Use SPOT for Spot instances
-      instance_types = ["inf2.48xlarge"]
-
-      pre_bootstrap_user_data = <<-EOT
-        # Mount instance store volumes in RAID-0 for kubelet and containerd
-        # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
-        /bin/setup-local-disks raid0
-
-        # Install Neuron monitoring tools
-        yum install aws-neuronx-tools-2.* -y
-        export PATH=/opt/aws/neuron/bin:$PATH
-      EOT
-
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 500
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      min_size     = var.inf2_48xl_min_size
-      max_size     = 2
-      desired_size = var.inf2_48xl_desired_size
-
-      labels = {
-        instanceType    = "inf2-48xl"
-        provisionerType = "cluster-autoscaler"
-      }
-
-      taints = [
-        {
-          key    = "aws.amazon.com/neuron",
-          value  = true,
-          effect = "NO_SCHEDULE"
-        }
-      ]
-
-      tags = merge(local.tags, {
-        Name = "inf2-48xl-ng",
-      })
-    }
-  }
-
-  tags = merge(local.tags, {
-    # NOTE - if creating multiple security groups with this module, only tag the
-    # security group that Karpenter should utilize with the following tag
-    # (i.e. - at most, only one security group should have this tag in your account)
-    "karpenter.sh/discovery" = local.name
-  })
-}
-
-
-################################################################################
-# Karpenter Controller & Node IAM roles, SQS Queue, Eventbridge Rules
-################################################################################
-
-module "karpenter" {
-  source  = "terraform-aws-modules/eks/aws//modules/karpenter"
-  version = "~> 20.24"
-
-  cluster_name          = module.eks.cluster_name
-  enable_v1_permissions = true
-
-  # Use Pod Identity
-  enable_pod_identity             = true
-  create_pod_identity_association = true
-
-  # Used to attach additional IAM policies to the Karpenter node IAM role
-  node_iam_role_additional_policies = {
-    AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-  }
-
-  tags = local.tags
-}
-
-################################################################################
-# Karpenter Helm chart
-################################################################################
-
-resource "helm_release" "karpenter" {
-  name                = "karpenter"
-  namespace           = "kube-system"
-  create_namespace    = true
-  repository          = "oci://public.ecr.aws/karpenter"
-  repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-  repository_password = data.aws_ecrpublic_authorization_token.token.password
-  chart               = "karpenter"
-  version             = "1.0.6"
-  wait                = true
-
-  values = [
-    <<-EOT
-    settings:
-      clusterName: ${module.eks.cluster_name}
-      clusterEndpoint: ${module.eks.cluster_endpoint}
-      interruptionQueue: ${module.karpenter.queue_name}
-    serviceAccount:
-      name: ${module.karpenter.service_account}
-    EOT
-  ]
-
-  lifecycle {
-    ignore_changes = [
-      repository_password
-    ]
-  }
-}
diff --git a/ai-ml/trainium-inferentia/elastic-cache-redis.tf b/ai-ml/trainium-inferentia/elastic-cache-redis.tf
deleted file mode 100644
index df3c3c6a8..000000000
--- a/ai-ml/trainium-inferentia/elastic-cache-redis.tf
+++ /dev/null
@@ -1,57 +0,0 @@
-#-------------------------------------------
-# For Rayhead High availability cluster
-#-------------------------------------------
-module "elasticache" {
-  create  = var.enable_rayserve_ha_elastic_cache_redis
-  source  = "terraform-aws-modules/elasticache/aws"
-  version = "1.2.0"
-
-  cluster_id               = local.name
-  create_cluster           = true
-  create_replication_group = false
-
-  engine_version = "7.1"
-  node_type      = "cache.t4g.small"
-
-  apply_immediately = true
-
-  # Security Group
-  vpc_id = module.vpc.vpc_id
-  security_group_rules = {
-    ingress_vpc = {
-      # Default type is `ingress`
-      # Default port is based on the default engine port
-      description = "VPC traffic"
-      cidr_ipv4   = module.vpc.vpc_cidr_block
-    }
-
-    ingress_from_eks_worker_node_tcp = {
-      description                  = "Ingress rule to allow TCP on port 6379 from EKS Ray Head Node"
-      protocol                     = "tcp"
-      from_port                    = 6379
-      referenced_security_group_id = module.eks.node_security_group_id
-      to_port                      = 6379
-      type                         = "ingress"
-    }
-  }
-
-  # Subnet Group
-  subnet_group_name        = local.name
-  subnet_group_description = "${title(local.name)} subnet group"
-  subnet_ids               = module.vpc.private_subnets
-
-  # Parameter Group
-  create_parameter_group      = true
-  parameter_group_name        = local.name
-  parameter_group_family      = "redis7"
-  parameter_group_description = "${title(local.name)} parameter group"
-  parameters = [
-    {
-      name  = "latency-tracking"
-      value = "yes"
-    }
-  ]
-
-  tags = local.tags
-
-}
diff --git a/ai-ml/trainium-inferentia/fsx-for-lustre.tf b/ai-ml/trainium-inferentia/fsx-for-lustre.tf
deleted file mode 100644
index 6c88aa2cb..000000000
--- a/ai-ml/trainium-inferentia/fsx-for-lustre.tf
+++ /dev/null
@@ -1,118 +0,0 @@
-#---------------------------------------------------------------
-# FSx for Lustre File system Static provisioning
-#    1> Create Fsx for Lustre filesystem (Lustre FS storage capacity must be 1200, 2400, or a multiple of 3600)
-#    2> Create Storage Class for Filesystem (Cluster scoped)
-#    3> Persistent Volume with  Hardcoded reference to Fsx for Lustre filesystem with filesystem_id and dns_name (Cluster scoped)
-#    4> Persistent Volume claim for this persistent volume will always use the same file system (Namespace scoped)
-#---------------------------------------------------------------
-
-#---------------------------------------------------------------
-# Sec group for FSx for Lustre
-#---------------------------------------------------------------
-resource "aws_security_group" "fsx" {
-  count = var.enable_fsx_for_lustre ? 1 : 0
-
-  name        = "${local.name}-fsx"
-  description = "Allow inbound traffic from private subnets of the VPC to FSx filesystem"
-  vpc_id      = module.vpc.vpc_id
-
-  ingress {
-    description = "Allows Lustre traffic between Lustre clients"
-    cidr_blocks = module.vpc.private_subnets_cidr_blocks
-    from_port   = 1021
-    to_port     = 1023
-    protocol    = "tcp"
-  }
-  ingress {
-    description = "Allows Lustre traffic between Lustre clients"
-    cidr_blocks = module.vpc.private_subnets_cidr_blocks
-    from_port   = 988
-    to_port     = 988
-    protocol    = "tcp"
-  }
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# Storage Class - FSx for Lustre
-#---------------------------------------------------------------
-resource "kubectl_manifest" "fsx_storageclass" {
-  count = var.enable_fsx_for_lustre ? 1 : 0
-
-  yaml_body = <<YAML
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  name: fsx
-provisioner: fsx.csi.aws.com
-parameters:
-  subnetId: ${module.vpc.private_subnets[0]}
-  securityGroupIds: ${aws_security_group.fsx[0].id}
-  deploymentType: PERSISTENT_1
-  automaticBackupRetentionDays: "1"
-  dailyAutomaticBackupStartTime: "00:00"
-  copyTagsToBackups: "true"
-  perUnitStorageThroughput: "200"
-  dataCompressionType: "NONE"
-  weeklyMaintenanceStartTime: "7:09:00"
-  fileSystemTypeVersion: "2.12"
-mountOptions:
-  - flock
-YAML
-
-  depends_on = [module.eks.eks_cluster_id]
-}
-
-#---------------------------------------------------------------
-# Static PV for FSx for Lustre
-# Don't change the metadata.name `fsx-claim` as this is referenced in lib/trn1_dist_ddp.py script
-#---------------------------------------------------------------
-resource "kubectl_manifest" "static_pv" {
-  count = var.enable_fsx_for_lustre ? 1 : 0
-
-  yaml_body = <<YAML
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: fsx-claim
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: fsx
-  resources:
-    requests:
-      storage: 1200Gi
-YAML
-
-  depends_on = [resource.kubectl_manifest.fsx_storageclass]
-}
-
-#---------------------------------------------------------------
-# AWS CLI Command shell pod to copy the files Training dataset from S3 to FSx for Lustre
-#---------------------------------------------------------------
-resource "kubectl_manifest" "cmd_shell_fsx" {
-  count = var.enable_fsx_for_lustre ? 1 : 0
-
-  yaml_body = <<YAML
-apiVersion: v1
-kind: Pod
-metadata:
-  name: cmd-shell
-spec:
-  containers:
-  - name: app
-    image: public.ecr.aws/data-on-eks/cmd-shell:py3-awscli2
-    command: ["/bin/sh", "-c"]
-    args: ["while true; do sleep 30; done"]
-    volumeMounts:
-    - name: persistent-storage
-      mountPath: /data
-  volumes:
-  - name: persistent-storage
-    persistentVolumeClaim:
-      claimName: fsx-claim
-  restartPolicy: Never
-YAML
-
-  depends_on = [resource.kubectl_manifest.static_pv]
-}
diff --git a/ai-ml/trainium-inferentia/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/trainium-inferentia/helm-values/aws-cloudwatch-metrics-values.yaml
deleted file mode 100644
index ae3c41d44..000000000
--- a/ai-ml/trainium-inferentia/helm-values/aws-cloudwatch-metrics-values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-resources:
-  limits:
-    cpu: 500m
-    memory: 2Gi
-  requests:
-    cpu: 200m
-    memory: 1Gi
-
-# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints.
-tolerations:
-  - operator: Exists
diff --git a/ai-ml/trainium-inferentia/helm-values/aws-for-fluentbit-values.yaml b/ai-ml/trainium-inferentia/helm-values/aws-for-fluentbit-values.yaml
deleted file mode 100644
index 0bea5188d..000000000
--- a/ai-ml/trainium-inferentia/helm-values/aws-for-fluentbit-values.yaml
+++ /dev/null
@@ -1,102 +0,0 @@
-global:
-
-#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server
-# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata
-hostNetwork: true
-dnsPolicy: ClusterFirstWithHostNet
-
-service:
-  parsersFiles:
-    - /fluent-bit/parsers/parsers.conf
-  extraParsers: |
-    [PARSER]
-        Name    kubernetes
-        Format  regex
-        Regex   ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$
-
-input:
-  name: "tail"
-  enabled: true
-  tag: "systempods.<namespace_name>.<container_name>.<pod_name>.<docker_id>-"
-  path: "/var/log/containers/*.log"
-  db: "/var/log/flb_kube.db"
-  memBufLimit: 5MB
-  skipLongLines: "On"
-  refreshInterval: 10
-  extraInputs: |
-    multiline.parser  docker, cri
-    Tag_Regex         (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
-
-
-# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters
-filter:
-  name: "kubernetes"
-  match: "systempods.*"
-  kubeURL: "https://kubernetes.default.svc.cluster.local:443"
-  mergeLog: "On"
-  mergeLogKey: "log_processed"
-  keepLog: "On"
-  k8sLoggingParser: "On"
-  k8sLoggingExclude: "Off"
-  bufferSize: "0"
-  extraFilters: |
-    Kube_Tag_Prefix     systempods.
-    Regex_Parser        kubernetes
-    Labels              On
-    Annotations         Off
-    Use_Kubelet         true
-    Kubelet_Port        10250
-    Kube_CA_File        /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-    Kube_Token_File     /var/run/secrets/kubernetes.io/serviceaccount/token
-
-# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance.
-# cloudWatch:
-#   enabled: false
-
-# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
-cloudWatchLogs:
-  enabled: true
-  match: "systempods.*"
-  region: ${region}
-  logGroupName: ${cloudwatch_log_group}
-  autoCreateGroup: false
-  extraOutputs: |
-    log_key               log
-
-#----------------------------------------------------------#
-# OUTPUT logs to S3
-#----------------------------------------------------------#
-
-# This is an example for writing logs to S3 bucket.
-# This example writes system pod logs and spark logs into dedicated prefix.
-# This second output is using the rewrite_tag filter commented above
-
-additionalOutputs: |
-  [OUTPUT]
-      Name                            s3
-      Match                           systempods.*
-      region                          ${region}
-      bucket                          ${s3_bucket_name}
-      total_file_size                 100M
-      s3_key_format                   /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log
-      s3_key_format_tag_delimiters    ..
-      store_dir                       /home/ec2-user/buffer
-      upload_timeout                  10m
-      log_key                         log
-
-
-# Resource config for large clusters
-resources:
-  limits:
-    cpu: 1000m
-    memory: 1500Mi
-  requests:
-    cpu: 500m
-    memory: 500Mi
-
-## Assign a PriorityClassName to pods if set
-priorityClassName: system-node-critical
-
-# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints.
-tolerations:
-  - operator: Exists
diff --git a/ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml b/ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml
deleted file mode 100644
index 1b1d9af71..000000000
--- a/ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-# Best practice to update the resource requests and limits for each add-on
-resources:
-   limits:
-     cpu: 1000m
-     memory: 1G
-   requests:
-     cpu: 200m
-     memory: 512Mi
-
-# Best practice to updateStrategy for each add-on
-updateStrategy:
-  type: RollingUpdate
-  rollingUpdate:
-    maxSurge: 0
-    maxUnavailable: 1
diff --git a/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml b/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml
deleted file mode 100644
index 10ae9bfc2..000000000
--- a/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-controller:
-  service:
-    externalTrafficPolicy: "Local"
-    annotations:
-      service.beta.kubernetes.io/aws-load-balancer-type: external # nlb-ip or external
-      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
-      service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Valid values are internal, internet-facing
-    targetPorts:
-      http: http
-      https: http
diff --git a/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml b/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml
deleted file mode 100644
index 8582145fd..000000000
--- a/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml
+++ /dev/null
@@ -1,139 +0,0 @@
-hub:
-  extraConfig:
-    jupyterhub_config.py: |-
-      c.KubeSpawner.start_timeout = 1200
-
-proxy:
-  https:
-    enabled: false
-    type: offload
-  service:
-    type: ClusterIP
-singleuser:
-  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
-  profileList:
-    - display_name: Trainium (trn1)
-      description: "Trainium | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch 1.13.1 + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          karpenter.sh/nodepool: trainium-trn1 # Label is added by the karpenter to the nodes. `trainium-trn1` is the nodepool name created by this blueprint
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated"
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        # trn1.32xlarge | 16 Neurons (32 cores) | 512 GB Accelerator memory | 128 vCPus and 512 GiB
-        cpu_guarantee: 100
-        mem_guarantee: 450G
-        cpu_limit: 120
-        mem_limit: 500G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "16"
-        cmd: "start-singleuser.sh"
-    - display_name: Inferentia (inf2)
-      description: "Inferentia | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch 1.13.1 + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          karpenter.sh/nodepool: inferentia-inf2 # Label is added by the karpenter to the nodes. `inferentia-inf2` is the nodepool name created by this blueprint
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        cpu_guarantee: 90     # 96 vCPU for inf2.24x large
-        mem_guarantee: 300G   # 384Gib for inf2.24x large
-        cpu_limit: 90
-        mem_limit: 300G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "6"   # 12 NeuronCores , 384 GB Memory, vCPU 192, Mem 768 GB
-        cmd: null
-  storage:
-    type: "static"
-    static:
-      pvcName: "efs-persist"
-      subPath: "home/{username}"
-    extraVolumes:
-    - name: jupyterhub-shared
-      persistentVolumeClaim:
-        claimName: efs-persist-shared
-    extraVolumeMounts:
-    - name: jupyterhub-shared
-      mountPath: /home/shared
-      readOnly: false
-  serviceAccountName: ${jupyter_single_user_sa_name}
-  allowPrivilegeEscalation: true
-  extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account
-    securityContext:
-        fsGroup: 100
-  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
-    GRANT_SUDO: "yes"
-    NOTEBOOK_ARGS: "--allow-root"
-    CHOWN_HOME: "yes"
-    CHOWN_HOME_OPTS: "-R"
-    CHOWN_EXTRA: "/home/shared"
-  uid: 0
-  fsGid: 0
-  cmd: null
-
-# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-scheduling:
-  userScheduler:
-    enabled: true
-  podPriority:
-    enabled: true
-  userPlaceholder:
-    enabled: false
-    replicas: 1
-  userPods:
-    nodeAffinity:
-      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
-
-prePuller:
-  hook:
-    enabled: false
-  continuous:
-    # NOTE: if used with Karpenter, also add user-placeholders
-    enabled: false
-
-global:
-  safeToShowValues: false
diff --git a/ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml b/ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml
deleted file mode 100644
index 498fb2824..000000000
--- a/ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-prometheus:
-  prometheusSpec:
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: ${storage_class_type}
-          accessModes:
-            - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
diff --git a/ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml b/ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml
deleted file mode 100644
index 026d97a6a..000000000
--- a/ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# HA config for metrics-server
-image:
-  repository: registry.k8s.io/metrics-server/metrics-server
-  pullPolicy: IfNotPresent
-
-serviceAccount:
-  create: true
-  name: metrics-server
-
-rbac:
-  create: true
-  pspEnabled: false
-
-apiService:
-  create: true
-
-podLabels:
-  k8s-app: metrics-server
-
-# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true
-replicas: 2
-
-updateStrategy:
-   type: RollingUpdate
-   rollingUpdate:
-     maxSurge: 0
-     maxUnavailable: 1
-
-podDisruptionBudget:
-  enabled: true
-  minAvailable: 1
-
-defaultArgs:
-  - --cert-dir=/tmp
-  - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
-  - --kubelet-use-node-status-port
-  - --metric-resolution=15s
-
-resources:
-  requests:
-    cpu: 200m
-    memory: 512Mi
-
-affinity:
-  podAntiAffinity:
-    requiredDuringSchedulingIgnoredDuringExecution:
-      - labelSelector:
-          matchLabels:
-            k8s-app: metrics-server
-        namespaces:
-          - kube-system
-        topologyKey: kubernetes.io/hostname
diff --git a/ai-ml/trainium-inferentia/jupyterhub.tf b/ai-ml/trainium-inferentia/jupyterhub.tf
deleted file mode 100644
index 8d2754597..000000000
--- a/ai-ml/trainium-inferentia/jupyterhub.tf
+++ /dev/null
@@ -1,181 +0,0 @@
-#-----------------------------------------------------------------------------------------
-# JupyterHub Single User IRSA, maybe that block could be incorporated in add-on registry
-#-----------------------------------------------------------------------------------------
-resource "kubernetes_namespace_v1" "jupyterhub" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  metadata {
-    name = "jupyterhub"
-  }
-}
-
-module "jupyterhub_single_user_irsa" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
-
-  role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa"
-
-  role_policy_arns = {
-    policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based in what you need to give access to your notebook instances.
-  }
-
-  oidc_providers = {
-    main = {
-      provider_arn               = module.eks.oidc_provider_arn
-      namespace_service_accounts = ["${kubernetes_namespace_v1.jupyterhub[0].metadata[0].name}:jupyterhub-single-user"]
-    }
-  }
-}
-
-resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  metadata {
-    name        = "${module.eks.cluster_name}-jupyterhub-single-user"
-    namespace   = kubernetes_namespace_v1.jupyterhub[0].metadata[0].name
-    annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa[0].iam_role_arn }
-  }
-
-  automount_service_account_token = true
-}
-
-resource "kubernetes_secret_v1" "jupyterhub_single_user" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  metadata {
-    name      = "${module.eks.cluster_name}-jupyterhub-single-user-secret"
-    namespace = kubernetes_namespace_v1.jupyterhub[0].metadata[0].name
-    annotations = {
-      "kubernetes.io/service-account.name"      = kubernetes_service_account_v1.jupyterhub_single_user_sa[0].metadata[0].name
-      "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.jupyterhub[0].metadata[0].name
-    }
-  }
-
-  type = "kubernetes.io/service-account-token"
-}
-
-#---------------------------------------------------------------
-# EFS Filesystem for private volumes per user
-# This will be replaced with Dynamic EFS provision using EFS CSI Driver
-#---------------------------------------------------------------
-resource "aws_efs_file_system" "efs" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  creation_token = "efs-jupyter-single-user"
-  encrypted      = true
-
-  tags = local.tags
-}
-
-resource "aws_efs_mount_target" "efs_mt" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  file_system_id  = aws_efs_file_system.efs[0].id
-  subnet_id       = module.vpc.private_subnets[2]
-  security_groups = [aws_security_group.efs[0].id]
-}
-
-resource "aws_security_group" "efs" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  name        = "${local.name}-efs"
-  description = "Allow inbound NFS traffic from private subnets of the VPC"
-  vpc_id      = module.vpc.vpc_id
-
-  ingress {
-    description = "Allow NFS 2049/tcp"
-    cidr_blocks = module.vpc.vpc_secondary_cidr_blocks
-    from_port   = 2049
-    to_port     = 2049
-    protocol    = "tcp"
-  }
-
-  tags = local.tags
-}
-
-resource "kubectl_manifest" "pv" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  yaml_body = <<YAML
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: efs-persist
-  namespace: jupyterhub
-spec:
-  capacity:
-    storage: 123Gi
-  accessModes:
-    - ReadWriteMany
-  nfs:
-    server: ${aws_efs_file_system.efs[0].dns_name}
-    path: "/"
-YAML
-
-  depends_on = [module.eks_blueprints_addons]
-}
-
-resource "kubectl_manifest" "pvc" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  yaml_body = <<YAML
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: efs-persist
-  namespace: jupyterhub
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: ""
-  resources:
-    requests:
-      storage: 1Gi
-YAML
-
-  depends_on = [module.eks_blueprints_addons]
-}
-
-resource "kubectl_manifest" "pv_shared" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  yaml_body = <<YAML
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: efs-persist-shared
-  namespace: jupyterhub
-spec:
-  capacity:
-    storage: 123Gi
-  accessModes:
-    - ReadWriteMany
-  nfs:
-    server: ${aws_efs_file_system.efs[0].dns_name}
-    path: "/"
-YAML
-
-  depends_on = [module.eks_blueprints_addons]
-}
-
-resource "kubectl_manifest" "pvc_shared" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  yaml_body = <<YAML
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: efs-persist-shared
-  namespace: jupyterhub
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: ""
-  resources:
-    requests:
-      storage: 1Gi
-YAML
-
-  depends_on = [module.eks_blueprints_addons]
-}
diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf
deleted file mode 100755
index 93f09df4e..000000000
--- a/ai-ml/trainium-inferentia/main.tf
+++ /dev/null
@@ -1,75 +0,0 @@
-provider "aws" {
-  region = local.region
-}
-
-provider "aws" {
-  alias  = "ecr"
-  region = "us-east-1"
-}
-
-provider "kubernetes" {
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-
-  exec {
-    api_version = "client.authentication.k8s.io/v1beta1"
-    command     = "aws"
-    # This requires the awscli to be installed locally where Terraform is executed
-    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
-  }
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = module.eks.cluster_endpoint
-    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-
-    exec {
-      api_version = "client.authentication.k8s.io/v1beta1"
-      command     = "aws"
-      # This requires the awscli to be installed locally where Terraform is executed
-      args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
-    }
-  }
-}
-
-provider "kubectl" {
-  apply_retry_count      = 5
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  load_config_file       = false
-
-  exec {
-    api_version = "client.authentication.k8s.io/v1beta1"
-    command     = "aws"
-    # This requires the awscli to be installed locally where Terraform is executed
-    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
-  }
-}
-
-data "aws_ecrpublic_authorization_token" "token" {
-  provider = aws.ecr
-}
-
-data "aws_caller_identity" "current" {}
-
-data "aws_iam_session_context" "current" {
-  arn = data.aws_caller_identity.current.arn
-}
-
-locals {
-  name   = var.name
-  region = var.region
-  # Trn1 and Inf2 instances are available in specific AZs in us-east-1,
-  # us-east-2, and us-west-2. For Trn1, the first AZ id (below) should be used.
-  az_mapping = {
-    "us-west-2" = ["usw2-az4", "usw2-az1"],
-    "us-east-1" = ["use1-az6", "use1-az5"],
-    "us-east-2" = ["use2-az3", "use2-az1"]
-  }
-  azs = local.az_mapping[var.region]
-  tags = {
-    Blueprint  = local.name
-    GithubRepo = "github.com/awslabs/data-on-eks"
-  }
-}
diff --git a/ai-ml/trainium-inferentia/outputs.tf b/ai-ml/trainium-inferentia/outputs.tf
deleted file mode 100755
index f03eeab29..000000000
--- a/ai-ml/trainium-inferentia/outputs.tf
+++ /dev/null
@@ -1,9 +0,0 @@
-output "configure_kubectl" {
-  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
-  value       = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}"
-}
-
-output "elastic_cache_redis_cluster_arn" {
-  description = "Cluster arn of the cache cluster"
-  value       = module.elasticache.cluster_arn
-}
diff --git a/ai-ml/trainium-inferentia/versions.tf b/ai-ml/trainium-inferentia/versions.tf
deleted file mode 100755
index 264755c06..000000000
--- a/ai-ml/trainium-inferentia/versions.tf
+++ /dev/null
@@ -1,37 +0,0 @@
-terraform {
-  required_version = ">= 1.3.2"
-
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = ">= 5.61"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = ">= 2.10"
-    }
-    helm = {
-      source  = "hashicorp/helm"
-      version = ">= 2.4.1"
-    }
-    kubectl = {
-      source  = "alekc/kubectl"
-      version = ">= 2.0"
-    }
-    random = {
-      source  = "hashicorp/random"
-      version = ">= 3.1"
-    }
-    http = {
-      source  = "hashicorp/http"
-      version = ">= 3.3"
-    }
-  }
-
-  # ##  Used for end-to-end testing on project; update to suit your needs
-  # backend "s3" {
-  #   bucket = "doeks-github-actions-e2e-test-state"
-  #   region = "us-west-2"
-  #   key    = "e2e/trainium-inferentia/terraform.tfstate"
-  # }
-}
diff --git a/ai-ml/trainium-inferentia/vpc.tf b/ai-ml/trainium-inferentia/vpc.tf
deleted file mode 100755
index 59c3da89c..000000000
--- a/ai-ml/trainium-inferentia/vpc.tf
+++ /dev/null
@@ -1,53 +0,0 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
-#---------------------------------------------------------------
-# VPC
-#---------------------------------------------------------------
-# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts.
-# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements
-
-module "vpc" {
-  source  = "terraform-aws-modules/vpc/aws"
-  version = "~> 5.0"
-
-  name = local.name
-  cidr = var.vpc_cidr
-  azs  = local.azs
-
-  # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
-  secondary_cidr_blocks = var.secondary_cidr_blocks
-
-  # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc.
-  private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
-
-  # ------------------------------
-  # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
-  # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
-  public_subnets     = local.public_subnets
-  enable_nat_gateway = true
-  single_nat_gateway = true
-  #-------------------------------
-
-  public_subnet_tags = {
-    "kubernetes.io/role/elb" = 1
-  }
-
-  private_subnet_tags = {
-    "kubernetes.io/role/internal-elb" = 1
-    # Tags subnets for Karpenter auto-discovery
-    "karpenter.sh/discovery" = local.name
-  }
-
-  tags = local.tags
-}

From 06de4c60ca5f569bfc3f6df88f420c8e22a88a6e Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Tue, 28 Jan 2025 15:00:20 -0800
Subject: [PATCH 03/16] add 12xlarge to g5

---
 ai-ml/infrastructure/terraform/addons.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf
index 93cc471cb..3b7a936ed 100644
--- a/ai-ml/infrastructure/terraform/addons.tf
+++ b/ai-ml/infrastructure/terraform/addons.tf
@@ -351,7 +351,7 @@ module "data_addons" {
             values: ["g5"]
           - key: "karpenter.k8s.aws/instance-size"
             operator: In
-            values: [ "2xlarge", "4xlarge", "8xlarge" ]
+            values: [ "2xlarge", "4xlarge", "8xlarge", "12xlarge" ]
           - key: "kubernetes.io/arch"
             operator: In
             values: ["amd64"]

From 0119abd81dc5a8bcbc102b914a5791ebc2b50cfd Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Tue, 28 Jan 2025 15:20:04 -0800
Subject: [PATCH 04/16] add in emr and amp

---
 ai-ml/infrastructure/terraform/addons.tf    |  12 +-
 ai-ml/infrastructure/terraform/amp.tf       | 137 ++++++++++++++++++++
 ai-ml/infrastructure/terraform/eks.tf       |   8 +-
 ai-ml/infrastructure/terraform/emr-eks.tf   |  22 ++++
 ai-ml/infrastructure/terraform/variables.tf |  13 +-
 5 files changed, 185 insertions(+), 7 deletions(-)
 create mode 100644 ai-ml/infrastructure/terraform/amp.tf
 create mode 100644 ai-ml/infrastructure/terraform/emr-eks.tf

diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf
index 3b7a936ed..9ba662a22 100644
--- a/ai-ml/infrastructure/terraform/addons.tf
+++ b/ai-ml/infrastructure/terraform/addons.tf
@@ -155,9 +155,13 @@ module "eks_blueprints_addons" {
   enable_kube_prometheus_stack = var.enable_kube_prometheus_stack
   kube_prometheus_stack = {
     values = [
-      templatefile("${path.module}/helm-values/kube-prometheus.yaml", {
-        storage_class_type = kubernetes_storage_class.default_gp3.id
-      })
+      var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", {
+        region              = local.region
+        amp_sa              = local.amp_ingest_service_account
+        amp_irsa            = module.amp_ingest_irsa[0].iam_role_arn
+        amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write"
+        amp_url             = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}"
+      }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", { storage_class_type = kubernetes_storage_class.default_gp3.id })
     ]
     chart_version = "48.1.1"
     set_sensitive = [
@@ -640,7 +644,7 @@ resource "kubectl_manifest" "dcgm" {
   yaml_body = file("${path.module}/monitoring/dcgm.yaml")
 }
 
-resource "kubectl_manifest" "dcgm" {
+resource "kubectl_manifest" "dcgm_service" {
   yaml_body = file("${path.module}/monitoring/dcgm-service.yaml")
 }
 
diff --git a/ai-ml/infrastructure/terraform/amp.tf b/ai-ml/infrastructure/terraform/amp.tf
new file mode 100644
index 000000000..96df2a495
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/amp.tf
@@ -0,0 +1,137 @@
+# IAM policy for Amazon Managed Prometheus & Grafana (attached to the AMP ingest IRSA role below)
+resource "aws_iam_policy" "grafana" {
+  count = var.enable_amazon_prometheus ? 1 : 0
+
+  description = "IAM policy for Grafana Pod"
+  name_prefix = format("%s-%s-", local.name, "grafana")
+  path        = "/"
+  policy      = data.aws_iam_policy_document.grafana[0].json
+}
+
+data "aws_iam_policy_document" "grafana" {
+  count = var.enable_amazon_prometheus ? 1 : 0
+
+  statement {
+    sid       = "AllowReadingMetricsFromCloudWatch"
+    effect    = "Allow"
+    resources = ["*"]
+
+    actions = [
+      "cloudwatch:DescribeAlarmsForMetric",
+      "cloudwatch:ListMetrics",
+      "cloudwatch:GetMetricData",
+      "cloudwatch:GetMetricStatistics"
+    ]
+  }
+
+  statement {
+    sid       = "AllowGetInsightsCloudWatch"
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"]
+
+    actions = [
+      "cloudwatch:GetInsightRuleReport",
+    ]
+  }
+
+  statement {
+    sid       = "AllowReadingAlarmHistoryFromCloudWatch"
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"]
+
+    actions = [
+      "cloudwatch:DescribeAlarmHistory",
+      "cloudwatch:DescribeAlarms",
+    ]
+  }
+
+  statement {
+    sid       = "AllowReadingLogsFromCloudWatch"
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"]
+
+    actions = [
+      "logs:DescribeLogGroups",
+      "logs:GetLogGroupFields",
+      "logs:StartQuery",
+      "logs:StopQuery",
+      "logs:GetQueryResults",
+      "logs:GetLogEvents",
+    ]
+  }
+
+  statement {
+    sid       = "AllowReadingTagsInstancesRegionsFromEC2"
+    effect    = "Allow"
+    resources = ["*"]
+
+    actions = [
+      "ec2:DescribeTags",
+      "ec2:DescribeInstances",
+      "ec2:DescribeRegions",
+    ]
+  }
+
+  statement {
+    sid       = "AllowReadingResourcesForTags"
+    effect    = "Allow"
+    resources = ["*"]
+    actions   = ["tag:GetResources"]
+  }
+
+  statement {
+    sid    = "AllowListApsWorkspaces"
+    effect = "Allow"
+    resources = [
+      "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*",
+      "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*",
+      "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*",
+    ]
+    actions = [
+      "aps:ListWorkspaces",
+      "aps:DescribeWorkspace",
+      "aps:GetMetricMetadata",
+      "aps:GetSeries",
+      "aps:QueryMetrics",
+      "aps:RemoteWrite",
+      "aps:GetLabels"
+    ]
+  }
+}
+
+#------------------------------------------
+# Amazon Prometheus
+#------------------------------------------
+locals {
+  amp_ingest_service_account = "amp-iamproxy-ingest-service-account"
+  amp_namespace              = "kube-prometheus-stack"
+}
+
+resource "aws_prometheus_workspace" "amp" {
+  count = var.enable_amazon_prometheus ? 1 : 0
+
+  alias = format("%s-%s", "amp-ws", local.name)
+  tags  = local.tags
+}
+
+module "amp_ingest_irsa" {
+  count = var.enable_amazon_prometheus ? 1 : 0
+
+  source         = "aws-ia/eks-blueprints-addon/aws"
+  version        = "~> 1.0"
+  create_release = false
+  create_role    = true
+  create_policy  = false
+  role_name      = format("%s-%s", local.name, "amp-ingest")
+  role_policies  = { amp_policy = aws_iam_policy.grafana[0].arn }
+
+  oidc_providers = {
+    this = {
+      provider_arn    = module.eks.oidc_provider_arn
+      namespace       = local.amp_namespace
+      service_account = local.amp_ingest_service_account
+    }
+  }
+
+  tags = local.tags
+}
diff --git a/ai-ml/infrastructure/terraform/eks.tf b/ai-ml/infrastructure/terraform/eks.tf
index 3543232ec..169b19bac 100644
--- a/ai-ml/infrastructure/terraform/eks.tf
+++ b/ai-ml/infrastructure/terraform/eks.tf
@@ -29,7 +29,13 @@ module "eks" {
         "system:bootstrappers",
         "system:nodes",
       ]
-    }
+    },
+    {
+      # Required for EMR on EKS virtual cluster; ARN uses local.partition (as amp.tf does) instead of a hard-coded "aws"
+      rolearn  = "arn:${local.partition}:iam::${data.aws_caller_identity.current.account_id}:role/AWSServiceRoleForAmazonEMRContainers"
+      username = "emr-containers"
+      groups   = []
+    },
   ]
   #---------------------------------------
   # Note: This can further restricted to specific required for each Add-on and your application
diff --git a/ai-ml/infrastructure/terraform/emr-eks.tf b/ai-ml/infrastructure/terraform/emr-eks.tf
new file mode 100644
index 000000000..ff9280e99
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/emr-eks.tf
@@ -0,0 +1,22 @@
+module "emr_containers" {
+  source  = "terraform-aws-modules/emr/aws//modules/virtual-cluster"
+  version = "~> 1.0"
+
+  for_each = var.enable_amazon_emr ? toset(["ml-team-a", "ml-team-b"]) : toset([])
+
+  eks_cluster_id    = module.eks.cluster_name
+  oidc_provider_arn = module.eks.oidc_provider_arn
+
+  name      = "${module.eks.cluster_name}-emr-${each.value}"
+  namespace = "emr-${each.value}"
+
+  role_name                = "${module.eks.cluster_name}-emr-${each.value}"
+  iam_role_use_name_prefix = false
+  iam_role_description     = "EMR Execution Role for emr-${each.value}"
+  # NOTE: S3 full access is for testing only; restrict it to the required buckets. ARN uses local.partition, as amp.tf does.
+  iam_role_additional_policies = ["arn:${local.partition}:iam::aws:policy/AmazonS3FullAccess"]
+
+  cloudwatch_log_group_name = "/emr-on-eks-logs/${module.eks.cluster_name}/emr-${each.value}/"
+
+  tags = merge(local.tags, { Name = "emr-${each.value}" })
+}
diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf
index f0606cd0e..60618324a 100644
--- a/ai-ml/infrastructure/terraform/variables.tf
+++ b/ai-ml/infrastructure/terraform/variables.tf
@@ -1,6 +1,6 @@
 variable "name" {
   description = "Name of the VPC and EKS Cluster"
-  default     = "ai-stack"
+  default     = "ml-stack"
   type        = string
 }
 
@@ -57,7 +57,16 @@ variable "deploy_fsx_volume" {
   type        = bool
   default     = false
 }
-
+variable "enable_amazon_prometheus" {
+  description = "Enable Amazon Prometheus"
+  type        = bool
+  default     = false
+}
+variable "enable_amazon_emr" {
+  description = "Enable Amazon EMR"
+  type        = bool
+  default     = false
+}
 # Addon Variables
 variable "enable_kube_prometheus_stack" {
   description = "Enable Kube Prometheus addon"

From da356f463e4c48db0429b05b061a201a93f56a8a Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Tue, 28 Jan 2025 15:20:51 -0800
Subject: [PATCH 05/16] add missing kube-prometheus-amp-enable.yaml file

---
 .../kube-prometheus-amp-enable.yaml           | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml

diff --git a/ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml
new file mode 100644
index 000000000..078f33318
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml
@@ -0,0 +1,65 @@
+prometheus:
+  serviceAccount:
+    create: true
+    name: "${amp_sa}"
+    annotations:
+      eks.amazonaws.com/role-arn: "${amp_irsa}"
+  prometheusSpec:
+    remoteWrite:
+      - url: "${amp_remotewrite_url}"
+        sigv4:
+          region: "${region}"
+        queueConfig:
+          maxSamplesPerSend: 1000
+          maxShards: 200
+          capacity: 2500
+    retention: 5h
+    scrapeInterval: 30s
+    evaluationInterval: 30s
+    scrapeTimeout: 10s
+    storageSpec:
+      volumeClaimTemplate:
+        metadata:
+          name: data
+        spec:
+          storageClassName: gp2
+          accessModes:
+            - ReadWriteOnce
+          resources:
+            requests:
+              storage: 50Gi
+    # Scrape Cost metrics for Kubecost add-on
+    # additionalScrapeConfigs:
+    #   - job_name: kubecost
+    #     honor_labels: true
+    #     scrape_interval: 1m
+    #     scrape_timeout: 10s
+    #     metrics_path: /metrics
+    #     scheme: http
+    #     dns_sd_configs:
+    #       - names:
+    #           - kubecost-cost-analyzer.kubecost.svc
+    #         type: 'A'
+    #         port: 9003
+alertmanager:
+  enabled: false
+
+grafana:
+  enabled: true
+  defaultDashboardsEnabled: true
+  # AMP datasource for Grafana; reuses the service account that the prometheus section above creates
+  serviceAccount:
+    create: false
+    name: "${amp_sa}"
+  grafana.ini:
+    auth:
+      sigv4_auth_enabled: true
+  additionalDataSources:
+    - name: AMP
+      editable: true
+      jsonData:
+        sigV4Auth: true
+        sigV4Region: "${region}"
+      type: prometheus
+      isDefault: false
+      url: "${amp_url}"

From a6c978315955eb277fa36a6e20fefb5e9b2e8041 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Tue, 28 Jan 2025 15:28:27 -0800
Subject: [PATCH 06/16] add missing dcgm components

---
 ai-ml/emr-spark-rapids/addons.tf              | 301 ------------------
 ai-ml/emr-spark-rapids/amp.tf                 | 137 --------
 ai-ml/emr-spark-rapids/cleanup.sh             |  55 ----
 ai-ml/emr-spark-rapids/eks.tf                 | 218 -------------
 ai-ml/emr-spark-rapids/emr-eks.tf             |  22 --
 .../aws-cloudwatch-metrics-values.yaml        |  11 -
 .../helm-values/aws-for-fluentbit-values.yaml | 102 ------
 .../cluster-autoscaler-values.yaml            |  25 --
 .../coredns-autoscaler-values.yaml            |  40 ---
 .../kube-prometheus-amp-enable.yaml           |  65 ----
 .../helm-values/kube-prometheus.yaml          |  36 ---
 .../helm-values/kubecost-values.yaml          |  62 ----
 .../helm-values/metrics-server-values.yaml    |  52 ---
 .../helm-values/nvidia-operator-values.yaml   |  96 ------
 ai-ml/emr-spark-rapids/main.tf                |  61 ----
 ai-ml/emr-spark-rapids/outputs.tf             |  51 ---
 ai-ml/emr-spark-rapids/providers.tf           |   0
 ai-ml/emr-spark-rapids/versions.tf            |  33 --
 ai-ml/emr-spark-rapids/vpc.tf                 |  50 ---
 .../terraform/monitoring/dcgm-service.yaml    |  15 +
 .../terraform/monitoring/dcgm.yaml            |  18 +-
 21 files changed, 16 insertions(+), 1434 deletions(-)
 delete mode 100644 ai-ml/emr-spark-rapids/addons.tf
 delete mode 100644 ai-ml/emr-spark-rapids/amp.tf
 delete mode 100755 ai-ml/emr-spark-rapids/cleanup.sh
 delete mode 100644 ai-ml/emr-spark-rapids/eks.tf
 delete mode 100644 ai-ml/emr-spark-rapids/emr-eks.tf
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml
 delete mode 100755 ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml
 delete mode 100644 ai-ml/emr-spark-rapids/main.tf
 delete mode 100644 ai-ml/emr-spark-rapids/outputs.tf
 delete mode 100644 ai-ml/emr-spark-rapids/providers.tf
 delete mode 100644 ai-ml/emr-spark-rapids/versions.tf
 delete mode 100644 ai-ml/emr-spark-rapids/vpc.tf
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml

diff --git a/ai-ml/emr-spark-rapids/addons.tf b/ai-ml/emr-spark-rapids/addons.tf
deleted file mode 100644
index 408277b00..000000000
--- a/ai-ml/emr-spark-rapids/addons.tf
+++ /dev/null
@@ -1,301 +0,0 @@
-#---------------------------------------------------------------
-# IRSA for EBS CSI Driver
-#---------------------------------------------------------------
-module "ebs_csi_driver_irsa" {
-  source                = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
-  version               = "~> 5.34"
-  role_name_prefix      = format("%s-%s-", local.name, "ebs-csi-driver")
-  attach_ebs_csi_policy = true
-  oidc_providers = {
-    main = {
-      provider_arn               = module.eks.oidc_provider_arn
-      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
-    }
-  }
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# EKS Blueprints Addons
-#---------------------------------------------------------------
-module "eks_blueprints_addons" {
-  source  = "aws-ia/eks-blueprints-addons/aws"
-  version = "~> 1.2"
-
-  cluster_name      = module.eks.cluster_name
-  cluster_endpoint  = module.eks.cluster_endpoint
-  cluster_version   = module.eks.cluster_version
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------
-  # Amazon EKS Managed Add-ons
-  #---------------------------------------
-  eks_addons = {
-    aws-ebs-csi-driver = {
-      service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
-    }
-    coredns = {
-      preserve = true
-    }
-    vpc-cni = {
-      preserve = true
-    }
-    kube-proxy = {
-      preserve = true
-    }
-  }
-
-  #---------------------------------------
-  # Kubernetes Add-ons
-  #---------------------------------------
-  #---------------------------------------------------------------
-  # CoreDNS Autoscaler helps to scale for large EKS Clusters
-  #   Further tuning for CoreDNS is to leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/
-  #---------------------------------------------------------------
-  enable_cluster_proportional_autoscaler = true
-  cluster_proportional_autoscaler = {
-    values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", {
-      target = "deployment/coredns"
-    })]
-    description = "Cluster Proportional Autoscaler for CoreDNS Service"
-  }
-
-  #---------------------------------------
-  # Metrics Server
-  #---------------------------------------
-  enable_metrics_server = true
-  metrics_server = {
-    values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Cluster Autoscaler
-  #---------------------------------------
-  enable_cluster_autoscaler = true
-  cluster_autoscaler = {
-    values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", {
-      aws_region     = var.region,
-      eks_cluster_id = module.eks.cluster_name
-    })]
-  }
-
-  #---------------------------------------
-  # Karpenter Autoscaler for EKS Cluster
-  #---------------------------------------
-  enable_karpenter                  = true
-  karpenter_enable_spot_termination = true
-  karpenter_node = {
-    iam_role_additional_policies = {
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-  karpenter = {
-    chart_version       = "v0.34.0"
-    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-    repository_password = data.aws_ecrpublic_authorization_token.token.password
-  }
-
-  #---------------------------------------
-  # CloudWatch metrics for EKS
-  #---------------------------------------
-  enable_aws_cloudwatch_metrics = true
-  aws_cloudwatch_metrics = {
-    values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Prommetheus and Grafana stack
-  #---------------------------------------
-  #---------------------------------------------------------------
-  # Install Kafka Monitoring Stack with Prometheus and Grafana
-  # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
-  # 2- Grafana Admin user: admin
-  # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id <output.grafana_secret_name> --region $AWS_REGION --query "SecretString" --output text`
-  #---------------------------------------------------------------
-  enable_kube_prometheus_stack = true
-  kube_prometheus_stack = {
-    values = [
-      var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", {
-        region              = local.region
-        amp_sa              = local.amp_ingest_service_account
-        amp_irsa            = module.amp_ingest_irsa[0].iam_role_arn
-        amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write"
-        amp_url             = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}"
-      }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {})
-    ]
-    chart_version = "48.1.1"
-    set_sensitive = [
-      {
-        name  = "grafana.adminPassword"
-        value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
-      }
-    ],
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# Data on EKS Kubernetes Addons
-#---------------------------------------------------------------
-module "eks_data_addons" {
-  source  = "aws-ia/eks-data-addons/aws"
-  version = "1.33.0" # ensure to update this to the latest/desired version
-
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  enable_karpenter_resources = true
-  karpenter_resources_helm_config = {
-    spark-gpu-karpenter = {
-      values = [
-        <<-EOT
-      name: spark-gpu-karpenter
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodeGroupType: spark-executor-gpu-karpenter
-        taints:
-          - key: nvidia.com/gpu
-            value: "Exists"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["g5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: [ "2xlarge" ]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["spot", "on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 30s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    spark-driver-cpu-karpenter = {
-      values = [
-        <<-EOT
-      name: spark-driver-cpu-karpenter
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[3]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodeGroupType: spark-driver-cpu-karpenter
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["m5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"]
-          - key: "kubernetes.io/arch"
-            operator: In
-            values: ["amd64"]
-          - key: "karpenter.sh/capacity-type"
-            operator: In
-            values: ["spot", "on-demand"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 30s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-  }
-
-  #---------------------------------------------------------------
-  # NVIDIA GPU Operator Add-on
-  #---------------------------------------------------------------
-  enable_nvidia_gpu_operator = var.enable_nvidia_gpu_operator
-
-  nvidia_gpu_operator_helm_config = {
-    version = "v23.9.1"
-    values  = [templatefile("${path.module}/helm-values/nvidia-operator-values.yaml", {})]
-  }
-
-  #---------------------------------------------------------------
-  # NVIDIA Device Plugin Add-on
-  #---------------------------------------------------------------
-  # Enable only when NVIDIA GPU Operator is disabled
-  enable_nvidia_device_plugin = !(var.enable_nvidia_gpu_operator)
-
-}
-
-#---------------------------------------------------------------
-# Grafana Admin credentials resources
-#---------------------------------------------------------------
-data "aws_secretsmanager_secret_version" "admin_password_version" {
-  secret_id  = aws_secretsmanager_secret.grafana.id
-  depends_on = [aws_secretsmanager_secret_version.grafana]
-}
-
-resource "random_password" "grafana" {
-  length           = 16
-  special          = true
-  override_special = "@_"
-}
-
-#tfsec:ignore:aws-ssm-secret-use-customer-key
-resource "aws_secretsmanager_secret" "grafana" {
-  name                    = "${local.name}-grafana"
-  recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
-}
-
-resource "aws_secretsmanager_secret_version" "grafana" {
-  secret_id     = aws_secretsmanager_secret.grafana.id
-  secret_string = random_password.grafana.result
-}
-
-#---------------------------------------------------------------
-# S3 bucket for Spark jobs
-#---------------------------------------------------------------
-module "s3_bucket" {
-  source  = "terraform-aws-modules/s3-bucket/aws"
-  version = "~> 3.0"
-
-  bucket_prefix = "${local.name}-spark-"
-
-  # For example only - please evaluate for your environment
-  force_destroy = true
-
-  server_side_encryption_configuration = {
-    rule = {
-      apply_server_side_encryption_by_default = {
-        sse_algorithm = "AES256"
-      }
-    }
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/emr-spark-rapids/amp.tf b/ai-ml/emr-spark-rapids/amp.tf
deleted file mode 100644
index 96df2a495..000000000
--- a/ai-ml/emr-spark-rapids/amp.tf
+++ /dev/null
@@ -1,137 +0,0 @@
-#IAM Policy for Amazon Prometheus & Grafana
-resource "aws_iam_policy" "grafana" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  description = "IAM policy for Grafana Pod"
-  name_prefix = format("%s-%s-", local.name, "grafana")
-  path        = "/"
-  policy      = data.aws_iam_policy_document.grafana[0].json
-}
-
-data "aws_iam_policy_document" "grafana" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  statement {
-    sid       = "AllowReadingMetricsFromCloudWatch"
-    effect    = "Allow"
-    resources = ["*"]
-
-    actions = [
-      "cloudwatch:DescribeAlarmsForMetric",
-      "cloudwatch:ListMetrics",
-      "cloudwatch:GetMetricData",
-      "cloudwatch:GetMetricStatistics"
-    ]
-  }
-
-  statement {
-    sid       = "AllowGetInsightsCloudWatch"
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"]
-
-    actions = [
-      "cloudwatch:GetInsightRuleReport",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingAlarmHistoryFromCloudWatch"
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"]
-
-    actions = [
-      "cloudwatch:DescribeAlarmHistory",
-      "cloudwatch:DescribeAlarms",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingLogsFromCloudWatch"
-    effect    = "Allow"
-    resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"]
-
-    actions = [
-      "logs:DescribeLogGroups",
-      "logs:GetLogGroupFields",
-      "logs:StartQuery",
-      "logs:StopQuery",
-      "logs:GetQueryResults",
-      "logs:GetLogEvents",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingTagsInstancesRegionsFromEC2"
-    effect    = "Allow"
-    resources = ["*"]
-
-    actions = [
-      "ec2:DescribeTags",
-      "ec2:DescribeInstances",
-      "ec2:DescribeRegions",
-    ]
-  }
-
-  statement {
-    sid       = "AllowReadingResourcesForTags"
-    effect    = "Allow"
-    resources = ["*"]
-    actions   = ["tag:GetResources"]
-  }
-
-  statement {
-    sid    = "AllowListApsWorkspaces"
-    effect = "Allow"
-    resources = [
-      "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*",
-      "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*",
-      "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*",
-    ]
-    actions = [
-      "aps:ListWorkspaces",
-      "aps:DescribeWorkspace",
-      "aps:GetMetricMetadata",
-      "aps:GetSeries",
-      "aps:QueryMetrics",
-      "aps:RemoteWrite",
-      "aps:GetLabels"
-    ]
-  }
-}
-
-#------------------------------------------
-# Amazon Prometheus
-#------------------------------------------
-locals {
-  amp_ingest_service_account = "amp-iamproxy-ingest-service-account"
-  amp_namespace              = "kube-prometheus-stack"
-}
-
-resource "aws_prometheus_workspace" "amp" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  alias = format("%s-%s", "amp-ws", local.name)
-  tags  = local.tags
-}
-
-module "amp_ingest_irsa" {
-  count = var.enable_amazon_prometheus ? 1 : 0
-
-  source         = "aws-ia/eks-blueprints-addon/aws"
-  version        = "~> 1.0"
-  create_release = false
-  create_role    = true
-  create_policy  = false
-  role_name      = format("%s-%s", local.name, "amp-ingest")
-  role_policies  = { amp_policy = aws_iam_policy.grafana[0].arn }
-
-  oidc_providers = {
-    this = {
-      provider_arn    = module.eks.oidc_provider_arn
-      namespace       = local.amp_namespace
-      service_account = local.amp_ingest_service_account
-    }
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/emr-spark-rapids/cleanup.sh b/ai-ml/emr-spark-rapids/cleanup.sh
deleted file mode 100755
index 1f084486e..000000000
--- a/ai-ml/emr-spark-rapids/cleanup.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-set -o errexit
-set -o pipefail
-
-read -p "Enter the region: " region
-export AWS_DEFAULT_REGION=$region
-
-targets=(
-  "module.emr_containers"
-  "module.eks_data_addons"
-  "module.eks_blueprints_addons"
-)
-
-#-------------------------------------------
-# Helpful to delete the stuck in "Terminating" namespaces
-# Rerun the cleanup.sh script to detect and delete the stuck resources
-#-------------------------------------------
-terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name')
-
-# If there are no terminating namespaces, exit the script
-if [[ -z $terminating_namespaces ]]; then
-    echo "No terminating namespaces found"
-fi
-
-for ns in $terminating_namespaces; do
-    echo "Terminating namespace: $ns"
-    kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f -
-done
-
-#-------------------------------------------
-# Terraform destroy per module target
-#-------------------------------------------
-for target in "${targets[@]}"
-do
-  terraform destroy -target="$target" -auto-approve
-  destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1)
-  if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-    echo "SUCCESS: Terraform destroy of $target completed successfully"
-  else
-    echo "FAILED: Terraform destroy of $target failed"
-    exit 1
-  fi
-done
-
-#-------------------------------------------
-# Terraform destroy full
-#-------------------------------------------
-terraform destroy -auto-approve
-destroy_output=$(terraform destroy -auto-approve 2>&1)
-if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-  echo "SUCCESS: Terraform destroy of all targets completed successfully"
-else
-  echo "FAILED: Terraform destroy of all targets failed"
-  exit 1
-fi
diff --git a/ai-ml/emr-spark-rapids/eks.tf b/ai-ml/emr-spark-rapids/eks.tf
deleted file mode 100644
index 4d01ae4fe..000000000
--- a/ai-ml/emr-spark-rapids/eks.tf
+++ /dev/null
@@ -1,218 +0,0 @@
-#---------------------------------------------------------------
-# EKS Cluster
-#---------------------------------------------------------------
-
-module "eks" {
-  source  = "terraform-aws-modules/eks/aws"
-  version = "~> 19.21"
-
-  cluster_name    = local.name
-  cluster_version = var.eks_cluster_version
-
-  #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing.
-  cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint.
-
-  vpc_id = module.vpc.vpc_id
-  # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created
-  subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
-
-  manage_aws_auth_configmap = true
-  aws_auth_roles = [
-    {
-      rolearn  = module.eks_blueprints_addons.karpenter.node_iam_role_arn
-      username = "system:node:{{EC2PrivateDNSName}}"
-      groups = [
-        "system:bootstrappers",
-        "system:nodes",
-      ]
-    },
-    {
-      # Required for EMR on EKS virtual cluster
-      rolearn  = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/AWSServiceRoleForAmazonEMRContainers"
-      username = "emr-containers"
-      groups   = []
-    },
-  ]
-
-  #---------------------------------------
-  # Note: This can further restricted to specific required for each Add-on and your application
-  #---------------------------------------
-  # Extend cluster security group rules
-  cluster_security_group_additional_rules = {
-    ingress_nodes_ephemeral_ports_tcp = {
-      description                = "Nodes on ephemeral ports"
-      protocol                   = "tcp"
-      from_port                  = 1025
-      to_port                    = 65535
-      type                       = "ingress"
-      source_node_security_group = true
-    }
-  }
-
-  # Extend node-to-node security group rules
-  node_security_group_additional_rules = {
-    ingress_self_all = {
-      description = "Node to node all ports/protocols"
-      protocol    = "-1"
-      from_port   = 0
-      to_port     = 0
-      type        = "ingress"
-      self        = true
-    }
-    # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
-    # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
-    # Change this according to your security requirements if needed
-    ingress_cluster_to_node_all_traffic = {
-      description                   = "Cluster API to Nodegroup all traffic"
-      protocol                      = "-1"
-      from_port                     = 0
-      to_port                       = 0
-      type                          = "ingress"
-      source_cluster_security_group = true
-    }
-  }
-
-  eks_managed_node_group_defaults = {
-    iam_role_additional_policies = {
-      # Not required, but used in the example to access the nodes to inspect mounted volumes
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-
-  eks_managed_node_groups = {
-    #  We recommend to have a MNG to place your critical workloads and add-ons
-    #  Then rely on Karpenter to scale your workloads
-    #  You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners
-    core_node_group = {
-      name        = "core-node-group"
-      description = "EKS managed node group example launch template"
-      # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
-
-      min_size     = 3
-      max_size     = 9
-      desired_size = 3
-
-      ami_type       = "AL2_x86_64"
-      instance_types = ["m5.xlarge"]
-
-      ebs_optimized = true
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 100
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      labels = {
-        WorkerType                       = "ON_DEMAND"
-        NodeGroupType                    = "core"
-        "nvidia.com/gpu.deploy.operands" = false
-      }
-
-      tags = {
-        Name                     = "core-node-grp",
-        "karpenter.sh/discovery" = local.name
-      }
-    }
-
-    spark_driver_ng = {
-      name        = "spark-driver-ng"
-      description = "Spark managed node group for Driver pods"
-      # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)]
-
-      ami_type = "AL2_x86_64"
-
-      min_size     = 1
-      max_size     = 8
-      desired_size = 1
-
-      force_update_version = true
-      instance_types       = ["m5.xlarge"] # 4 vCPU and 16GB
-
-      ebs_optimized = true
-
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 100
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      labels = {
-        WorkerType                       = "ON_DEMAND"
-        NodeGroupType                    = "spark-driver-cpu-ca"
-        "nvidia.com/gpu.deploy.operands" = false
-      }
-
-      tags = {
-        Name = "spark-driver-ca"
-      }
-    }
-    spark_gpu_ng = {
-      name        = "spark-gpu-ng"
-      description = "Spark managed GPU node group for executor pods with launch template"
-      # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)]
-
-      ami_type = "AL2_x86_64_GPU"
-
-      # NVMe instance store volumes are automatically enumerated and assigned a device
-      pre_bootstrap_user_data = <<-EOT
-        cat <<-EOF > /etc/profile.d/bootstrap.sh
-        #!/bin/sh
-
-        # Configure NVMe volumes in RAID0 configuration
-        # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126
-        # Mount will be: /mnt/k8s-disks
-        export LOCAL_DISKS='raid0'
-
-        # Source extra environment variables in bootstrap script
-        sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh
-      EOT
-
-      # Change min_size, max_size and desired_size to 8 before running xgboost example
-      min_size     = 0
-      max_size     = 8
-      desired_size = 0
-
-      capacity_type  = "ON_DEMAND"
-      instance_types = ["g5.2xlarge"]
-
-      ebs_optimized = true
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 100
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      labels = {
-        WorkerType    = "ON_DEMAND"
-        NodeGroupType = "spark-executor-gpu-ca"
-      }
-
-      taints = [{
-        key    = "nvidia.com/gpu",
-        value  = "EXISTS",
-        effect = "NO_SCHEDULE"
-      }]
-
-      tags = {
-        Name = "spark-gpu",
-      }
-    }
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/emr-spark-rapids/emr-eks.tf b/ai-ml/emr-spark-rapids/emr-eks.tf
deleted file mode 100644
index aa1ec0282..000000000
--- a/ai-ml/emr-spark-rapids/emr-eks.tf
+++ /dev/null
@@ -1,22 +0,0 @@
-module "emr_containers" {
-  source  = "terraform-aws-modules/emr/aws//modules/virtual-cluster"
-  version = "~> 1.0"
-
-  for_each = toset(["ml-team-a", "ml-team-b"])
-
-  eks_cluster_id    = module.eks.cluster_name
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  name      = "${module.eks.cluster_name}-emr-${each.value}"
-  namespace = "emr-${each.value}"
-
-  role_name                = "${module.eks.cluster_name}-emr-${each.value}"
-  iam_role_use_name_prefix = false
-  iam_role_description     = "EMR Execution Role for emr-${each.value}"
-  # NOTE: S3 full access added only for testing purpose. You should modify this policy to restrict access to S3 buckets
-  iam_role_additional_policies = ["arn:aws:iam::aws:policy/AmazonS3FullAccess"]
-
-  cloudwatch_log_group_name = "/emr-on-eks-logs/${module.eks.cluster_name}/emr-${each.value}/"
-
-  tags = merge(local.tags, { Name = "emr-${each.value}" })
-}
diff --git a/ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml
deleted file mode 100644
index ae3c41d44..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-resources:
-  limits:
-    cpu: 500m
-    memory: 2Gi
-  requests:
-    cpu: 200m
-    memory: 1Gi
-
-# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints.
-tolerations:
-  - operator: Exists
diff --git a/ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml b/ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml
deleted file mode 100755
index 0bea5188d..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml
+++ /dev/null
@@ -1,102 +0,0 @@
-global:
-
-#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server
-# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata
-hostNetwork: true
-dnsPolicy: ClusterFirstWithHostNet
-
-service:
-  parsersFiles:
-    - /fluent-bit/parsers/parsers.conf
-  extraParsers: |
-    [PARSER]
-        Name    kubernetes
-        Format  regex
-        Regex   ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$
-
-input:
-  name: "tail"
-  enabled: true
-  tag: "systempods.<namespace_name>.<container_name>.<pod_name>.<docker_id>-"
-  path: "/var/log/containers/*.log"
-  db: "/var/log/flb_kube.db"
-  memBufLimit: 5MB
-  skipLongLines: "On"
-  refreshInterval: 10
-  extraInputs: |
-    multiline.parser  docker, cri
-    Tag_Regex         (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
-
-
-# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters
-filter:
-  name: "kubernetes"
-  match: "systempods.*"
-  kubeURL: "https://kubernetes.default.svc.cluster.local:443"
-  mergeLog: "On"
-  mergeLogKey: "log_processed"
-  keepLog: "On"
-  k8sLoggingParser: "On"
-  k8sLoggingExclude: "Off"
-  bufferSize: "0"
-  extraFilters: |
-    Kube_Tag_Prefix     systempods.
-    Regex_Parser        kubernetes
-    Labels              On
-    Annotations         Off
-    Use_Kubelet         true
-    Kubelet_Port        10250
-    Kube_CA_File        /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-    Kube_Token_File     /var/run/secrets/kubernetes.io/serviceaccount/token
-
-# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance.
-# cloudWatch:
-#   enabled: false
-
-# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
-cloudWatchLogs:
-  enabled: true
-  match: "systempods.*"
-  region: ${region}
-  logGroupName: ${cloudwatch_log_group}
-  autoCreateGroup: false
-  extraOutputs: |
-    log_key               log
-
-#----------------------------------------------------------#
-# OUTPUT logs to S3
-#----------------------------------------------------------#
-
-# This is an example for writing logs to S3 bucket.
-# This example writes system pod logs and spark logs into dedicated prefix.
-# This second output is using the rewrite_tag filter commented above
-
-additionalOutputs: |
-  [OUTPUT]
-      Name                            s3
-      Match                           systempods.*
-      region                          ${region}
-      bucket                          ${s3_bucket_name}
-      total_file_size                 100M
-      s3_key_format                   /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log
-      s3_key_format_tag_delimiters    ..
-      store_dir                       /home/ec2-user/buffer
-      upload_timeout                  10m
-      log_key                         log
-
-
-# Resource config for large clusters
-resources:
-  limits:
-    cpu: 1000m
-    memory: 1500Mi
-  requests:
-    cpu: 500m
-    memory: 500Mi
-
-## Assign a PriorityClassName to pods if set
-priorityClassName: system-node-critical
-
-# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints.
-tolerations:
-  - operator: Exists
diff --git a/ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml b/ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml
deleted file mode 100644
index 5a42794f2..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-autoDiscovery:
-  clusterName: ${eks_cluster_id}
-
-awsRegion: ${aws_region}
-
-cloudProvider: aws
-
-extraArgs:
-  aws-use-static-instance-list: true
-
-# Best practice to update the resource requests and limits for each add-on
-resources:
-   limits:
-     cpu: 1000m
-     memory: 1G
-   requests:
-     cpu: 200m
-     memory: 512Mi
-
-# Best practice to updateStrategy for each add-on
-updateStrategy:
-  type: RollingUpdate
-  rollingUpdate:
-    maxSurge: 0
-    maxUnavailable: 1
diff --git a/ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml b/ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml
deleted file mode 100644
index 64cb540bf..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-nameOverride: kube-dns-autoscaler
-
-# Formula for controlling the replicas. Adjust according to your needs
-#  replicas = max( ceil( cores * 1/coresPerReplica ) , ceil( nodes * 1/nodesPerReplica ) )
-#  replicas = min(replicas, max)
-#  replicas = max(replicas, min)
-config:
-  linear:
-    coresPerReplica: 256
-    nodesPerReplica: 16
-    min: 1
-    max: 100
-    preventSinglePointFailure: true
-    includeUnschedulableNodes: true
-
-# Target to scale. In format: deployment/*, replicationcontroller/* or replicaset/* (not case sensitive).
-options:
-  target: ${target}
-
-serviceAccount:
-  create: true
-  name: kube-dns-autoscaler
-
-podSecurityContext:
-  seccompProfile:
-    type: RuntimeDefault
-  supplementalGroups: [ 65534 ]
-  fsGroup: 65534
-
-resources:
-  limits:
-    cpu: 100m
-    memory: 128Mi
-  requests:
-    cpu: 100m
-    memory: 128Mi
-
-tolerations:
-  - key: "CriticalAddonsOnly"
-    operator: "Exists"
diff --git a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml
deleted file mode 100644
index 078f33318..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-prometheus:
-  serviceAccount:
-    create: true
-    name: ${amp_sa}
-    annotations:
-      eks.amazonaws.com/role-arn: ${amp_irsa}
-  prometheusSpec:
-    remoteWrite:
-      - url: ${amp_remotewrite_url}
-        sigv4:
-          region: ${region}
-        queueConfig:
-          maxSamplesPerSend: 1000
-          maxShards: 200
-          capacity: 2500
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: gp2
-          accessModes:
-            - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-    # Scrape Cost metrics for Kubecost add-on
-    # additionalScrapeConfigs:
-    #   - job_name: kubecost
-    #     honor_labels: true
-    #     scrape_interval: 1m
-    #     scrape_timeout: 10s
-    #     metrics_path: /metrics
-    #     scheme: http
-    #     dns_sd_configs:
-    #       - names:
-    #           - kubecost-cost-analyzer.kubecost.svc
-    #         type: 'A'
-    #         port: 9003
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
-# Adding AMP datasource to Grafana config
-  serviceAccount:
-    create: false
-    name: ${amp_sa}
-  grafana.ini:
-    auth:
-      sigv4_auth_enabled: true
-  additionalDataSources:
-    - name: AMP
-      editable: true
-      jsonData:
-        sigV4Auth: true
-        sigV4Region: ${region}
-      type: prometheus
-      isDefault: false
-      url: ${amp_url}
diff --git a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml b/ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml
deleted file mode 100644
index 54c1f690f..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-prometheus:
-  prometheusSpec:
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: gp2
-          accessModes:
-            - ReadWriteOnce
-          resources:
-            requests:
-              storage: 50Gi
-    # Scrape Cost metrics for Kubecost add-on
-    # additionalScrapeConfigs:
-    #   - job_name: kubecost
-    #     honor_labels: true
-    #     scrape_interval: 1m
-    #     scrape_timeout: 10s
-    #     metrics_path: /metrics
-    #     scheme: http
-    #     dns_sd_configs:
-    #       - names:
-    #           - kubecost-cost-analyzer.kubecost.svc
-    #         type: 'A'
-    #         port: 9003
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
diff --git a/ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml b/ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml
deleted file mode 100644
index f781ec5ce..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml
+++ /dev/null
@@ -1,62 +0,0 @@
-
-# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090
-
-global:
-  # pricingCsv:
-  #   enabled: false
-  #   location:
-  #     provider: "AWS"
-  #     region: "us-east-1"
-  #     URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI
-  #     csvAccessCredentials: pricing-schema-access-secret
-
-  # This Prometheus setup is reusing the existing Prometheus deployment
-  # Check for more docs under https://guide.kubecost.com/hc/en-us/articles/4407595941015
-  prometheus:
-    fqdn: http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc:9090
-    enabled: false
-
-# If you have node-exporter and/or KSM running on your cluster, follow this step to disable the Kubecost included versions.
-prometheus:
-  nodeExporter:
-    enabled: false
-  serviceAccounts:
-    nodeExporter:
-      create: false
-  kubeStateMetrics:
-    enabled: false
-
-#imageVersion: prod-1.96.0 # commented to use the latest
-
-kubecostFrontend:
-  image: public.ecr.aws/kubecost/frontend
-  resources:
-    requests:
-      cpu: "200m"
-      memory: "512Mi"
-
-kubecostMetrics:
-  emitPodAnnotations: true
-  emitNamespaceAnnotations: true
-
-kubecostModel:
-  image: public.ecr.aws/kubecost/cost-model
-  resources:
-    requests:
-      cpu: "500m"
-      memory: "512Mi"
-
-# Set this to false if you're bringing your own service account.
-#serviceAccount:
-#  create: false
-#  name: kubecost-cost-analyzer
-#  annotations:
-#    eks.amazonaws.com/role-arn: <iam-role-arn>
-
-# Define persistence volume for cost-analyzer
-persistentVolume:
-  size: 32Gi
-  dbSize: 32.0Gi
-  enabled: true # Note that setting this to false means configurations will be wiped out on pod restart.
-  storageClass: gp2
-  # existingClaim: kubecost-cost-analyzer # a claim in the same namespace as kubecost
diff --git a/ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml b/ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml
deleted file mode 100644
index 026d97a6a..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# HA config for metrics-server
-image:
-  repository: registry.k8s.io/metrics-server/metrics-server
-  pullPolicy: IfNotPresent
-
-serviceAccount:
-  create: true
-  name: metrics-server
-
-rbac:
-  create: true
-  pspEnabled: false
-
-apiService:
-  create: true
-
-podLabels:
-  k8s-app: metrics-server
-
-# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true
-replicas: 2
-
-updateStrategy:
-   type: RollingUpdate
-   rollingUpdate:
-     maxSurge: 0
-     maxUnavailable: 1
-
-podDisruptionBudget:
-  enabled: true
-  minAvailable: 1
-
-defaultArgs:
-  - --cert-dir=/tmp
-  - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
-  - --kubelet-use-node-status-port
-  - --metric-resolution=15s
-
-resources:
-  requests:
-    cpu: 200m
-    memory: 512Mi
-
-affinity:
-  podAntiAffinity:
-    requiredDuringSchedulingIgnoredDuringExecution:
-      - labelSelector:
-          matchLabels:
-            k8s-app: metrics-server
-        namespaces:
-          - kube-system
-        topologyKey: kubernetes.io/hostname
diff --git a/ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml b/ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml
deleted file mode 100644
index 532ee11ff..000000000
--- a/ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml
+++ /dev/null
@@ -1,96 +0,0 @@
-# Default values for gpu-operator.
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-daemonsets:
-  labels: {}
-  annotations: {}
-  priorityClassName: system-node-critical
-  tolerations:
-    - key: nvidia.com/gpu
-      operator: Exists
-      effect: NoSchedule
-    - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes
-
-operator:
-  repository: nvcr.io/nvidia
-  priorityClassName: system-node-critical
-  defaultRuntime: containerd
-  image: gpu-operator
-  cleanupCRD: false # This option doesn't do anything even if you change this to true. NVIDIA recommends to use the manual approach of upgrading the CRDs
-  # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag
-  # to be passed during helm upgrade.
-  upgradeCRD: false
-  resources:
-    limits:
-      cpu: 500m
-      memory: 350Mi
-    requests:
-      cpu: 200m
-      memory: 100Mi
-
-# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/install-precompiled-signed-drivers.html
-# Currently NVIDIA Operator takes more than 5 mins to make the node GPU ready with all the required drivers.
-# With pre-compiled NVIDIA Drivers this process can be faster hence we are using the config values as driver.version: "515-signed"
-
-driver:
-  enabled: false  # Disabled as we are using latest EKS AMI 1.29 which comes with NVIDIA drivers pre-installed
-  # repository: nvcr.io/nvidia
-  # image: driver
-  # # Commented this as latest Ubuntu AMIs are failing with this option enabled
-  # # version: "515-signed" # supported DRIVER_BRANCH value currently are 470, 510 and 515 which will install latest drivers available on that branch for current running kernel version.
-  # manager:
-  #   image: k8s-driver-manager
-  #   repository: nvcr.io/nvidia/cloud-native
-
-# to ensure containers can properly access GPUs
-toolkit:
-  enabled: true
-
-# to discover and advertise GPU resources to kubelet
-devicePlugin:
-  enabled: true
-
-dcgm:
-  enabled: false
-
-# to monitor the GPU(s) on the node
-dcgmExporter:
-  enabled: true
-
-gfd:
-  enabled: true
-
-migManager:
-  enabled: false
-
-nodeStatusExporter:
-  enabled: false
-
-gds:
-  enabled: false
-
-vgpuManager:
-  enabled: false
-
-vgpuDeviceManager:
-  enabled: false
-
-vfioManager:
-  enabled: false
-
-sandboxDevicePlugin:
-  enabled: false
-
-node-feature-discovery:
-  enableNodeFeatureApi: true
-  worker:
-    tolerations:
-      - key: "node-role.kubernetes.io/master"
-        operator: "Equal"
-        value: ""
-        effect: "NoSchedule"
-      - key: nvidia.com/gpu
-        operator: Exists
-        effect: NoSchedule
-      - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes
diff --git a/ai-ml/emr-spark-rapids/main.tf b/ai-ml/emr-spark-rapids/main.tf
deleted file mode 100644
index 809cc6343..000000000
--- a/ai-ml/emr-spark-rapids/main.tf
+++ /dev/null
@@ -1,61 +0,0 @@
-provider "aws" {
-  region = local.region
-}
-
-# ECR always authenticates with `us-east-1` region
-# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
-provider "aws" {
-  alias  = "ecr"
-  region = "us-east-1"
-}
-
-provider "kubernetes" {
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = module.eks.cluster_endpoint
-    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-    token                  = data.aws_eks_cluster_auth.this.token
-  }
-}
-
-provider "kubectl" {
-  apply_retry_count      = 30
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  load_config_file       = false
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-data "aws_eks_cluster_auth" "this" {
-  name = module.eks.cluster_name
-}
-
-data "aws_ecrpublic_authorization_token" "token" {
-  provider = aws.ecr
-}
-
-data "aws_availability_zones" "available" {}
-
-data "aws_caller_identity" "current" {}
-data "aws_partition" "current" {}
-
-locals {
-  name   = var.name
-  region = var.region
-
-  # Only two AZs for this example
-  azs = slice(data.aws_availability_zones.available.names, 0, 2)
-
-  account_id = data.aws_caller_identity.current.account_id
-  partition  = data.aws_partition.current.partition
-
-  tags = merge(var.tags, {
-    Blueprint  = local.name
-    GithubRepo = "github.com/awslabs/data-on-eks"
-  })
-}
diff --git a/ai-ml/emr-spark-rapids/outputs.tf b/ai-ml/emr-spark-rapids/outputs.tf
deleted file mode 100644
index 8645d7977..000000000
--- a/ai-ml/emr-spark-rapids/outputs.tf
+++ /dev/null
@@ -1,51 +0,0 @@
-################################################################################
-# Cluster
-################################################################################
-
-output "cluster_arn" {
-  description = "The Amazon Resource Name (ARN) of the cluster"
-  value       = module.eks.cluster_arn
-}
-
-output "cluster_name" {
-  description = "The Amazon Resource Name (ARN) of the cluster"
-  value       = module.eks.cluster_id
-}
-
-output "oidc_provider_arn" {
-  description = "The ARN of the OIDC Provider if `enable_irsa = true`"
-  value       = module.eks.oidc_provider_arn
-}
-
-################################################################################
-# EKS Managed Node Group
-################################################################################
-
-output "configure_kubectl" {
-  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
-  value       = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}"
-}
-
-output "emr_on_eks" {
-  description = "EMR on EKS"
-  value       = module.emr_containers
-}
-
-################################################################################
-# AMP
-################################################################################
-
-output "amp_workspace_id" {
-  description = "The id of amp"
-  value       = aws_prometheus_workspace.amp[0].id
-}
-
-output "grafana_secret_name" {
-  description = "Grafana password secret name"
-  value       = aws_secretsmanager_secret.grafana.name
-}
-
-output "s3_bucket_id" {
-  description = "S3 bucket for Spark input and output data"
-  value       = module.s3_bucket.s3_bucket_id
-}
diff --git a/ai-ml/emr-spark-rapids/providers.tf b/ai-ml/emr-spark-rapids/providers.tf
deleted file mode 100644
index e69de29bb..000000000
diff --git a/ai-ml/emr-spark-rapids/versions.tf b/ai-ml/emr-spark-rapids/versions.tf
deleted file mode 100644
index 182b26581..000000000
--- a/ai-ml/emr-spark-rapids/versions.tf
+++ /dev/null
@@ -1,33 +0,0 @@
-terraform {
-  required_version = ">= 1.0"
-
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = ">= 4.47"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = ">= 2.10"
-    }
-    helm = {
-      source  = "hashicorp/helm"
-      version = ">= 2.4"
-    }
-    kubectl = {
-      source  = "gavinbunney/kubectl"
-      version = ">= 1.14"
-    }
-    random = {
-      source  = "hashicorp/random"
-      version = ">= 3.3"
-    }
-  }
-
-  # ##  Used for end-to-end testing on project; update to suit your needs
-  # backend "s3" {
-  #   bucket = "doeks-github-actions-e2e-test-state"
-  #   region = "us-west-2"
-  #   key    = "e2e/emr-spark-rapids/terraform.tfstate"
-  # }
-}
diff --git a/ai-ml/emr-spark-rapids/vpc.tf b/ai-ml/emr-spark-rapids/vpc.tf
deleted file mode 100644
index e7e6473ee..000000000
--- a/ai-ml/emr-spark-rapids/vpc.tf
+++ /dev/null
@@ -1,50 +0,0 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
-#---------------------------------------------------------------
-# VPC
-#---------------------------------------------------------------
-module "vpc" {
-  source  = "terraform-aws-modules/vpc/aws"
-  version = "~> 5.0"
-
-  name = local.name
-  cidr = var.vpc_cidr
-  azs  = local.azs
-
-  # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
-  secondary_cidr_blocks = var.secondary_cidr_blocks
-
-  # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc.
-  private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
-
-  # ------------------------------
-  # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
-  # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
-  public_subnets     = local.public_subnets
-  enable_nat_gateway = true
-  single_nat_gateway = true
-  #-------------------------------
-
-  public_subnet_tags = {
-    "kubernetes.io/role/elb" = 1
-  }
-
-  private_subnet_tags = {
-    "kubernetes.io/role/internal-elb" = 1
-    # Tags subnets for Karpenter auto-discovery
-    "karpenter.sh/discovery" = local.name
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml b/ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml
new file mode 100644
index 000000000..9217d4437
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml
@@ -0,0 +1,15 @@
+kind: Service
+apiVersion: v1
+metadata:
+  name: "dcgm-exporter"
+  namespace: nvidia-device-plugin
+  labels:
+    app.kubernetes.io/name: "dcgm-exporter"
+    app.kubernetes.io/version: "3.6.1"
+spec:
+  selector:
+    app.kubernetes.io/name: "dcgm-exporter"
+    app.kubernetes.io/version: "3.6.1"
+  ports:
+    - name: "metrics"
+      port: 9400
diff --git a/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
index c3ffe67d3..0fd459865 100644
--- a/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
+++ b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
@@ -16,7 +16,7 @@ apiVersion: apps/v1
 kind: DaemonSet
 metadata:
   name: "dcgm-exporter"
-  namespace: kube-system
+  namespace: nvidia-device-plugin
   labels:
     app.kubernetes.io/name: "dcgm-exporter"
     app.kubernetes.io/version: "3.6.1"
@@ -64,19 +64,3 @@ spec:
         - key: nvidia.com/gpu
           operator: Exists
           effect: NoSchedule
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: "dcgm-exporter"
-  namespace: kube-system
-  labels:
-    app.kubernetes.io/name: "dcgm-exporter"
-    app.kubernetes.io/version: "3.6.1"
-spec:
-  selector:
-    app.kubernetes.io/name: "dcgm-exporter"
-    app.kubernetes.io/version: "3.6.1"
-  ports:
-    - name: "metrics"
-      port: 9400

From b79fd2c10bd517bb0375375cc3ac85e80a75fab6 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Tue, 28 Jan 2025 20:56:19 -0800
Subject: [PATCH 07/16] initial move from genai to ai

---
 .../excalidraw/llama2-raytrain.excalidraw           |   0
 website/docs/{gen-ai => ai}/excalidraw/llama3.svg   |   0
 .../excalidraw/nvidia-triton-vllm.excalidraw        |   0
 .../excalidraw/stable-diffusion-inf2.excalidraw     |   0
 website/docs/{gen-ai => ai}/index.md                |   6 +++---
 .../inference/GPUs/nvidia-nim-llama3.md             |   0
 .../inference/GPUs/stablediffusion-gpus.md          |   0
 .../inference/GPUs/vLLM-NVIDIATritonServer.md       |   0
 .../{gen-ai => ai}/inference/GPUs/vLLM-rayserve.md  |   0
 .../inference/Neuron/Mistral-7b-inf2.md             |   0
 .../{gen-ai => ai}/inference/Neuron/llama2-inf2.md  |   0
 .../{gen-ai => ai}/inference/Neuron/llama3-inf2.md  |   0
 .../{gen-ai => ai}/inference/Neuron/rayserve-ha.md  |   0
 .../inference/Neuron/stablediffusion-inf2.md        |   0
 .../inference/Neuron/vllm-ray-inf2.md               |   0
 .../docs/{gen-ai => ai}/inference/_category_.json   |   0
 .../{gen-ai => ai}/inference/img/answer-1-contd.png | Bin
 .../docs/{gen-ai => ai}/inference/img/answer-1.png  | Bin
 .../inference/img/excali-draw-sdxl-inf2.png         | Bin
 .../{gen-ai => ai}/inference/img/gradio-app-gpu.png | Bin
 .../inference/img/gradio-llama-ai-chat.png          | Bin
 .../inference/img/gradio-llama2-13b-chat.png        | Bin
 .../{gen-ai => ai}/inference/img/gradio-test-ft.png | Bin
 .../inference/img/head-pod-deleted.png              | Bin
 .../inference/img/llama-2-chat-ouput.png            | Bin
 .../inference/img/llama2-13b-response.png           | Bin
 .../{gen-ai => ai}/inference/img/llama2-inf2.png    | Bin
 .../docs/{gen-ai => ai}/inference/img/llama3.png    | Bin
 .../{gen-ai => ai}/inference/img/llma27b-hg.png     | Bin
 .../{gen-ai => ai}/inference/img/mistral-conv-1.png | Bin
 .../{gen-ai => ai}/inference/img/mistral-conv-2.png | Bin
 .../{gen-ai => ai}/inference/img/mistral-gradio.png | Bin
 .../inference/img/mistral-sample-prompt-1.png       | Bin
 .../{gen-ai => ai}/inference/img/mistral7b-hg.png   | Bin
 .../inference/img/neuron-monitor-cwci.png           | Bin
 .../inference/img/nim-dashboard-2.png               | Bin
 .../{gen-ai => ai}/inference/img/nim-dashboard.png  | Bin
 .../inference/img/nim-ngc-api-key.png               | Bin
 .../inference/img/nim-on-eks-arch.png               | Bin
 .../inference/img/openweb-ui-nim-1.png              | Bin
 .../inference/img/openweb-ui-nim-2.png              | Bin
 .../inference/img/openweb-ui-ray-vllm-inf2-1.png    | Bin
 .../inference/img/openweb-ui-ray-vllm-inf2-2.png    | Bin
 .../img/ray-dashboard-deployed-mistral-inf2.png     | Bin
 .../img/ray-dashboard-deploying-mistral-inf2.png    | Bin
 .../inference/img/ray-dashboard-sdxl.png            | Bin
 .../img/ray-dashboard-vllm-llama3-inf2.png          | Bin
 .../inference/img/ray-dashboard-vllm-mistral.png    | Bin
 .../{gen-ai => ai}/inference/img/ray-dashboard.png  | Bin
 .../inference/img/ray-deplo-logs-vllm-mistral.png   | Bin
 .../inference/img/ray-grafana-dashboard.png         | Bin
 .../{gen-ai => ai}/inference/img/ray-head-ha-1.png  | Bin
 .../{gen-ai => ai}/inference/img/ray-head-ha-2.png  | Bin
 .../inference/img/ray-logs-vllm-llama3-inf2.png     | Bin
 .../{gen-ai => ai}/inference/img/ray-prometheus.png | Bin
 .../inference/img/ray-serve-gpu-sd-cluster.png      | Bin
 .../inference/img/ray-serve-gpu-sd.png              | Bin
 .../img/ray-serve-inf2-mistral-cluster.png          | Bin
 .../{gen-ai => ai}/inference/img/ray-vllm-inf2.png  | Bin
 .../inference/img/rayserve-llama2-13b-dashboard.png | Bin
 .../inference/img/stable-diffusion-xl-gradio.png    | Bin
 .../inference/img/stable-diffusion-xl-prompt_3.png  | Bin
 .../inference/img/triton-architecture.png           | Bin
 .../inference/img/triton-grafana-dash2.png          | Bin
 .../inference/img/triton-internals.png              | Bin
 .../inference/img/triton-observability.png          | Bin
 .../inference/img/worker-pod-running.png            | Bin
 website/docs/{gen-ai => ai}/ml-platforms-eks.png    | Bin
 .../docs/{gen-ai => ai}/training/GPUs/bionemo.md    |   0
 .../{gen-ai => ai}/training/Neuron/BERT-Large.md    |   0
 .../docs/{gen-ai => ai}/training/Neuron/Llama2.md   |   0
 .../training/Neuron/RayTrain-Llama2.md              |   0
 .../docs/{gen-ai => ai}/training/_category_.json    |   0
 .../training/img/Llama2-RayTrain-Trn1.png           | Bin
 .../{gen-ai => ai}/training/img/llama2-trainium.png | Bin
 .../training/img/raytrain-precomplilation1.png      | Bin
 .../training/img/raytrain-precomplilation2.png      | Bin
 .../training/img/raytrain-precomplilation3.png      | Bin
 .../training/img/raytrain-testdata-lens.png         | Bin
 .../training/img/raytrain-testdata-raydash.png      | Bin
 .../training/img/raytrain-testdata-raydash1.png     | Bin
 .../training/img/raytrain-testdata-raydash2.png     | Bin
 .../training/img/raytrain-testdata-raydash3.png     | Bin
 .../training/img/raytrain-training-progress1.png    | Bin
 .../training/img/raytrain-training-progress2.png    | Bin
 .../training/img/raytrain-training-progress3.png    | Bin
 website/docs/blueprints/ai-ml/index.md              |  11 ++---------
 website/docusaurus.config.js                        |   2 +-
 website/sidebars.js                                 |   2 +-
 website/src/pages/index.js                          |   4 ++--
 90 files changed, 9 insertions(+), 16 deletions(-)
 rename website/docs/{gen-ai => ai}/excalidraw/llama2-raytrain.excalidraw (100%)
 rename website/docs/{gen-ai => ai}/excalidraw/llama3.svg (100%)
 rename website/docs/{gen-ai => ai}/excalidraw/nvidia-triton-vllm.excalidraw (100%)
 rename website/docs/{gen-ai => ai}/excalidraw/stable-diffusion-inf2.excalidraw (100%)
 rename website/docs/{gen-ai => ai}/index.md (81%)
 rename website/docs/{gen-ai => ai}/inference/GPUs/nvidia-nim-llama3.md (100%)
 rename website/docs/{gen-ai => ai}/inference/GPUs/stablediffusion-gpus.md (100%)
 rename website/docs/{gen-ai => ai}/inference/GPUs/vLLM-NVIDIATritonServer.md (100%)
 rename website/docs/{gen-ai => ai}/inference/GPUs/vLLM-rayserve.md (100%)
 rename website/docs/{gen-ai => ai}/inference/Neuron/Mistral-7b-inf2.md (100%)
 rename website/docs/{gen-ai => ai}/inference/Neuron/llama2-inf2.md (100%)
 rename website/docs/{gen-ai => ai}/inference/Neuron/llama3-inf2.md (100%)
 rename website/docs/{gen-ai => ai}/inference/Neuron/rayserve-ha.md (100%)
 rename website/docs/{gen-ai => ai}/inference/Neuron/stablediffusion-inf2.md (100%)
 rename website/docs/{gen-ai => ai}/inference/Neuron/vllm-ray-inf2.md (100%)
 rename website/docs/{gen-ai => ai}/inference/_category_.json (100%)
 rename website/docs/{gen-ai => ai}/inference/img/answer-1-contd.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/answer-1.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/excali-draw-sdxl-inf2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/gradio-app-gpu.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/gradio-llama-ai-chat.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/gradio-llama2-13b-chat.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/gradio-test-ft.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/head-pod-deleted.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/llama-2-chat-ouput.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/llama2-13b-response.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/llama2-inf2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/llama3.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/llma27b-hg.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/mistral-conv-1.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/mistral-conv-2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/mistral-gradio.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/mistral-sample-prompt-1.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/mistral7b-hg.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/neuron-monitor-cwci.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/nim-dashboard-2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/nim-dashboard.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/nim-ngc-api-key.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/nim-on-eks-arch.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-nim-1.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-nim-2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-ray-vllm-inf2-1.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-ray-vllm-inf2-2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-deployed-mistral-inf2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-deploying-mistral-inf2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-sdxl.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-vllm-llama3-inf2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-vllm-mistral.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-deplo-logs-vllm-mistral.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-grafana-dashboard.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-head-ha-1.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-head-ha-2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-logs-vllm-llama3-inf2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-prometheus.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-serve-gpu-sd-cluster.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-serve-gpu-sd.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-serve-inf2-mistral-cluster.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/ray-vllm-inf2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/rayserve-llama2-13b-dashboard.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/stable-diffusion-xl-gradio.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/stable-diffusion-xl-prompt_3.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/triton-architecture.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/triton-grafana-dash2.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/triton-internals.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/triton-observability.png (100%)
 rename website/docs/{gen-ai => ai}/inference/img/worker-pod-running.png (100%)
 rename website/docs/{gen-ai => ai}/ml-platforms-eks.png (100%)
 rename website/docs/{gen-ai => ai}/training/GPUs/bionemo.md (100%)
 rename website/docs/{gen-ai => ai}/training/Neuron/BERT-Large.md (100%)
 rename website/docs/{gen-ai => ai}/training/Neuron/Llama2.md (100%)
 rename website/docs/{gen-ai => ai}/training/Neuron/RayTrain-Llama2.md (100%)
 rename website/docs/{gen-ai => ai}/training/_category_.json (100%)
 rename website/docs/{gen-ai => ai}/training/img/Llama2-RayTrain-Trn1.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/llama2-trainium.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-precomplilation1.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-precomplilation2.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-precomplilation3.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-lens.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash1.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash2.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash3.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-training-progress1.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-training-progress2.png (100%)
 rename website/docs/{gen-ai => ai}/training/img/raytrain-training-progress3.png (100%)

diff --git a/website/docs/gen-ai/excalidraw/llama2-raytrain.excalidraw b/website/docs/ai/excalidraw/llama2-raytrain.excalidraw
similarity index 100%
rename from website/docs/gen-ai/excalidraw/llama2-raytrain.excalidraw
rename to website/docs/ai/excalidraw/llama2-raytrain.excalidraw
diff --git a/website/docs/gen-ai/excalidraw/llama3.svg b/website/docs/ai/excalidraw/llama3.svg
similarity index 100%
rename from website/docs/gen-ai/excalidraw/llama3.svg
rename to website/docs/ai/excalidraw/llama3.svg
diff --git a/website/docs/gen-ai/excalidraw/nvidia-triton-vllm.excalidraw b/website/docs/ai/excalidraw/nvidia-triton-vllm.excalidraw
similarity index 100%
rename from website/docs/gen-ai/excalidraw/nvidia-triton-vllm.excalidraw
rename to website/docs/ai/excalidraw/nvidia-triton-vllm.excalidraw
diff --git a/website/docs/gen-ai/excalidraw/stable-diffusion-inf2.excalidraw b/website/docs/ai/excalidraw/stable-diffusion-inf2.excalidraw
similarity index 100%
rename from website/docs/gen-ai/excalidraw/stable-diffusion-inf2.excalidraw
rename to website/docs/ai/excalidraw/stable-diffusion-inf2.excalidraw
diff --git a/website/docs/gen-ai/index.md b/website/docs/ai/index.md
similarity index 81%
rename from website/docs/gen-ai/index.md
rename to website/docs/ai/index.md
index 7b531f2f9..1e5bed78d 100644
--- a/website/docs/gen-ai/index.md
+++ b/website/docs/ai/index.md
@@ -3,9 +3,9 @@ sidebar_position: 1
 sidebar_label: Overview
 ---
 
-# Generative AI on EKS
+# AI on EKS
 
-Welcome to generative AI on [Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks/), your gateway to harnessing the power of Large Language Models (LLMs) for a wide range of applications. This introduction page serves as your starting point to explore our offerings for Training, Fine-tuning, and Inference using various LLMs, including BERT-Large, Llama2, Stable Diffusion, and more.
+Welcome to AI on [Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks/), your gateway to harnessing the power of Large Language Models (LLMs) for a wide range of applications. This introduction page serves as your starting point to explore our offerings for Training, Fine-tuning, and Inference using various LLMs, including BERT-Large, Llama2, Stable Diffusion, and more.
 
 Our platform provides multiple patterns for users to scale their generative AI workloads on EKS using a comprehensive suite of open-source ML tools/frameworks.
 
@@ -28,4 +28,4 @@ Unlock the potential of LLMs for powerful inference tasks. Our Inference resourc
 ## Storage and Data Management
 Efficient data storage and management are fundamental to successful AI/ML operations. Our platform integrates with AWS storage solutions such as S3, EBS, EFS, and FSx to ensure scalable and reliable data handling. Utilize MLflow for model registry and versioning, and manage container images with Amazon ECR. This ensures a seamless workflow from model development to deployment, with robust data management practices to support your ML lifecycle.
 
-Whether you're an experienced practitioner or new to the field, our generative AI on EKS capabilities empower you to harness the latest advancements in language modeling. Dive into each section to begin your journey, and explore how you can leverage these tools and frameworks to build, fine-tune, and deploy powerful AI models on Amazon EKS.
+Whether you're an experienced practitioner or new to the field, our AI on EKS capabilities empower you to harness the latest advancements in language modeling. Dive into each section to begin your journey, and explore how you can leverage these tools and frameworks to build, fine-tune, and deploy powerful AI models on Amazon EKS.
diff --git a/website/docs/gen-ai/inference/GPUs/nvidia-nim-llama3.md b/website/docs/ai/inference/GPUs/nvidia-nim-llama3.md
similarity index 100%
rename from website/docs/gen-ai/inference/GPUs/nvidia-nim-llama3.md
rename to website/docs/ai/inference/GPUs/nvidia-nim-llama3.md
diff --git a/website/docs/gen-ai/inference/GPUs/stablediffusion-gpus.md b/website/docs/ai/inference/GPUs/stablediffusion-gpus.md
similarity index 100%
rename from website/docs/gen-ai/inference/GPUs/stablediffusion-gpus.md
rename to website/docs/ai/inference/GPUs/stablediffusion-gpus.md
diff --git a/website/docs/gen-ai/inference/GPUs/vLLM-NVIDIATritonServer.md b/website/docs/ai/inference/GPUs/vLLM-NVIDIATritonServer.md
similarity index 100%
rename from website/docs/gen-ai/inference/GPUs/vLLM-NVIDIATritonServer.md
rename to website/docs/ai/inference/GPUs/vLLM-NVIDIATritonServer.md
diff --git a/website/docs/gen-ai/inference/GPUs/vLLM-rayserve.md b/website/docs/ai/inference/GPUs/vLLM-rayserve.md
similarity index 100%
rename from website/docs/gen-ai/inference/GPUs/vLLM-rayserve.md
rename to website/docs/ai/inference/GPUs/vLLM-rayserve.md
diff --git a/website/docs/gen-ai/inference/Neuron/Mistral-7b-inf2.md b/website/docs/ai/inference/Neuron/Mistral-7b-inf2.md
similarity index 100%
rename from website/docs/gen-ai/inference/Neuron/Mistral-7b-inf2.md
rename to website/docs/ai/inference/Neuron/Mistral-7b-inf2.md
diff --git a/website/docs/gen-ai/inference/Neuron/llama2-inf2.md b/website/docs/ai/inference/Neuron/llama2-inf2.md
similarity index 100%
rename from website/docs/gen-ai/inference/Neuron/llama2-inf2.md
rename to website/docs/ai/inference/Neuron/llama2-inf2.md
diff --git a/website/docs/gen-ai/inference/Neuron/llama3-inf2.md b/website/docs/ai/inference/Neuron/llama3-inf2.md
similarity index 100%
rename from website/docs/gen-ai/inference/Neuron/llama3-inf2.md
rename to website/docs/ai/inference/Neuron/llama3-inf2.md
diff --git a/website/docs/gen-ai/inference/Neuron/rayserve-ha.md b/website/docs/ai/inference/Neuron/rayserve-ha.md
similarity index 100%
rename from website/docs/gen-ai/inference/Neuron/rayserve-ha.md
rename to website/docs/ai/inference/Neuron/rayserve-ha.md
diff --git a/website/docs/gen-ai/inference/Neuron/stablediffusion-inf2.md b/website/docs/ai/inference/Neuron/stablediffusion-inf2.md
similarity index 100%
rename from website/docs/gen-ai/inference/Neuron/stablediffusion-inf2.md
rename to website/docs/ai/inference/Neuron/stablediffusion-inf2.md
diff --git a/website/docs/gen-ai/inference/Neuron/vllm-ray-inf2.md b/website/docs/ai/inference/Neuron/vllm-ray-inf2.md
similarity index 100%
rename from website/docs/gen-ai/inference/Neuron/vllm-ray-inf2.md
rename to website/docs/ai/inference/Neuron/vllm-ray-inf2.md
diff --git a/website/docs/gen-ai/inference/_category_.json b/website/docs/ai/inference/_category_.json
similarity index 100%
rename from website/docs/gen-ai/inference/_category_.json
rename to website/docs/ai/inference/_category_.json
diff --git a/website/docs/gen-ai/inference/img/answer-1-contd.png b/website/docs/ai/inference/img/answer-1-contd.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/answer-1-contd.png
rename to website/docs/ai/inference/img/answer-1-contd.png
diff --git a/website/docs/gen-ai/inference/img/answer-1.png b/website/docs/ai/inference/img/answer-1.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/answer-1.png
rename to website/docs/ai/inference/img/answer-1.png
diff --git a/website/docs/gen-ai/inference/img/excali-draw-sdxl-inf2.png b/website/docs/ai/inference/img/excali-draw-sdxl-inf2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/excali-draw-sdxl-inf2.png
rename to website/docs/ai/inference/img/excali-draw-sdxl-inf2.png
diff --git a/website/docs/gen-ai/inference/img/gradio-app-gpu.png b/website/docs/ai/inference/img/gradio-app-gpu.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/gradio-app-gpu.png
rename to website/docs/ai/inference/img/gradio-app-gpu.png
diff --git a/website/docs/gen-ai/inference/img/gradio-llama-ai-chat.png b/website/docs/ai/inference/img/gradio-llama-ai-chat.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/gradio-llama-ai-chat.png
rename to website/docs/ai/inference/img/gradio-llama-ai-chat.png
diff --git a/website/docs/gen-ai/inference/img/gradio-llama2-13b-chat.png b/website/docs/ai/inference/img/gradio-llama2-13b-chat.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/gradio-llama2-13b-chat.png
rename to website/docs/ai/inference/img/gradio-llama2-13b-chat.png
diff --git a/website/docs/gen-ai/inference/img/gradio-test-ft.png b/website/docs/ai/inference/img/gradio-test-ft.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/gradio-test-ft.png
rename to website/docs/ai/inference/img/gradio-test-ft.png
diff --git a/website/docs/gen-ai/inference/img/head-pod-deleted.png b/website/docs/ai/inference/img/head-pod-deleted.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/head-pod-deleted.png
rename to website/docs/ai/inference/img/head-pod-deleted.png
diff --git a/website/docs/gen-ai/inference/img/llama-2-chat-ouput.png b/website/docs/ai/inference/img/llama-2-chat-ouput.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/llama-2-chat-ouput.png
rename to website/docs/ai/inference/img/llama-2-chat-ouput.png
diff --git a/website/docs/gen-ai/inference/img/llama2-13b-response.png b/website/docs/ai/inference/img/llama2-13b-response.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/llama2-13b-response.png
rename to website/docs/ai/inference/img/llama2-13b-response.png
diff --git a/website/docs/gen-ai/inference/img/llama2-inf2.png b/website/docs/ai/inference/img/llama2-inf2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/llama2-inf2.png
rename to website/docs/ai/inference/img/llama2-inf2.png
diff --git a/website/docs/gen-ai/inference/img/llama3.png b/website/docs/ai/inference/img/llama3.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/llama3.png
rename to website/docs/ai/inference/img/llama3.png
diff --git a/website/docs/gen-ai/inference/img/llma27b-hg.png b/website/docs/ai/inference/img/llma27b-hg.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/llma27b-hg.png
rename to website/docs/ai/inference/img/llma27b-hg.png
diff --git a/website/docs/gen-ai/inference/img/mistral-conv-1.png b/website/docs/ai/inference/img/mistral-conv-1.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/mistral-conv-1.png
rename to website/docs/ai/inference/img/mistral-conv-1.png
diff --git a/website/docs/gen-ai/inference/img/mistral-conv-2.png b/website/docs/ai/inference/img/mistral-conv-2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/mistral-conv-2.png
rename to website/docs/ai/inference/img/mistral-conv-2.png
diff --git a/website/docs/gen-ai/inference/img/mistral-gradio.png b/website/docs/ai/inference/img/mistral-gradio.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/mistral-gradio.png
rename to website/docs/ai/inference/img/mistral-gradio.png
diff --git a/website/docs/gen-ai/inference/img/mistral-sample-prompt-1.png b/website/docs/ai/inference/img/mistral-sample-prompt-1.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/mistral-sample-prompt-1.png
rename to website/docs/ai/inference/img/mistral-sample-prompt-1.png
diff --git a/website/docs/gen-ai/inference/img/mistral7b-hg.png b/website/docs/ai/inference/img/mistral7b-hg.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/mistral7b-hg.png
rename to website/docs/ai/inference/img/mistral7b-hg.png
diff --git a/website/docs/gen-ai/inference/img/neuron-monitor-cwci.png b/website/docs/ai/inference/img/neuron-monitor-cwci.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/neuron-monitor-cwci.png
rename to website/docs/ai/inference/img/neuron-monitor-cwci.png
diff --git a/website/docs/gen-ai/inference/img/nim-dashboard-2.png b/website/docs/ai/inference/img/nim-dashboard-2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/nim-dashboard-2.png
rename to website/docs/ai/inference/img/nim-dashboard-2.png
diff --git a/website/docs/gen-ai/inference/img/nim-dashboard.png b/website/docs/ai/inference/img/nim-dashboard.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/nim-dashboard.png
rename to website/docs/ai/inference/img/nim-dashboard.png
diff --git a/website/docs/gen-ai/inference/img/nim-ngc-api-key.png b/website/docs/ai/inference/img/nim-ngc-api-key.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/nim-ngc-api-key.png
rename to website/docs/ai/inference/img/nim-ngc-api-key.png
diff --git a/website/docs/gen-ai/inference/img/nim-on-eks-arch.png b/website/docs/ai/inference/img/nim-on-eks-arch.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/nim-on-eks-arch.png
rename to website/docs/ai/inference/img/nim-on-eks-arch.png
diff --git a/website/docs/gen-ai/inference/img/openweb-ui-nim-1.png b/website/docs/ai/inference/img/openweb-ui-nim-1.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/openweb-ui-nim-1.png
rename to website/docs/ai/inference/img/openweb-ui-nim-1.png
diff --git a/website/docs/gen-ai/inference/img/openweb-ui-nim-2.png b/website/docs/ai/inference/img/openweb-ui-nim-2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/openweb-ui-nim-2.png
rename to website/docs/ai/inference/img/openweb-ui-nim-2.png
diff --git a/website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-1.png b/website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-1.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-1.png
rename to website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-1.png
diff --git a/website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-2.png b/website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-2.png
rename to website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-2.png
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-deployed-mistral-inf2.png b/website/docs/ai/inference/img/ray-dashboard-deployed-mistral-inf2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-dashboard-deployed-mistral-inf2.png
rename to website/docs/ai/inference/img/ray-dashboard-deployed-mistral-inf2.png
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-deploying-mistral-inf2.png b/website/docs/ai/inference/img/ray-dashboard-deploying-mistral-inf2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-dashboard-deploying-mistral-inf2.png
rename to website/docs/ai/inference/img/ray-dashboard-deploying-mistral-inf2.png
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-sdxl.png b/website/docs/ai/inference/img/ray-dashboard-sdxl.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-dashboard-sdxl.png
rename to website/docs/ai/inference/img/ray-dashboard-sdxl.png
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-vllm-llama3-inf2.png b/website/docs/ai/inference/img/ray-dashboard-vllm-llama3-inf2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-dashboard-vllm-llama3-inf2.png
rename to website/docs/ai/inference/img/ray-dashboard-vllm-llama3-inf2.png
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-vllm-mistral.png b/website/docs/ai/inference/img/ray-dashboard-vllm-mistral.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-dashboard-vllm-mistral.png
rename to website/docs/ai/inference/img/ray-dashboard-vllm-mistral.png
diff --git a/website/docs/gen-ai/inference/img/ray-dashboard.png b/website/docs/ai/inference/img/ray-dashboard.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-dashboard.png
rename to website/docs/ai/inference/img/ray-dashboard.png
diff --git a/website/docs/gen-ai/inference/img/ray-deplo-logs-vllm-mistral.png b/website/docs/ai/inference/img/ray-deplo-logs-vllm-mistral.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-deplo-logs-vllm-mistral.png
rename to website/docs/ai/inference/img/ray-deplo-logs-vllm-mistral.png
diff --git a/website/docs/gen-ai/inference/img/ray-grafana-dashboard.png b/website/docs/ai/inference/img/ray-grafana-dashboard.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-grafana-dashboard.png
rename to website/docs/ai/inference/img/ray-grafana-dashboard.png
diff --git a/website/docs/gen-ai/inference/img/ray-head-ha-1.png b/website/docs/ai/inference/img/ray-head-ha-1.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-head-ha-1.png
rename to website/docs/ai/inference/img/ray-head-ha-1.png
diff --git a/website/docs/gen-ai/inference/img/ray-head-ha-2.png b/website/docs/ai/inference/img/ray-head-ha-2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-head-ha-2.png
rename to website/docs/ai/inference/img/ray-head-ha-2.png
diff --git a/website/docs/gen-ai/inference/img/ray-logs-vllm-llama3-inf2.png b/website/docs/ai/inference/img/ray-logs-vllm-llama3-inf2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-logs-vllm-llama3-inf2.png
rename to website/docs/ai/inference/img/ray-logs-vllm-llama3-inf2.png
diff --git a/website/docs/gen-ai/inference/img/ray-prometheus.png b/website/docs/ai/inference/img/ray-prometheus.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-prometheus.png
rename to website/docs/ai/inference/img/ray-prometheus.png
diff --git a/website/docs/gen-ai/inference/img/ray-serve-gpu-sd-cluster.png b/website/docs/ai/inference/img/ray-serve-gpu-sd-cluster.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-serve-gpu-sd-cluster.png
rename to website/docs/ai/inference/img/ray-serve-gpu-sd-cluster.png
diff --git a/website/docs/gen-ai/inference/img/ray-serve-gpu-sd.png b/website/docs/ai/inference/img/ray-serve-gpu-sd.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-serve-gpu-sd.png
rename to website/docs/ai/inference/img/ray-serve-gpu-sd.png
diff --git a/website/docs/gen-ai/inference/img/ray-serve-inf2-mistral-cluster.png b/website/docs/ai/inference/img/ray-serve-inf2-mistral-cluster.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-serve-inf2-mistral-cluster.png
rename to website/docs/ai/inference/img/ray-serve-inf2-mistral-cluster.png
diff --git a/website/docs/gen-ai/inference/img/ray-vllm-inf2.png b/website/docs/ai/inference/img/ray-vllm-inf2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/ray-vllm-inf2.png
rename to website/docs/ai/inference/img/ray-vllm-inf2.png
diff --git a/website/docs/gen-ai/inference/img/rayserve-llama2-13b-dashboard.png b/website/docs/ai/inference/img/rayserve-llama2-13b-dashboard.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/rayserve-llama2-13b-dashboard.png
rename to website/docs/ai/inference/img/rayserve-llama2-13b-dashboard.png
diff --git a/website/docs/gen-ai/inference/img/stable-diffusion-xl-gradio.png b/website/docs/ai/inference/img/stable-diffusion-xl-gradio.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/stable-diffusion-xl-gradio.png
rename to website/docs/ai/inference/img/stable-diffusion-xl-gradio.png
diff --git a/website/docs/gen-ai/inference/img/stable-diffusion-xl-prompt_3.png b/website/docs/ai/inference/img/stable-diffusion-xl-prompt_3.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/stable-diffusion-xl-prompt_3.png
rename to website/docs/ai/inference/img/stable-diffusion-xl-prompt_3.png
diff --git a/website/docs/gen-ai/inference/img/triton-architecture.png b/website/docs/ai/inference/img/triton-architecture.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/triton-architecture.png
rename to website/docs/ai/inference/img/triton-architecture.png
diff --git a/website/docs/gen-ai/inference/img/triton-grafana-dash2.png b/website/docs/ai/inference/img/triton-grafana-dash2.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/triton-grafana-dash2.png
rename to website/docs/ai/inference/img/triton-grafana-dash2.png
diff --git a/website/docs/gen-ai/inference/img/triton-internals.png b/website/docs/ai/inference/img/triton-internals.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/triton-internals.png
rename to website/docs/ai/inference/img/triton-internals.png
diff --git a/website/docs/gen-ai/inference/img/triton-observability.png b/website/docs/ai/inference/img/triton-observability.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/triton-observability.png
rename to website/docs/ai/inference/img/triton-observability.png
diff --git a/website/docs/gen-ai/inference/img/worker-pod-running.png b/website/docs/ai/inference/img/worker-pod-running.png
similarity index 100%
rename from website/docs/gen-ai/inference/img/worker-pod-running.png
rename to website/docs/ai/inference/img/worker-pod-running.png
diff --git a/website/docs/gen-ai/ml-platforms-eks.png b/website/docs/ai/ml-platforms-eks.png
similarity index 100%
rename from website/docs/gen-ai/ml-platforms-eks.png
rename to website/docs/ai/ml-platforms-eks.png
diff --git a/website/docs/gen-ai/training/GPUs/bionemo.md b/website/docs/ai/training/GPUs/bionemo.md
similarity index 100%
rename from website/docs/gen-ai/training/GPUs/bionemo.md
rename to website/docs/ai/training/GPUs/bionemo.md
diff --git a/website/docs/gen-ai/training/Neuron/BERT-Large.md b/website/docs/ai/training/Neuron/BERT-Large.md
similarity index 100%
rename from website/docs/gen-ai/training/Neuron/BERT-Large.md
rename to website/docs/ai/training/Neuron/BERT-Large.md
diff --git a/website/docs/gen-ai/training/Neuron/Llama2.md b/website/docs/ai/training/Neuron/Llama2.md
similarity index 100%
rename from website/docs/gen-ai/training/Neuron/Llama2.md
rename to website/docs/ai/training/Neuron/Llama2.md
diff --git a/website/docs/gen-ai/training/Neuron/RayTrain-Llama2.md b/website/docs/ai/training/Neuron/RayTrain-Llama2.md
similarity index 100%
rename from website/docs/gen-ai/training/Neuron/RayTrain-Llama2.md
rename to website/docs/ai/training/Neuron/RayTrain-Llama2.md
diff --git a/website/docs/gen-ai/training/_category_.json b/website/docs/ai/training/_category_.json
similarity index 100%
rename from website/docs/gen-ai/training/_category_.json
rename to website/docs/ai/training/_category_.json
diff --git a/website/docs/gen-ai/training/img/Llama2-RayTrain-Trn1.png b/website/docs/ai/training/img/Llama2-RayTrain-Trn1.png
similarity index 100%
rename from website/docs/gen-ai/training/img/Llama2-RayTrain-Trn1.png
rename to website/docs/ai/training/img/Llama2-RayTrain-Trn1.png
diff --git a/website/docs/gen-ai/training/img/llama2-trainium.png b/website/docs/ai/training/img/llama2-trainium.png
similarity index 100%
rename from website/docs/gen-ai/training/img/llama2-trainium.png
rename to website/docs/ai/training/img/llama2-trainium.png
diff --git a/website/docs/gen-ai/training/img/raytrain-precomplilation1.png b/website/docs/ai/training/img/raytrain-precomplilation1.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-precomplilation1.png
rename to website/docs/ai/training/img/raytrain-precomplilation1.png
diff --git a/website/docs/gen-ai/training/img/raytrain-precomplilation2.png b/website/docs/ai/training/img/raytrain-precomplilation2.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-precomplilation2.png
rename to website/docs/ai/training/img/raytrain-precomplilation2.png
diff --git a/website/docs/gen-ai/training/img/raytrain-precomplilation3.png b/website/docs/ai/training/img/raytrain-precomplilation3.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-precomplilation3.png
rename to website/docs/ai/training/img/raytrain-precomplilation3.png
diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-lens.png b/website/docs/ai/training/img/raytrain-testdata-lens.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-testdata-lens.png
rename to website/docs/ai/training/img/raytrain-testdata-lens.png
diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash.png b/website/docs/ai/training/img/raytrain-testdata-raydash.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash.png
rename to website/docs/ai/training/img/raytrain-testdata-raydash.png
diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash1.png b/website/docs/ai/training/img/raytrain-testdata-raydash1.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash1.png
rename to website/docs/ai/training/img/raytrain-testdata-raydash1.png
diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash2.png b/website/docs/ai/training/img/raytrain-testdata-raydash2.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash2.png
rename to website/docs/ai/training/img/raytrain-testdata-raydash2.png
diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash3.png b/website/docs/ai/training/img/raytrain-testdata-raydash3.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash3.png
rename to website/docs/ai/training/img/raytrain-testdata-raydash3.png
diff --git a/website/docs/gen-ai/training/img/raytrain-training-progress1.png b/website/docs/ai/training/img/raytrain-training-progress1.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-training-progress1.png
rename to website/docs/ai/training/img/raytrain-training-progress1.png
diff --git a/website/docs/gen-ai/training/img/raytrain-training-progress2.png b/website/docs/ai/training/img/raytrain-training-progress2.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-training-progress2.png
rename to website/docs/ai/training/img/raytrain-training-progress2.png
diff --git a/website/docs/gen-ai/training/img/raytrain-training-progress3.png b/website/docs/ai/training/img/raytrain-training-progress3.png
similarity index 100%
rename from website/docs/gen-ai/training/img/raytrain-training-progress3.png
rename to website/docs/ai/training/img/raytrain-training-progress3.png
diff --git a/website/docs/blueprints/ai-ml/index.md b/website/docs/blueprints/ai-ml/index.md
index 379fa8f73..15e6ea212 100644
--- a/website/docs/blueprints/ai-ml/index.md
+++ b/website/docs/blueprints/ai-ml/index.md
@@ -32,15 +32,8 @@ By choosing Amazon EKS, you gain access to a robust infrastructure that can hand
 
 ## Deploying Generative AI Models on Amazon EKS
 
-Deploying Generative AI models on Amazon EKS is supported through two major blueprints:
-
-- **For GPUs**: Use the [JARK stack blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jark).
-- **For Neuron**: Start with the [Trainium on EKS blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/trainium).
-
-In addition to these, this section provides other valuable ML blueprints:
+Deploying an AI stack on EKS starts with the infrastructure [JARK stack blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jark). This blueprint provides a customizable environment on which to build an AI platform. For task-specific workloads, AI on EKS has a few blueprints that preconfigure the environment:
 
+- **For Ray**: Use the [JARK stack blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jark).
 - **NVIDIA Spark RAPIDS**: For Spark on GPU workloads, refer to the [NVIDIA Spark RAPIDS blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/emr-spark-rapids).
-
-- **JupyterHub on EKS**: Explore the [JupyterHub blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jupyterhub), which showcases Time Slicing and MIG features, as well as multi-tenant configurations with profiles. This is ideal for deploying large-scale JupyterHub platforms on EKS.
-
 - **Additional Patterns**: For other patterns using NVIDIA Triton server, NVIDIA NGC, and more, refer to the [Gen AI page](https://awslabs.github.io/data-on-eks/docs/gen-ai).
diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js
index 845b739da..6a06345c0 100644
--- a/website/docusaurus.config.js
+++ b/website/docusaurus.config.js
@@ -54,7 +54,7 @@ const config = {
         },
         items: [
           { type: 'doc', docId: 'introduction/intro', position: 'left', label: 'Introduction' },
-          { type: 'doc', docId: 'gen-ai/index', position: 'left', label: 'Gen AI' },
+          { type: 'doc', docId: 'ai/index', position: 'left', label: 'Gen AI' },
           { type: 'doc', docId: 'blueprints/amazon-emr-on-eks/index', position: 'left', label: 'Blueprints' },
           { type: 'doc', docId: 'bestpractices/intro', position: 'left', label: 'Best Practices' },
           { type: 'doc', docId: 'benchmarks/emr-on-eks', position: 'left', label: 'Benchmarks' },
diff --git a/website/sidebars.js b/website/sidebars.js
index e69016bab..01144305f 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -16,7 +16,7 @@ const sidebars = {
   // By default, Docusaurus generates a sidebar from the docs folder structure
   // docs: [{type: 'autogenerated', dirName: '.'}],
   // But you can create a sidebar manually
-    genai: [{type: 'autogenerated', dirName: 'gen-ai'}],
+    ai: [{type: 'autogenerated', dirName: 'ai'}],
     blueprints: [{type: 'autogenerated', dirName: 'blueprints'}],
     bestpractices: [{type: 'autogenerated', dirName: 'bestpractices'}],
     benchmarks: [{type: 'autogenerated', dirName: 'benchmarks'}],
diff --git a/website/src/pages/index.js b/website/src/pages/index.js
index d9b7d05d0..e0ed10e81 100644
--- a/website/src/pages/index.js
+++ b/website/src/pages/index.js
@@ -35,8 +35,8 @@ function HomepageHeader() {
           </Link>
           <Link
             className={clsx("button button--lg", styles.buttonGenAI)}
-            to="/docs/gen-ai">
-            Explore Gen AI
+            to="/docs/ai">
+            AI on EKS
           </Link>
         </div>
       </div>

From c66a3514efd141c16982568072b7de8c235162e7 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Wed, 29 Jan 2025 11:10:45 -0800
Subject: [PATCH 08/16] set ai stack defaults and add jupyterhub

---
 ai-ml/infrastructure/terraform/addons.tf      |  72 ++---
 ai-ml/infrastructure/terraform/cognito.tf     | 224 +++++++++++++++
 .../terraform/helm-values/efs/Chart.yaml      |   5 +
 .../helm-values/efs/templates/efs-pv.yaml     |  14 +
 .../helm-values/efs/templates/efs-pvc.yaml    |  11 +
 .../terraform/helm-values/efs/values.yaml     |   5 +
 .../jupyterhub-values-cognito.yaml            | 264 ++++++++++++++++++
 .../helm-values/jupyterhub-values-dummy.yaml  | 219 +++++++++++++++
 .../helm-values/jupyterhub-values-oauth.yaml  | 232 +++++++++++++++
 ai-ml/infrastructure/terraform/jupyterhub.tf  | 163 +++++++++++
 ai-ml/infrastructure/terraform/variables.tf   |  74 ++++-
 11 files changed, 1245 insertions(+), 38 deletions(-)
 create mode 100644 ai-ml/infrastructure/terraform/cognito.tf
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml
 create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/values.yaml
 create mode 100755 ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml
 create mode 100755 ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml
 create mode 100755 ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml
 create mode 100644 ai-ml/infrastructure/terraform/jupyterhub.tf

diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf
index 9ba662a22..57e63d062 100644
--- a/ai-ml/infrastructure/terraform/addons.tf
+++ b/ai-ml/infrastructure/terraform/addons.tf
@@ -1,3 +1,14 @@
+# Look up the ARN of an ISSUED certificate in AWS Certificate Manager (ACM); skipped when the JupyterHub auth mechanism is "dummy"
+data "aws_acm_certificate" "issued" {
+  count    = var.jupyter_hub_auth_mechanism != "dummy" ? 1 : 0
+  domain   = var.acm_certificate_domain
+  statuses = ["ISSUED"]
+}
+
+locals {
+  cognito_custom_domain = var.cognito_custom_domain
+}
+
 #---------------------------------------------------------------
 # GP3 Encrypted Storage Class
 #---------------------------------------------------------------
@@ -83,7 +94,7 @@ module "eks_blueprints_addons" {
       preserve = true
     }
   }
-
+  enable_aws_efs_csi_driver = var.enable_aws_efs_csi_driver
   #---------------------------------------
   # AWS Load Balancer Controller Add-on
   #---------------------------------------
@@ -204,9 +215,21 @@ module "data_addons" {
   #---------------------------------------------------------------
   enable_jupyterhub = var.enable_jupyterhub
   jupyterhub_helm_config = {
-    namespace        = kubernetes_namespace_v1.jupyterhub.id
-    create_namespace = false
-    values           = [file("${path.module}/helm-values/jupyterhub-values.yaml")]
+    values = [templatefile("${path.module}/helm-values/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", {
+      ssl_cert_arn                = try(data.aws_acm_certificate.issued[0].arn, "")
+      jupyterdomain               = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "")
+      authorize_url               = var.oauth_domain != "" ? "${var.oauth_domain}/auth" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "")
+      token_url                   = var.oauth_domain != "" ? "${var.oauth_domain}/token" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/token", "")
+      userdata_url                = var.oauth_domain != "" ? "${var.oauth_domain}/userinfo" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "")
+      username_key                = try(var.oauth_username_key, "")
+      client_id                   = var.oauth_jupyter_client_id != "" ? var.oauth_jupyter_client_id : try(aws_cognito_user_pool_client.user_pool_client[0].id, "")
+      client_secret               = var.oauth_jupyter_client_secret != "" ? var.oauth_jupyter_client_secret : try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "")
+      user_pool_id                = try(aws_cognito_user_pool.pool[0].id, "")
+      identity_pool_id            = try(aws_cognito_identity_pool.identity_pool[0].id, "")
+      jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
+      region                      = var.region
+    })]
+    version = "3.2.1"
   }
 
   enable_volcano = var.enable_volcano
@@ -551,36 +574,6 @@ module "data_addons" {
   ]
 }
 
-
-#---------------------------------------------------------------
-# Additional Resources
-#---------------------------------------------------------------
-
-resource "kubernetes_namespace_v1" "jupyterhub" {
-  metadata {
-    name = "jupyterhub"
-  }
-}
-
-
-resource "kubernetes_secret_v1" "huggingface_token" {
-  metadata {
-    name      = "hf-token"
-    namespace = kubernetes_namespace_v1.jupyterhub.id
-  }
-
-  data = {
-    token = var.huggingface_token
-  }
-}
-
-resource "kubernetes_config_map_v1" "notebook" {
-  metadata {
-    name      = "notebook"
-    namespace = kubernetes_namespace_v1.jupyterhub.id
-  }
-}
-
 #---------------------------------------------------------------
 # MLflow Tracking Add-on
 #---------------------------------------------------------------
@@ -648,6 +641,17 @@ resource "kubectl_manifest" "dcgm_service" {
   yaml_body = file("${path.module}/monitoring/dcgm-service.yaml")
 }
 
+resource "kubectl_manifest" "efs_sc" {
+  count = var.enable_aws_efs_csi_driver ? 1 : 0
+  yaml_body = <<YAML
+    apiVersion: storage.k8s.io/v1
+    kind: StorageClass
+    metadata:
+      name: efs-sc
+    provisioner: efs.csi.aws.com
+  YAML
+}
+
 data "aws_iam_policy_document" "karpenter_controller_policy" {
   statement {
     actions = [
diff --git a/ai-ml/infrastructure/terraform/cognito.tf b/ai-ml/infrastructure/terraform/cognito.tf
new file mode 100644
index 000000000..57338986b
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/cognito.tf
@@ -0,0 +1,224 @@
+#---------------------------------------------------------------
+# Lambda function for pre token generation
+#----------------------------------------------------------------
+
+data "aws_iam_policy_document" "assume_role" {
+  statement {
+    effect = "Allow"
+    principals {
+      type        = "Service"
+      identifiers = ["lambda.amazonaws.com", "cognito-idp.amazonaws.com"]
+    }
+    actions = ["sts:AssumeRole"]
+  }
+}
+
+data "aws_iam_policy" "lambda_execution_policy" {
+  arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
+}
+
+resource "aws_iam_role" "iam_for_lambda" {
+  count              = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  name               = "iam_for_lambda"
+  assume_role_policy = data.aws_iam_policy_document.assume_role.json
+}
+
+resource "aws_iam_role_policy_attachment" "lambda_policy_attachment" {
+  count      = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  role       = aws_iam_role.iam_for_lambda[0].name
+  policy_arn = data.aws_iam_policy.lambda_execution_policy.arn
+}
+
+data "archive_file" "lambda" {
+  type        = "zip"
+  output_path = "/tmp/lambda.zip"
+  source {
+    filename = "index.mjs"
+    content  = <<-EOF
+    export const handler = async (event) => {
+        event.response = {
+          claimsOverrideDetails: {
+            claimsToAddOrOverride: {
+              department: "engineering",
+            },
+          },
+        };
+
+        return event;
+    };
+
+    EOF
+  }
+}
+
+resource "aws_lambda_function" "pretoken_trigger" {
+  count            = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  function_name    = "pretoken-trigger-function"
+  filename         = data.archive_file.lambda.output_path
+  source_code_hash = data.archive_file.lambda.output_base64sha256 # hash of the zip built by data.archive_file.lambda
+
+  runtime = "nodejs20.x"    # nodejs18.x is on AWS Lambda's deprecation schedule; the ESM handler needs no changes
+  handler = "index.handler" # the `handler` export of index.mjs inside the archive
+
+  role = aws_iam_role.iam_for_lambda[0].arn
+}
+
+#---------------------------------------------------------------
+# Cognito pool, domain and client creation.
+# This can be used
+# Auth integration later.
+#----------------------------------------------------------------
+resource "aws_cognito_user_pool" "pool" {
+  count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  name  = "jupyterhub-userpool"
+
+  username_attributes      = ["email"]
+  auto_verified_attributes = ["email"]
+
+  password_policy {
+    minimum_length = 6
+  }
+
+  lambda_config {
+    pre_token_generation = aws_lambda_function.pretoken_trigger[0].arn
+  }
+}
+
+resource "aws_cognito_user_pool_domain" "domain" {
+  count        = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  domain       = local.cognito_custom_domain
+  user_pool_id = aws_cognito_user_pool.pool[0].id
+}
+
+resource "aws_cognito_user_pool_client" "user_pool_client" {
+  count                 = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  name                  = "jupyter-client"
+  access_token_validity = 1
+  token_validity_units {
+    access_token = "days"
+  }
+  callback_urls                        = ["https://${var.jupyterhub_domain}/hub/oauth_callback"]
+  user_pool_id                         = aws_cognito_user_pool.pool[0].id
+  allowed_oauth_flows_user_pool_client = true
+  allowed_oauth_flows                  = ["code"]
+  allowed_oauth_scopes                 = ["openid", "email"]
+  generate_secret                      = true
+  supported_identity_providers = [
+    "COGNITO"
+  ]
+
+  depends_on = [aws_cognito_user_pool_domain.domain]
+}
+
+#---------------------------------------------------------------
+# Cognito identity pool creation.
+#----------------------------------------------------------------
+resource "aws_cognito_identity_pool" "identity_pool" {
+  count                            = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  identity_pool_name               = "jupyterhub-identity-pool"
+  allow_unauthenticated_identities = false
+  cognito_identity_providers {
+    client_id               = aws_cognito_user_pool_client.user_pool_client[0].id
+    provider_name           = aws_cognito_user_pool.pool[0].endpoint
+    server_side_token_check = true
+  }
+
+  depends_on = [aws_cognito_user_pool_client.user_pool_client]
+}
+
+resource "aws_s3_bucket" "jupyterhub_bucket" {
+  count         = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  bucket_prefix = "jupyterhub-test-bucket-"
+}
+
+resource "aws_s3_object" "engineering_object" {
+  count  = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  bucket = aws_s3_bucket.jupyterhub_bucket[0].id
+  key    = "engineering/"
+}
+
+resource "aws_s3_object" "legal_object" {
+  count  = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  bucket = aws_s3_bucket.jupyterhub_bucket[0].id
+  key    = "legal/"
+}
+
+#---------------------------------------------------------------
+# IAM role for a team member from the engineering department
+# In theory there would be other departments such as "legal"
+#----------------------------------------------------------------
+resource "aws_iam_role" "cognito_authenticated_engineering_role" {
+  count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+
+  name = "EngineeringTeamRole"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17",
+    Statement = [
+      {
+        Action = ["sts:AssumeRoleWithWebIdentity", "sts:TagSession"],
+        Effect = "Allow",
+        Principal = {
+          Federated = "cognito-identity.amazonaws.com"
+        },
+        Condition = {
+          StringEquals = {
+            "cognito-identity.amazonaws.com:aud" = aws_cognito_identity_pool.identity_pool[0].id
+          },
+          "ForAnyValue:StringLike" : {
+            "cognito-identity.amazonaws.com:amr" : "authenticated"
+          }
+        }
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy" "s3_cognito_engineering_policy" {
+  count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  name  = "s3_cognito_engineering_policy"
+  role  = aws_iam_role.cognito_authenticated_engineering_role[0].id
+
+  policy = <<-EOF
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": ["s3:List*"],
+      "Resource": "*",
+      "Condition": {
+        "StringEquals": {
+          "s3:prefix": "$${aws:PrincipalTag/department}"
+        }
+      }
+    }
+  ]
+}
+EOF
+}
+
+resource "aws_cognito_identity_pool_provider_principal_tag" "example" {
+  count                  = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  identity_pool_id       = aws_cognito_identity_pool.identity_pool[0].id
+  identity_provider_name = aws_cognito_user_pool.pool[0].endpoint
+  use_defaults           = false
+  principal_tags = {
+    department = "department"
+  }
+}
+
+# Non-exclusive attachment: aws_iam_policy_attachment would detach this managed policy from every other principal in the account.
+resource "aws_iam_role_policy_attachment" "s3_readonly_policy_attachment" {
+  count      = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  role       = aws_iam_role.cognito_authenticated_engineering_role[0].name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
+}
+
+resource "aws_cognito_identity_pool_roles_attachment" "identity_pool_roles" {
+  count            = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id
+  roles = {
+    authenticated = aws_iam_role.cognito_authenticated_engineering_role[0].arn
+  }
+}
diff --git a/ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml
new file mode 100644
index 000000000..e69ed7f3d
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+name: efs
+description: Helm chart for efs options on the cluster
+version: 0.0.1
+appVersion: 0.0.1
diff --git a/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml
new file mode 100644
index 000000000..3098ce85e
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: {{ .Values.pv.name | quote }}
+spec:
+  capacity:
+    storage: 123Gi
+  accessModes:
+    - ReadWriteMany
+  storageClassName: efs-sc  # must match the storageClassName requested by the claim that binds to this volume
+  persistentVolumeReclaimPolicy: Retain
+  csi:
+    driver: efs.csi.aws.com
+    volumeHandle: {{ required "pv.volumeHandle is required (EFS file system ID, e.g. fs-0123456789abcdef0)" .Values.pv.volumeHandle | quote }}
diff --git a/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml
new file mode 100644
index 000000000..1ab334f6d
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ .Values.pvc.name | quote }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  storageClassName: efs-sc  # same class name as the chart's PersistentVolume template
+  resources:
+    requests:
+      storage: 1Gi
diff --git a/ai-ml/infrastructure/terraform/helm-values/efs/values.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/values.yaml
new file mode 100644
index 000000000..c0fee0a22
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/efs/values.yaml
@@ -0,0 +1,5 @@
+pv:
+  name: efs-persist
+  volumeHandle: ""  # no usable default; must be overridden at install time (consumed as spec.csi.volumeHandle)
+pvc:
+  name: efs-persist
diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml
new file mode 100755
index 000000000..4e2073836
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml
@@ -0,0 +1,264 @@
+hub:
+  db:
+    pvc:
+      storage: 50Gi
+      storageClassName: gp3
+  authenticatePrometheus: false
+  command: ["sh", "-c", "pip install boto3 && jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py"]
+  config:
+    GenericOAuthenticator:  # placeholders are filled in by Terraform templatefile(); quoted so they always load as strings
+      oauth_callback_url: "${jupyterdomain}"
+      client_id: "${client_id}"
+      client_secret: "${client_secret}"
+      authorize_url: "${authorize_url}"
+      token_url: "${token_url}"
+      userdata_url: "${userdata_url}"
+      scope:
+        - openid
+        - email
+      username_key: "username"
+      login_service: "AWS Cognito"
+      userdata_method: "POST"
+    JupyterHub:
+      authenticator_class: generic-oauth
+  extraConfig:
+    jupyterhub_config.py: |-
+      c.KubeSpawner.start_timeout = 1200
+      c.Authenticator.enable_auth_state = True
+
+    cognito_config.py: |-
+      import boto3
+      def auth_state_hook(spawner, auth_state):
+        client_idp = boto3.client('cognito-idp', region_name="${region}")
+        auth_response = client_idp.initiate_auth(
+          AuthFlow="REFRESH_TOKEN_AUTH",
+          AuthParameters={
+            "REFRESH_TOKEN": auth_state['refresh_token'],
+            "SECRET_HASH": "${client_secret}"
+          },
+          ClientId="${client_id}"
+        )
+        id_token = auth_response["AuthenticationResult"]["IdToken"]
+        client_identity = boto3.client("cognito-identity", region_name="${region}")
+        identity_response = client_identity.get_id(
+          IdentityPoolId="${identity_pool_id}",
+          Logins={
+            f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token
+          }
+        )
+        identity_id = identity_response['IdentityId']
+        credentials = client_identity.get_credentials_for_identity(
+          IdentityId=identity_id,
+          Logins={
+            f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token
+          }
+        )
+        key = credentials["Credentials"]["AccessKeyId"]
+        secret = credentials["Credentials"]["SecretKey"]
+        token = credentials["Credentials"]["SessionToken"]
+        spawner.environment['AWS_ACCESS_KEY_ID'] = key
+        spawner.environment['AWS_SECRET_ACCESS_KEY'] = secret
+        spawner.environment['AWS_SESSION_TOKEN'] = token
+
+      c.Spawner.auth_state_hook = auth_state_hook
+
+proxy:
+  https:
+    enabled: true
+    type: offload
+  service:
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn}
+      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
+      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
+      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+      service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+      service.beta.kubernetes.io/aws-load-balancer-type: external
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
+      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
+
+singleuser:
+  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
+  profileList:
+    - display_name: Data Engineering (CPU)
+      description: "PySpark Notebooks | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pyspark350:
+              display_name: "PySpark 3.5.0 + Python 3.11"
+              default: true
+              kubespawner_override:
+                image: jupyter/pyspark-notebook:spark-3.5.0
+            pyspark341:
+              display_name: "PySpark 3.4.1 + Python 3.11"
+              kubespawner_override:
+                image: jupyter/pyspark-notebook:spark-3.4.1
+      kubespawner_override:  # resource settings apply to the whole profile; they are not selectable profile_options
+        cpu_guarantee: 2
+        mem_guarantee: 8G
+        cpu_limit: 4
+        mem_limit: 8G
+      cmd: null
+    - display_name: Trainium (trn1)
+      description: "Trainium | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch 1.13.1 + torch-neuronx"
+              default: true
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
+            tflow2101:
+              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
+      kubespawner_override:
+        tolerations:
+          - key: aws.amazon.com/neuron
+            operator: Exists
+            effect: NoSchedule
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        extra_resource_limits:
+          aws.amazon.com/neuron: "1"
+        cmd: "start-singleuser.sh"
+    - display_name: Inferentia (inf2)
+      description: "Inferentia | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch + torch-neuronx"
+              default: true
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
+            tflow2101:
+              display_name: "Tensorflow + tensorflow-neuronx"
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
+      kubespawner_override:
+        tolerations:
+          - key: aws.amazon.com/neuron
+            operator: Exists
+            effect: NoSchedule
+        cpu_guarantee: 20
+        mem_guarantee: 100G
+        cpu_limit: 20
+        mem_limit: 100G
+        extra_resource_limits:
+          aws.amazon.com/neuron: "1"
+        cmd: null
+    - display_name: Data Science (GPU + Time-Slicing - G5)
+      default: true
+      description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
+      kubespawner_override:
+        # namespace: data-team-a
+        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode
+        cpu_limit: 2
+        mem_limit: 4G
+        cpu_guarantee: 2
+        mem_guarantee: 4G
+        cmd: "start-singleuser.sh"
+    # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1,  or nvidia.com/mig-2g.20gb: 1 etc.
+    # Hence, this profile relies on Managed node groups with GPU MIG enabled
+    - display_name: Data Science (GPU + MIG on P4d.24xlarge)
+      description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
+      kubespawner_override:
+        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_guarantees:
+          nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb
+        # extra_resource_limits:
+        #   nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        cmd: "start-singleuser.sh"
+    - display_name: Data Science (GPU - P4d.24xlarge)
+      description: "GPU with P4d instances | Karpenter Autoscaler"
+      kubespawner_override:
+        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "8"
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        cmd: "start-singleuser.sh"
+  storage:
+    type: "static"
+    static:
+      pvcName: "efs-persist"
+      subPath: "{username}"
+    extraVolumes:
+    - name: jupyterhub-shared
+      persistentVolumeClaim:
+        claimName: efs-persist-shared
+    extraVolumeMounts:
+    - name: jupyterhub-shared
+      mountPath: /home/shared
+      readOnly: false
+  serviceAccountName: ${jupyter_single_user_sa_name}
+  allowPrivilegeEscalation: true
+  extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account
+    securityContext:
+        fsGroup: 100
+  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
+    GRANT_SUDO: "yes"
+    NOTEBOOK_ARGS: "--allow-root"
+    CHOWN_HOME: "yes"
+    CHOWN_HOME_OPTS: "-R"
+    CHOWN_EXTRA: "/home/shared"
+    HUGGING_FACE_HUB_TOKEN:
+      valueFrom:
+        secretKeyRef:
+          name: hf-token
+          key: token
+  uid: 0
+  fsGid: 0
+  cmd: null
+
+# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+scheduling:
+  userScheduler:
+    enabled: true
+  podPriority:
+    enabled: true
+  userPlaceholder:
+    enabled: false
+    replicas: 1
+#  userPods:
+#    nodeAffinity:
+#      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
+
+prePuller:
+  hook:
+    enabled: false
+  continuous:
+    # NOTE: if used with Karpenter, also add user-placeholders
+    enabled: false
+
+global:
+  safeToShowValues: false
diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml
new file mode 100755
index 000000000..0d1fcdc4e
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml
@@ -0,0 +1,219 @@
+hub:
+  db:
+    pvc:
+      storage: 50Gi
+      storageClassName: gp3
+  authenticatePrometheus: false
+
+proxy:
+  https:
+    enabled: false
+    type: offload
+  service:
+    type: ClusterIP
+    # Disabled LoadBalancer type
+#    annotations:
+#      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "ssl_cert_arn"
+#      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
+#      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
+#      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
+#      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+#      service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+#      service.beta.kubernetes.io/aws-load-balancer-type: external
+#      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
+#      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
+singleuser:
+  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
+  profileList:
+    - display_name: Elyra (CPU)
+      description: "Elyra Notebooks | Karpenter Autoscaling"
+      kubespawner_override:
+        image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0
+        cpu_guarantee: 2
+        mem_guarantee: 8G
+        cpu_limit: 4
+        mem_limit: 8G
+      cmd: null
+    - display_name: Data Engineering (CPU)
+      description: "PySpark Notebooks | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pyspark350:
+              display_name: "PySpark 3.5.0 + Python 3.11"
+              default: true
+              kubespawner_override:
+                image: jupyter/pyspark-notebook:spark-3.5.0
+            pyspark341:
+              display_name: "PySpark 3.4.1 + Python 3.11"
+              kubespawner_override:
+                image: jupyter/pyspark-notebook:spark-3.4.1
+      kubespawner_override:
+        cpu_guarantee: 2
+        mem_guarantee: 8G
+        cpu_limit: 4
+        mem_limit: 8G
+      cmd: null
+    # NOTE:
+    - display_name: Trainium (trn1)
+      description: "Trainium | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch 1.13.1 + torch-neuronx"
+              default: true
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
+            tflow2101:
+              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
+      kubespawner_override:
+        tolerations:
+          - key: aws.amazon.com/neuron
+            operator: Exists
+            effect: NoSchedule
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        extra_resource_limits:
+          aws.amazon.com/neuron: "1"
+        cmd: "start-singleuser.sh"
+    - display_name: Inferentia (inf2)
+      description: "Inferentia | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch + torch-neuronx"
+              default: true
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
+            tflow2101:
+              display_name: "Tensorflow + tensorflow-neuronx"
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
+      kubespawner_override:
+        tolerations:
+          - key: aws.amazon.com/neuron
+            operator: Exists
+            effect: NoSchedule
+        cpu_guarantee: 20
+        mem_guarantee: 100G
+        cpu_limit: 20
+        mem_limit: 100G
+        extra_resource_limits:
+          aws.amazon.com/neuron: "1"
+        cmd: null
+    - display_name: Data Science (GPU + Time-Slicing - G5)
+      default: true
+      description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
+      kubespawner_override:
+        # namespace: data-team-a
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
+        node_selector:
+          NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode
+        cpu_limit: 2
+        mem_limit: 4G
+        cpu_guarantee: 2
+        mem_guarantee: 4G
+        cmd: "start-singleuser.sh"
+    # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1,  or nvidia.com/mig-2g.20gb: 1 etc.
+    # Hence, this profile relies on Managed node groups with GPU MIG enabled
+    - display_name: Data Science (GPU + MIG on P4d.24xlarge)
+      description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
+      kubespawner_override:
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_guarantees:
+          nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb
+        # extra_resource_limits:
+        #   nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        cmd: "start-singleuser.sh"
+    - display_name: Data Science (GPU - P4d.24xlarge)
+      description: "GPU with P4d instances | Karpenter Autoscaler"
+      kubespawner_override:
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "8"
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        cmd: "start-singleuser.sh"
+  storage:
+    type: "static"
+    static:
+      pvcName: "efs-persist"
+      subPath: "{username}"
+    extraVolumes:
+    - name: jupyterhub-shared
+      persistentVolumeClaim:
+        claimName: efs-persist-shared
+    extraVolumeMounts:
+    - name: jupyterhub-shared
+      mountPath: /home/shared
+  serviceAccountName: ${jupyter_single_user_sa_name}
+  allowPrivilegeEscalation: true
+  extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account
+    securityContext:
+        fsGroup: 100
+  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
+    GRANT_SUDO: "yes"
+    NOTEBOOK_ARGS: "--allow-root"
+    CHOWN_HOME: "yes"
+    CHOWN_HOME_OPTS: "-R"
+    CHOWN_EXTRA: "/home/shared"
+    HUGGING_FACE_HUB_TOKEN:
+      valueFrom:
+        secretKeyRef:
+          name: hf-token
+          key: token
+  uid: 0
+  fsGid: 0
+  cmd: null
+
+# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+scheduling:
+  userScheduler:
+    enabled: true
+  podPriority:
+    enabled: true
+  userPlaceholder:
+    enabled: false
+    replicas: 1
+#  userPods:
+#    nodeAffinity:
+#      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
+
+prePuller:
+  hook:
+    enabled: false
+  continuous:
+    # NOTE: if used with Karpenter, also add user-placeholders
+    enabled: false
+
+global:
+  safeToShowValues: false
diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml
new file mode 100755
index 000000000..486a750a8
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml
@@ -0,0 +1,232 @@
+hub:
+  db:
+    pvc:
+      storage: 50Gi
+      storageClassName: gp3
+  authenticatePrometheus: false
+  config:
+    GenericOAuthenticator: # the ${...} placeholders are filled in when this file is rendered; quoted so every value stays a YAML string
+      oauth_callback_url: "${jupyterdomain}"
+      client_id: "${client_id}"
+      client_secret: "${client_secret}"
+      authorize_url: "${authorize_url}"
+      token_url: "${token_url}"
+      userdata_url: "${userdata_url}"
+      scope:
+        - openid
+        - profile
+      username_key: "${username_key}"
+      login_service: "oauth"
+      allow_all: true # Allows all oauth authenticated users to use Jupyterhub. For finer grained control, you can use `allowed_users`: https://jupyterhub.readthedocs.io/en/stable/tutorial/getting-started/authenticators-users-basics.html#deciding-who-is-allowed
+    JupyterHub:
+      authenticator_class: generic-oauth
+proxy:
+  https:
+    enabled: true
+    type: offload
+  service:
+    annotations:
+      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "${ssl_cert_arn}" # quoted so the rendered certificate ARN is always a YAML string
+      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
+      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
+      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+      service.beta.kubernetes.io/aws-load-balancer-scheme: internal
+      service.beta.kubernetes.io/aws-load-balancer-type: external
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
+      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
+
+singleuser:
+  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
+  profileList:
+    - display_name: Elyra (CPU)
+      description: "Elyra Notebooks | Karpenter Autoscaling"
+      kubespawner_override:
+        image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0
+        cpu_guarantee: 2
+        mem_guarantee: 8G
+        cpu_limit: 4
+        mem_limit: 8G
+      cmd: null
+    - display_name: Data Engineering (CPU)
+      description: "PySpark Notebooks | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pyspark350:
+              display_name: "PySpark 3.5.0 + Python 3.11"
+              default: true
+              kubespawner_override:
+                image: jupyter/pyspark-notebook:spark-3.5.0
+            pyspark341:
+              display_name: "PySpark 3.4.1 + Python 3.11"
+              kubespawner_override:
+                image: jupyter/pyspark-notebook:spark-3.4.1
+      kubespawner_override:
+        cpu_guarantee: 2
+        mem_guarantee: 8G
+        cpu_limit: 4
+        mem_limit: 8G
+      cmd: null
+    # NOTE:
+    - display_name: Trainium (trn1)
+      description: "Trainium | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch 1.13.1 + torch-neuronx"
+              default: true
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
+            tflow2101:
+              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
+      kubespawner_override:
+        tolerations:
+          - key: aws.amazon.com/neuron
+            operator: Exists
+            effect: NoSchedule
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        extra_resource_limits:
+          aws.amazon.com/neuron: "1"
+        cmd: "start-singleuser.sh"
+    - display_name: Inferentia (inf2)
+      description: "Inferentia | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch + torch-neuronx"
+              default: true
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
+            tflow2101:
+              display_name: "Tensorflow + tensorflow-neuronx"
+              kubespawner_override:
+                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
+      kubespawner_override:
+        tolerations:
+          - key: aws.amazon.com/neuron
+            operator: Exists
+            effect: NoSchedule
+        cpu_guarantee: 20
+        mem_guarantee: 100G
+        cpu_limit: 20
+        mem_limit: 100G
+        extra_resource_limits:
+          aws.amazon.com/neuron: "1"
+        cmd: null
+    - display_name: Data Science (GPU + Time-Slicing - G5)
+      default: true
+      description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
+      kubespawner_override:
+        # namespace: data-team-a
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode
+        cpu_limit: 2
+        mem_limit: 4G
+        cpu_guarantee: 2
+        mem_guarantee: 4G
+        cmd: "start-singleuser.sh"
+    # Karpenter doesn't support requesting resources with MIG slices, e.g. nvidia.com/mig-1g.5gb: 1 or nvidia.com/mig-2g.20gb: 1.
+    # Hence, this profile relies on managed node groups with GPU MIG enabled.
+    - display_name: Data Science (GPU + MIG on P4d.24xlarge)
+      description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
+      kubespawner_override:
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_limits: # extended resources must be set as limits (the request defaults to the limit); a request-only spec is rejected
+          nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb
+        # TIME-SLICING alternative: replace the MIG resource above with a whole-GPU resource, e.g.
+        #   nvidia.com/gpu: "8"
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        cmd: "start-singleuser.sh"
+    - display_name: Data Science (GPU - P4d.24xlarge)
+      description: "GPU with P4d instances | Karpenter Autoscaler"
+      kubespawner_override:
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "8"
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        cmd: "start-singleuser.sh"
+  storage:
+    type: "static"
+    static:
+      pvcName: "efs-persist"
+      subPath: "home/{username}"
+    extraVolumes:
+      - name: jupyterhub-shared
+        persistentVolumeClaim:
+          claimName: efs-persist-shared
+    extraVolumeMounts:
+      - name: jupyterhub-shared
+        mountPath: /home/shared
+        readOnly: false
+  serviceAccountName: ${jupyter_single_user_sa_name}
+  allowPrivilegeEscalation: true
+  extraPodConfig: # Needed so the jovyan user running in each single-user pod can access the Service Account
+    securityContext:
+      fsGroup: 100
+  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
+    GRANT_SUDO: "yes"
+    NOTEBOOK_ARGS: "--allow-root"
+    CHOWN_HOME: "yes"
+    CHOWN_HOME_OPTS: "-R"
+    CHOWN_EXTRA: "/home/shared"
+    HUGGING_FACE_HUB_TOKEN:
+      valueFrom:
+        secretKeyRef:
+          name: hf-token
+          key: token
+  uid: 0
+  fsGid: 0
+  cmd: null
+
+# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+scheduling:
+  userScheduler:
+    enabled: true
+  podPriority:
+    enabled: true
+  userPlaceholder:
+    enabled: false
+    replicas: 1
+  userPods:
+    nodeAffinity:
+      matchNodePurpose: require # This will force single-user pods to use a specific Karpenter provisioner
+
+prePuller:
+  hook:
+    enabled: false
+  continuous:
+    # NOTE: if used with Karpenter, also add user-placeholders
+    enabled: false
+
+global:
+  safeToShowValues: false
diff --git a/ai-ml/infrastructure/terraform/jupyterhub.tf b/ai-ml/infrastructure/terraform/jupyterhub.tf
new file mode 100644
index 000000000..7170c9ff9
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/jupyterhub.tf
@@ -0,0 +1,163 @@
+#-----------------------------------------------------------------------------------------
+# JupyterHub Single User IRSA; this block could possibly be incorporated into the add-on registry
+#-----------------------------------------------------------------------------------------
+resource "kubernetes_namespace" "jupyterhub" {
+  metadata {
+    name = "jupyterhub"
+  }
+}
+
+module "jupyterhub_single_user_irsa" { # IAM role for the JupyterHub single-user service account (IRSA)
+  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+
+  role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa"
+
+  role_policy_arns = {
+    policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Replace with a policy scoped to what your notebook instances actually need to access.
+  }
+
+  oidc_providers = {
+    main = {
+      provider_arn               = module.eks.oidc_provider_arn
+      namespace_service_accounts = ["${kubernetes_namespace.jupyterhub.metadata[0].name}:${module.eks.cluster_name}-jupyterhub-single-user"] # must equal the name of kubernetes_service_account_v1.jupyterhub_single_user_sa; spelled out (not referenced) to avoid a dependency cycle
+    }
+  }
+}
+
+resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" { # service account in the jupyterhub namespace, annotated with the IRSA role
+  metadata {
+    name        = "${module.eks.cluster_name}-jupyterhub-single-user"
+    namespace   = kubernetes_namespace.jupyterhub.metadata[0].name
+    annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa.iam_role_arn } # role ARN comes from module.jupyterhub_single_user_irsa
+  }
+
+  automount_service_account_token = true
+}
+
+resource "kubernetes_secret_v1" "jupyterhub_single_user" {
+  metadata {
+    name      = "${module.eks.cluster_name}-jupyterhub-single-user-secret"
+    namespace = kubernetes_namespace.jupyterhub.metadata[0].name
+    annotations = {
+      "kubernetes.io/service-account.name"      = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
+      "kubernetes.io/service-account.namespace" = kubernetes_namespace.jupyterhub.metadata[0].name
+    }
+  }
+
+  type = "kubernetes.io/service-account-token"
+}
+
+#---------------------------------------------------------------
+# EFS Filesystem for private volumes per user
+# This will be replaced with Dynamic EFS provision using EFS CSI Driver
+#---------------------------------------------------------------
+resource "aws_efs_file_system" "efs" {
+  encrypted = true
+
+  tags = local.tags
+}
+
+#---------------------------------------------------------------
+# module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
+# We use indexes 2 and 3 to select the 100.x CIDR subnets in AZ1 and AZ2 respectively.
+# Create EFS mount target for the 3rd subnet
+resource "aws_efs_mount_target" "efs_mt_1" {
+  file_system_id  = aws_efs_file_system.efs.id
+  subnet_id       = module.vpc.private_subnets[2]
+  security_groups = [aws_security_group.efs.id]
+}
+
+# Create EFS mount target for the 4th subnet
+resource "aws_efs_mount_target" "efs_mt_2" {
+  file_system_id  = aws_efs_file_system.efs.id
+  subnet_id       = module.vpc.private_subnets[3]
+  security_groups = [aws_security_group.efs.id]
+}
+
+resource "aws_security_group" "efs" {
+  name        = "${local.name}-efs"
+  description = "Allow inbound NFS traffic from private subnets of the VPC"
+  vpc_id      = module.vpc.vpc_id
+
+  ingress {
+    description = "Allow NFS 2049/tcp"
+    cidr_blocks = module.vpc.vpc_secondary_cidr_blocks
+    from_port   = 2049
+    to_port     = 2049
+    protocol    = "tcp"
+  }
+
+  tags = local.tags
+}
+
+#---------------------------------------
+# EFS Configuration
+#---------------------------------------
+module "efs_config" {
+  source  = "aws-ia/eks-blueprints-addons/aws"
+  version = "~> 1.2"
+
+  cluster_name      = module.eks.cluster_name
+  cluster_endpoint  = module.eks.cluster_endpoint
+  cluster_version   = module.eks.cluster_version
+  oidc_provider_arn = module.eks.oidc_provider_arn
+
+  helm_releases = {
+    efs = {
+      name             = "efs"
+      description      = "A Helm chart for storage configurations"
+      namespace        = "jupyterhub"
+      create_namespace = false
+      chart            = "${path.module}/helm-values/efs"
+      chart_version    = "0.0.1"
+      values = [
+        <<-EOT
+          pv:
+            name: efs-persist
+            volumeHandle: ${aws_efs_file_system.efs.id}:/home
+          pvc:
+            name: efs-persist
+        EOT
+      ]
+    }
+    efs-shared = {
+      name             = "efs-shared"
+      description      = "A Helm chart for shared storage configurations"
+      namespace        = "jupyterhub"
+      create_namespace = false
+      chart            = "${path.module}/helm-values/efs"
+      chart_version    = "0.0.1"
+      values = [
+        <<-EOT
+          pv:
+            name: efs-persist-shared
+            volumeHandle: ${aws_efs_file_system.efs.id}:/shared
+          pvc:
+            name: efs-persist-shared
+        EOT
+      ]
+    }
+  }
+
+  depends_on = [kubernetes_namespace.jupyterhub]
+}
+#---------------------------------------------------------------
+# Additional Resources
+#---------------------------------------------------------------
+resource "kubernetes_secret_v1" "huggingface_token" {
+  metadata {
+    name      = "hf-token"
+    namespace = kubernetes_namespace.jupyterhub.metadata[0].name
+  }
+
+  data = {
+    token = var.huggingface_token
+  }
+}
+
+resource "kubernetes_config_map_v1" "notebook" {
+  metadata {
+    name      = "notebook"
+    namespace = kubernetes_namespace.jupyterhub.metadata[0].name
+  }
+}
diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf
index 60618324a..d768410c7 100644
--- a/ai-ml/infrastructure/terraform/variables.tf
+++ b/ai-ml/infrastructure/terraform/variables.tf
@@ -1,6 +1,6 @@
 variable "name" {
   description = "Name of the VPC and EKS Cluster"
-  default     = "ml-stack"
+  default     = "ai-stack"
   type        = string
 }
 
@@ -35,13 +35,18 @@ variable "secondary_cidr_blocks" {
 variable "enable_aws_cloudwatch_metrics" {
   description = "Enable AWS Cloudwatch Metrics addon"
   type        = bool
-  default     = true
+  default     = false
 }
 variable "bottlerocket_data_disk_snapshot_id" {
   description = "Bottlerocket Data Disk Snapshot ID"
   type        = string
   default     = ""
 }
+variable "enable_aws_efs_csi_driver" {
+  description = "Enable AWS EFS CSI Driver"
+  type        = bool
+  default     = false
+}
 variable "enable_aws_efa_k8s_device_plugin" {
   description = "Enable AWS EFA K8s Device Plugin"
   type        = bool
@@ -101,12 +106,12 @@ variable "enable_jupyterhub" {
 variable "enable_volcano" {
   description = "Enable Volcano"
   type        = bool
-  default     = true
+  default     = false
 }
 variable "enable_kuberay_operator" {
   description = "Enable KubeRay Operator"
   type        = bool
-  default     = true
+  default     = false
 }
 variable "huggingface_token" {
   description = "Hugging Face Secret Token"
@@ -114,3 +119,64 @@ variable "huggingface_token" {
   default     = "DUMMY_TOKEN_REPLACE_ME"
   sensitive   = true
 }
+
+# Jupyterhub Specific Variables
+
+# NOTE: You need to use private domain or public domain name with ACM certificate
+# Data-on-EKS website docs will show you how to create free public domain name with ACM certificate for testing purpose only
+# Example of public domain name(<subdomain-name>.<domain-name>.com): eks.jupyter-doeks.dynamic-dns.com
+variable "jupyter_hub_auth_mechanism" {
+  type        = string
+  description = "Allowed values: cognito, dummy, oauth"
+  default     = "dummy"
+}
+
+# The domain name is public, so make sure you use a unique value when deploying. Only needed if auth mechanism is set to cognito
+variable "cognito_custom_domain" {
+  description = "Cognito domain prefix for Hosted UI authentication endpoints"
+  type        = string
+  default     = "eks"
+}
+
+# Only needed if auth mechanism is set to cognito
+variable "acm_certificate_domain" {
+  type        = string
+  description = "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com"
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to cognito or oauth. This is the domain for jupyterhub
+variable "jupyterhub_domain" {
+  type        = string
+  description = "Enter domain name for jupyterhub to be hosted,  e.g. eks.example.com. Only needed if auth mechanism is set to cognito or oauth"
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to oauth. This is the root path for the oidc endpoints
+variable "oauth_domain" {
+  type        = string
+  description = "Enter oauth domain and endpoint, e.g. https://keycloak.example.com/realms/master/protocol/openid-connect. Only needed if auth mechanism is set to oauth"
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to oauth. This is the id of the client
+variable "oauth_jupyter_client_id" {
+  type        = string
+  description = "Enter oauth client id for jupyterhub, e.g. jupyterhub. Only needed if auth mechanism is set to oauth"
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to oauth. This is the secret for the client
+variable "oauth_jupyter_client_secret" {
+  type        = string
+  description = "Enter oauth client secret. Only needed if auth mechanism is set to oauth"
+  default     = ""
+  sensitive   = true
+}
+
+# Only needed if auth mechanism is set to oauth. This is the key to use for looking up the username.
+variable "oauth_username_key" {
+  type        = string
+  description = "oauth field for the username. e.g. 'preferred_username' Only needed if auth mechanism is set to oauth"
+  default     = ""
+}

From 07cffe3d7e3624deb34ee7c4c946eae460247ae5 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Wed, 29 Jan 2025 12:40:20 -0800
Subject: [PATCH 09/16] jark stack consolidation

---
 ai-ml/infrastructure/terraform/cleanup.sh | 10 +++-
 ai-ml/infrastructure/terraform/install.sh | 10 +++-
 ai-ml/jark-stack/install.sh               |  6 ++
 ai-ml/jark-stack/terraform/cleanup.sh     | 71 -----------------------
 ai-ml/jark-stack/terraform/install.sh     | 33 -----------
 ai-ml/jark-stack/terraform/variables.tf   | 60 -------------------
 6 files changed, 22 insertions(+), 168 deletions(-)
 create mode 100755 ai-ml/jark-stack/install.sh
 delete mode 100755 ai-ml/jark-stack/terraform/cleanup.sh
 delete mode 100755 ai-ml/jark-stack/terraform/install.sh
 delete mode 100644 ai-ml/jark-stack/terraform/variables.tf

diff --git a/ai-ml/infrastructure/terraform/cleanup.sh b/ai-ml/infrastructure/terraform/cleanup.sh
index b09efd384..bbf91142d 100755
--- a/ai-ml/infrastructure/terraform/cleanup.sh
+++ b/ai-ml/infrastructure/terraform/cleanup.sh
@@ -1,5 +1,11 @@
 #!/bin/bash
 
+TERRAFORM_COMMAND="terraform destroy -auto-approve"
+# Check if blueprint.tfvars exists
+if [ -f "blueprint.tfvars" ]; then
+  TERRAFORM_COMMAND="$TERRAFORM_COMMAND -var-file=blueprint.tfvars"
+fi
+
 echo "Destroying RayService..."
 
 # Delete the Ingress/SVC before removing the addons
@@ -25,7 +31,7 @@ targets=(
 for target in "${targets[@]}"
 do
   echo "Destroying module $target..."
-  destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1 | tee /dev/tty)
+  destroy_output=$($TERRAFORM_COMMAND -target="$target" 2>&1 | tee /dev/tty)
   if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
     echo "SUCCESS: Terraform destroy of $target completed successfully"
   else
@@ -62,7 +68,7 @@ for sg in $(aws ec2 describe-security-groups \
 
 ## Final destroy to catch any remaining resources
 echo "Destroying remaining resources..."
-destroy_output=$(terraform destroy -var="region=$region" -auto-approve 2>&1 | tee /dev/tty)
+destroy_output=$($TERRAFORM_COMMAND ${region:+-var="region=$region"} 2>&1 | tee /dev/tty) # pass the region override only when $region is set; an empty -var would replace the configured region
 if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
   echo "SUCCESS: Terraform destroy of all modules completed successfully"
 else
diff --git a/ai-ml/infrastructure/terraform/install.sh b/ai-ml/infrastructure/terraform/install.sh
index 1814a9044..af8345f6a 100755
--- a/ai-ml/infrastructure/terraform/install.sh
+++ b/ai-ml/infrastructure/terraform/install.sh
@@ -9,11 +9,17 @@ targets=(
 # Initialize Terraform
 terraform init -upgrade
 
+TERRAFORM_COMMAND="terraform apply -auto-approve"
+# Check if blueprint.tfvars exists
+if [ -f "blueprint.tfvars" ]; then
+  TERRAFORM_COMMAND="$TERRAFORM_COMMAND -var-file=blueprint.tfvars"
+fi
+
 # Apply modules in sequence
 for target in "${targets[@]}"
 do
   echo "Applying module $target..."
-  apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty)
+  apply_output=$( $TERRAFORM_COMMAND -target="$target" 2>&1 | tee /dev/tty)
   if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
     echo "SUCCESS: Terraform apply of $target completed successfully"
   else
@@ -24,7 +30,7 @@ done
 
 # Final apply to catch any remaining resources
 echo "Applying remaining resources..."
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty)
+apply_output=$( $TERRAFORM_COMMAND 2>&1 | tee /dev/tty)
 if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
   echo "SUCCESS: Terraform apply of all modules completed successfully"
 else
diff --git a/ai-ml/jark-stack/install.sh b/ai-ml/jark-stack/install.sh
new file mode 100755
index 000000000..77838a56e
--- /dev/null
+++ b/ai-ml/jark-stack/install.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Copy the base infrastructure into the folder; abort on failure so a missing ./terraform never makes us source this wrapper itself
+cp -r ../infrastructure/terraform/* ./terraform || exit 1
+
+cd terraform || exit 1
+source ./install.sh
diff --git a/ai-ml/jark-stack/terraform/cleanup.sh b/ai-ml/jark-stack/terraform/cleanup.sh
deleted file mode 100755
index b09efd384..000000000
--- a/ai-ml/jark-stack/terraform/cleanup.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/bash
-
-echo "Destroying RayService..."
-
-# Delete the Ingress/SVC before removing the addons
-TMPFILE=$(mktemp)
-terraform output -raw configure_kubectl > "$TMPFILE"
-# check if TMPFILE contains the string "No outputs found"
-if [[ ! $(cat $TMPFILE) == *"No outputs found"* ]]; then
-  echo "No outputs found, skipping kubectl delete"
-  source "$TMPFILE"
-  kubectl delete -f src/service/ray-service.yaml
-fi
-
-
-# List of Terraform modules to apply in sequence
-targets=(
-  "module.data_addons"
-  "module.eks_blueprints_addons"
-  "module.eks"
-  "module.vpc"
-)
-
-# Destroy modules in sequence
-for target in "${targets[@]}"
-do
-  echo "Destroying module $target..."
-  destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1 | tee /dev/tty)
-  if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
-    echo "SUCCESS: Terraform destroy of $target completed successfully"
-  else
-    echo "FAILED: Terraform destroy of $target failed"
-    exit 1
-  fi
-done
-
-echo "Destroying Load Balancers..."
-
-for arn in $(aws resourcegroupstaggingapi get-resources \
-  --resource-type-filters elasticloadbalancing:loadbalancer \
-  --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \
-  --query 'ResourceTagMappingList[].ResourceARN' \
-  --output text); do \
-    aws elbv2 delete-load-balancer --load-balancer-arn "$arn"; \
-  done
-
-echo "Destroying Target Groups..."
-for arn in $(aws resourcegroupstaggingapi get-resources \
-  --resource-type-filters elasticloadbalancing:targetgroup \
-  --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \
-  --query 'ResourceTagMappingList[].ResourceARN' \
-  --output text); do \
-    aws elbv2 delete-target-group --target-group-arn "$arn"; \
-  done
-
-echo "Destroying Security Groups..."
-for sg in $(aws ec2 describe-security-groups \
-  --filters "Name=tag:elbv2.k8s.aws/cluster,Values=jark-stack" \
-  --query 'SecurityGroups[].GroupId' --output text); do \
-    aws ec2 delete-security-group --group-id "$sg"; \
-  done
-
-## Final destroy to catch any remaining resources
-echo "Destroying remaining resources..."
-destroy_output=$(terraform destroy -var="region=$region" -auto-approve 2>&1 | tee /dev/tty)
-if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then
-  echo "SUCCESS: Terraform destroy of all modules completed successfully"
-else
-  echo "FAILED: Terraform destroy of all modules failed"
-  exit 1
-fi
diff --git a/ai-ml/jark-stack/terraform/install.sh b/ai-ml/jark-stack/terraform/install.sh
deleted file mode 100755
index 1814a9044..000000000
--- a/ai-ml/jark-stack/terraform/install.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-# List of Terraform modules to apply in sequence
-targets=(
-  "module.vpc"
-  "module.eks"
-)
-
-# Initialize Terraform
-terraform init -upgrade
-
-# Apply modules in sequence
-for target in "${targets[@]}"
-do
-  echo "Applying module $target..."
-  apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty)
-  if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-    echo "SUCCESS: Terraform apply of $target completed successfully"
-  else
-    echo "FAILED: Terraform apply of $target failed"
-    exit 1
-  fi
-done
-
-# Final apply to catch any remaining resources
-echo "Applying remaining resources..."
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty)
-if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-  echo "SUCCESS: Terraform apply of all modules completed successfully"
-else
-  echo "FAILED: Terraform apply of all modules failed"
-  exit 1
-fi
diff --git a/ai-ml/jark-stack/terraform/variables.tf b/ai-ml/jark-stack/terraform/variables.tf
deleted file mode 100644
index cfc27f17c..000000000
--- a/ai-ml/jark-stack/terraform/variables.tf
+++ /dev/null
@@ -1,60 +0,0 @@
-variable "name" {
-  description = "Name of the VPC and EKS Cluster"
-  default     = "jark-stack"
-  type        = string
-}
-
-# NOTE: Trainium and Inferentia are only available in us-west-2 and us-east-1 regions
-variable "region" {
-  description = "region"
-  default     = "us-west-2"
-  type        = string
-}
-
-variable "eks_cluster_version" {
-  description = "EKS Cluster version"
-  default     = "1.30"
-  type        = string
-}
-
-# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs
-variable "vpc_cidr" {
-  description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range"
-  default     = "10.1.0.0/21"
-  type        = string
-}
-
-# RFC6598 range 100.64.0.0/10
-# Note you can only /16 range to VPC. You can add multiples of /16 if required
-variable "secondary_cidr_blocks" {
-  description = "Secondary CIDR blocks to be attached to VPC"
-  default     = ["100.64.0.0/16"]
-  type        = list(string)
-}
-
-variable "huggingface_token" {
-  description = "Hugging Face Secret Token"
-  type        = string
-  default     = "DUMMY_TOKEN_REPLACE_ME"
-  sensitive   = true
-}
-
-variable "enable_aws_efa_k8s_device_plugin" {
-  description = "Enable AWS EFA K8s Device Plugin"
-  type        = bool
-  default     = false
-}
-
-variable "enable_kubecost" {
-  description = "Enable Kubecost addon"
-  type        = bool
-  default     = false
-}
-
-
-variable "bottlerocket_data_disk_snpashot_id" {
-  description = "Bottlerocket Data Disk Snapshot ID"
-  type        = string
-  default     = ""
-
-}

From 5566f806f1ef70dcddb466214beeea271234e604 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:04:56 -0800
Subject: [PATCH 10/16] consolidated blueprints

---
 ai-ml/bionemo/install.sh                      |  36 +-
 ai-ml/emr-spark-rapids/install.sh             |  35 +-
 .../monitoring/serviceMonitor-dcgm.yaml       |  26 +
 .../{terraform => }/src/app/Dockerfile        |   0
 .../jark-stack/{terraform => }/src/app/run.sh |   0
 .../{terraform => }/src/app/streamlit.py      |   0
 .../{terraform => }/src/app/streamlit.yaml    |   0
 .../{terraform => }/src/notebook/Dockerfile   |   0
 .../src/notebook/dogbooth.ipynb               |   0
 .../{terraform => }/src/service/Dockerfile    |   0
 .../{terraform => }/src/service/dogbooth.py   |   0
 .../src/service/ray-service.yaml              |   0
 ai-ml/jupyterhub/addons.tf                    | 539 ------------------
 ai-ml/jupyterhub/cleanup.sh                   |  51 --
 ai-ml/jupyterhub/cognito.tf                   | 224 --------
 .../helm/aws-for-fluentbit/values.yaml        |  80 ---
 .../helm/cluster-autoscaler/values.yaml       |  25 -
 ai-ml/jupyterhub/helm/efs/Chart.yaml          |   5 -
 .../jupyterhub/helm/efs/templates/efs-pv.yaml |  12 -
 .../helm/efs/templates/efs-pvc.yaml           |  11 -
 ai-ml/jupyterhub/helm/efs/values.yaml         |   5 -
 .../jupyterhub/jupyterhub-values-cognito.yaml | 304 ----------
 .../jupyterhub/jupyterhub-values-dummy.yaml   | 259 ---------
 .../jupyterhub/jupyterhub-values-oauth.yaml   | 273 ---------
 .../helm/kube-prometheus-stack/values.yaml    |  80 ---
 ai-ml/jupyterhub/helm/kubecost/values.yaml    |  65 ---
 .../helm/metrics-server/values.yaml           |  52 --
 ai-ml/jupyterhub/install.sh                   |  35 +-
 ai-ml/jupyterhub/jupyterhub.tf                | 143 -----
 ai-ml/jupyterhub/main.tf                      | 157 -----
 ai-ml/jupyterhub/outputs.tf                   |   4 -
 ai-ml/jupyterhub/variables.tf                 |  91 ---
 ai-ml/jupyterhub/versions.tf                  |  27 -
 ai-ml/jupyterhub/vpc.tf                       |  53 --
 ai-ml/mlflow/cleanup.sh                       |  45 --
 ai-ml/mlflow/install.sh                       |  39 +-
 ai-ml/mlflow/variables.tf                     |  44 --
 37 files changed, 42 insertions(+), 2678 deletions(-)
 create mode 100644 ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml
 rename ai-ml/jark-stack/{terraform => }/src/app/Dockerfile (100%)
 rename ai-ml/jark-stack/{terraform => }/src/app/run.sh (100%)
 rename ai-ml/jark-stack/{terraform => }/src/app/streamlit.py (100%)
 rename ai-ml/jark-stack/{terraform => }/src/app/streamlit.yaml (100%)
 rename ai-ml/jark-stack/{terraform => }/src/notebook/Dockerfile (100%)
 rename ai-ml/jark-stack/{terraform => }/src/notebook/dogbooth.ipynb (100%)
 rename ai-ml/jark-stack/{terraform => }/src/service/Dockerfile (100%)
 rename ai-ml/jark-stack/{terraform => }/src/service/dogbooth.py (100%)
 rename ai-ml/jark-stack/{terraform => }/src/service/ray-service.yaml (100%)
 delete mode 100755 ai-ml/jupyterhub/addons.tf
 delete mode 100755 ai-ml/jupyterhub/cleanup.sh
 delete mode 100644 ai-ml/jupyterhub/cognito.tf
 delete mode 100644 ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/efs/Chart.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/efs/values.yaml
 delete mode 100755 ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml
 delete mode 100755 ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml
 delete mode 100755 ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/kubecost/values.yaml
 delete mode 100644 ai-ml/jupyterhub/helm/metrics-server/values.yaml
 delete mode 100644 ai-ml/jupyterhub/jupyterhub.tf
 delete mode 100755 ai-ml/jupyterhub/main.tf
 delete mode 100755 ai-ml/jupyterhub/outputs.tf
 delete mode 100755 ai-ml/jupyterhub/variables.tf
 delete mode 100755 ai-ml/jupyterhub/versions.tf
 delete mode 100755 ai-ml/jupyterhub/vpc.tf
 delete mode 100755 ai-ml/mlflow/cleanup.sh
 delete mode 100644 ai-ml/mlflow/variables.tf

diff --git a/ai-ml/bionemo/install.sh b/ai-ml/bionemo/install.sh
index 8430565fc..77838a56e 100755
--- a/ai-ml/bionemo/install.sh
+++ b/ai-ml/bionemo/install.sh
@@ -1,34 +1,6 @@
 #!/bin/bash
+# Copy the base infrastructure into the folder (paths are relative: run from this blueprint's directory)
+cp -r ../infrastructure/terraform/* ./terraform || { echo "FAILED: could not copy base infrastructure into ./terraform" >&2; exit 1; }
 
-# List of Terraform modules to apply in sequence
-targets=(
-  "module.vpc"
-  "module.eks"
-)
-
-# Initialize Terraform
-echo "Initializing ..."
-terraform init --upgrade || echo "\"terraform init\" failed"
-
-# Apply modules in sequence
-for target in "${targets[@]}"
-do
-  echo "Applying module $target..."
-  apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty)
-  if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-    echo "SUCCESS: Terraform apply of $target completed successfully"
-  else
-    echo "FAILED: Terraform apply of $target failed"
-    exit 1
-  fi
-done
-
-# Final apply to catch any remaining resources
-echo "Applying remaining resources..."
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty)
-if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-  echo "SUCCESS: Terraform apply of all modules completed successfully"
-else
-  echo "FAILED: Terraform apply of all modules failed"
-  exit 1
-fi
+cd terraform || { echo "FAILED: could not enter ./terraform" >&2; exit 1; } # guard: after a failed cd the next line would re-source this very script
+source ./install.sh
diff --git a/ai-ml/emr-spark-rapids/install.sh b/ai-ml/emr-spark-rapids/install.sh
index b87db5117..77838a56e 100755
--- a/ai-ml/emr-spark-rapids/install.sh
+++ b/ai-ml/emr-spark-rapids/install.sh
@@ -1,33 +1,6 @@
 #!/bin/bash
+# Copy the base infrastructure into the folder (paths are relative: run from this blueprint's directory)
+cp -r ../infrastructure/terraform/* ./terraform || { echo "FAILED: could not copy base infrastructure into ./terraform" >&2; exit 1; }
 
-echo "Initializing ..."
-terraform init || echo "\"terraform init\" failed"
-
-# List of Terraform modules to apply in sequence
-targets=(
-  "module.vpc"
-  "module.eks"
-)
-
-# Apply modules in sequence
-for target in "${targets[@]}"
-do
-  echo "Applying module $target..."
-  apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty)
-  if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-    echo "SUCCESS: Terraform apply of $target completed successfully"
-  else
-    echo "FAILED: Terraform apply of $target failed"
-    exit 1
-  fi
-done
-
-# Final apply to catch any remaining resources
-echo "Applying remaining resources..."
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty)
-if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-  echo "SUCCESS: Terraform apply of all modules completed successfully"
-else
-  echo "FAILED: Terraform apply of all modules failed"
-  exit 1
-fi
+cd terraform || { echo "FAILED: could not enter ./terraform" >&2; exit 1; } # guard: after a failed cd the next line would re-source this very script
+source ./install.sh
diff --git a/ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml
new file mode 100644
index 000000000..681298d40
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml
@@ -0,0 +1,26 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrapes /metrics every 15s from Services labelled dcgm-exporter
+metadata:
+  labels:
+    release: kube-prometheus-stack  # presumably the label the Prometheus serviceMonitorSelector matches; confirm
+  name: dcgm-exporter
+  namespace: nvidia-device-plugin  # NOTE(review): defined here but selects Services in kube-system (below); confirm intended
+spec:
+  endpoints:
+    - honorLabels: false
+      interval: 15s
+      path: /metrics
+      port: metrics
+      relabelings:
+        - action: replace
+          sourceLabels:
+            - __meta_ec2_instance_id  # NOTE(review): __meta_ec2_* labels come from EC2 service discovery; likely unset for a ServiceMonitor target, confirm
+          targetLabel: instance
+  namespaceSelector:
+    matchNames:
+      - kube-system  # only Services in this namespace are considered
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: dcgm-exporter
+      app.kubernetes.io/instance: dcgm-exporter
+      app.kubernetes.io/name: dcgm-exporter
diff --git a/ai-ml/jark-stack/terraform/src/app/Dockerfile b/ai-ml/jark-stack/src/app/Dockerfile
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/app/Dockerfile
rename to ai-ml/jark-stack/src/app/Dockerfile
diff --git a/ai-ml/jark-stack/terraform/src/app/run.sh b/ai-ml/jark-stack/src/app/run.sh
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/app/run.sh
rename to ai-ml/jark-stack/src/app/run.sh
diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.py b/ai-ml/jark-stack/src/app/streamlit.py
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/app/streamlit.py
rename to ai-ml/jark-stack/src/app/streamlit.py
diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.yaml b/ai-ml/jark-stack/src/app/streamlit.yaml
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/app/streamlit.yaml
rename to ai-ml/jark-stack/src/app/streamlit.yaml
diff --git a/ai-ml/jark-stack/terraform/src/notebook/Dockerfile b/ai-ml/jark-stack/src/notebook/Dockerfile
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/notebook/Dockerfile
rename to ai-ml/jark-stack/src/notebook/Dockerfile
diff --git a/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb b/ai-ml/jark-stack/src/notebook/dogbooth.ipynb
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb
rename to ai-ml/jark-stack/src/notebook/dogbooth.ipynb
diff --git a/ai-ml/jark-stack/terraform/src/service/Dockerfile b/ai-ml/jark-stack/src/service/Dockerfile
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/service/Dockerfile
rename to ai-ml/jark-stack/src/service/Dockerfile
diff --git a/ai-ml/jark-stack/terraform/src/service/dogbooth.py b/ai-ml/jark-stack/src/service/dogbooth.py
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/service/dogbooth.py
rename to ai-ml/jark-stack/src/service/dogbooth.py
diff --git a/ai-ml/jark-stack/terraform/src/service/ray-service.yaml b/ai-ml/jark-stack/src/service/ray-service.yaml
similarity index 100%
rename from ai-ml/jark-stack/terraform/src/service/ray-service.yaml
rename to ai-ml/jark-stack/src/service/ray-service.yaml
diff --git a/ai-ml/jupyterhub/addons.tf b/ai-ml/jupyterhub/addons.tf
deleted file mode 100755
index ed8f7db44..000000000
--- a/ai-ml/jupyterhub/addons.tf
+++ /dev/null
@@ -1,539 +0,0 @@
-# Use this data source to get the ARN of a certificate in AWS Certificate Manager (ACM)
-data "aws_acm_certificate" "issued" {
-  count    = var.jupyter_hub_auth_mechanism != "dummy" ? 1 : 0
-  domain   = var.acm_certificate_domain
-  statuses = ["ISSUED"]
-}
-
-data "aws_ecrpublic_authorization_token" "token" {
-  provider = aws.ecr
-}
-
-locals {
-  cognito_custom_domain = var.cognito_custom_domain
-}
-
-#---------------------------------------------------------------
-# GP3 Encrypted Storage Class
-#---------------------------------------------------------------
-resource "kubernetes_annotations" "disable_gp2" {
-  annotations = {
-    "storageclass.kubernetes.io/is-default-class" : "false"
-  }
-  api_version = "storage.k8s.io/v1"
-  kind        = "StorageClass"
-  metadata {
-    name = "gp2"
-  }
-  force = true
-
-  depends_on = [module.eks.eks_cluster_id]
-}
-
-resource "kubernetes_storage_class" "default_gp3" {
-  metadata {
-    name = "gp3"
-    annotations = {
-      "storageclass.kubernetes.io/is-default-class" : "true"
-    }
-  }
-
-  storage_provisioner    = "ebs.csi.aws.com"
-  reclaim_policy         = "Delete"
-  allow_volume_expansion = true
-  volume_binding_mode    = "WaitForFirstConsumer"
-  parameters = {
-    fsType    = "ext4"
-    encrypted = true
-    type      = "gp3"
-  }
-
-  depends_on = [kubernetes_annotations.disable_gp2]
-}
-
-#---------------------------------------------------------------
-# IRSA for EBS CSI Driver
-#---------------------------------------------------------------
-module "ebs_csi_driver_irsa" {
-  source                = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
-  version               = "~> 5.20"
-  role_name_prefix      = format("%s-%s", local.name, "ebs-csi-driver-")
-  attach_ebs_csi_policy = true
-  oidc_providers = {
-    main = {
-      provider_arn               = module.eks.oidc_provider_arn
-      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
-    }
-  }
-  tags = local.tags
-}
-
-module "eks_blueprints_addons" {
-  source  = "aws-ia/eks-blueprints-addons/aws"
-  version = "~> 1.2"
-
-  cluster_name      = module.eks.cluster_name
-  cluster_endpoint  = module.eks.cluster_endpoint
-  cluster_version   = module.eks.cluster_version
-  oidc_provider_arn = module.eks.oidc_provider_arn
-  #---------------------------------------
-  # Amazon EKS Managed Add-ons
-  #---------------------------------------
-  eks_addons = {
-    aws-ebs-csi-driver = {
-      service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
-    }
-    coredns    = {}
-    kube-proxy = {}
-    # VPC CNI uses worker node IAM role policies
-    vpc-cni = {}
-  }
-
-  #---------------------------------------
-  # Metrics Server
-  #---------------------------------------
-  enable_metrics_server = true
-  metrics_server = {
-    timeout = "300"
-    values  = [templatefile("${path.module}/helm/metrics-server/values.yaml", {})]
-  }
-
-  #---------------------------------------
-  # Cluster Autoscaler
-  #---------------------------------------
-  enable_cluster_autoscaler = true
-  cluster_autoscaler = {
-    timeout     = "300"
-    create_role = true
-    values = [templatefile("${path.module}/helm/cluster-autoscaler/values.yaml", {
-      aws_region     = var.region,
-      eks_cluster_id = module.eks.cluster_name
-    })]
-  }
-
-  #---------------------------------------
-  # Karpenter Autoscaler for EKS Cluster
-  #---------------------------------------
-  enable_karpenter                  = true
-  karpenter_enable_spot_termination = true
-  karpenter_node = {
-    iam_role_additional_policies = {
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-  karpenter = {
-    chart_version       = "0.37.0"
-    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-    repository_password = data.aws_ecrpublic_authorization_token.token.password
-  }
-
-  #---------------------------------------
-  # AWS Load Balancer Controller Add-on
-  #---------------------------------------
-  enable_aws_load_balancer_controller = true
-  # turn off the mutating webhook for services because we are using
-  # service.beta.kubernetes.io/aws-load-balancer-type: external
-  aws_load_balancer_controller = {
-    set = [{
-      name  = "enableServiceMutatorWebhook"
-      value = "false"
-    }]
-  }
-
-  #---------------------------------------
-  # Prometheus and Grafana stack
-  #---------------------------------------
-  #---------------------------------------------------------------
-  # Install Monitoring Stack with Prometheus and Grafana
-  # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
-  # 2- Grafana Admin user: admin
-  # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id <output.grafana_secret_name> --region $AWS_REGION --query "SecretString" --output text`
-  #---------------------------------------------------------------
-  enable_kube_prometheus_stack = true
-  kube_prometheus_stack = {
-    values        = [templatefile("${path.module}/helm/kube-prometheus-stack/values.yaml", {})]
-    chart_version = "48.1.1"
-    set_sensitive = [
-      {
-        name  = "grafana.adminPassword"
-        value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
-      }
-    ],
-  }
-  #---------------------------------------
-  # AWS for FluentBit
-  #---------------------------------------
-  enable_aws_for_fluentbit = true
-  aws_for_fluentbit_cw_log_group = {
-    use_name_prefix   = false
-    name              = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group
-    retention_in_days = 30
-  }
-  aws_for_fluentbit = {
-    values = [templatefile("${path.module}/helm/aws-for-fluentbit/values.yaml", {
-      region               = local.region,
-      cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs"
-      cluster_name         = module.eks.cluster_name
-    })]
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# Data on EKS Kubernetes Addons
-#---------------------------------------------------------------
-module "eks_data_addons" {
-  source  = "aws-ia/eks-data-addons/aws"
-  version = "1.33.0" # ensure to update this to the latest/desired version
-
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  #---------------------------------------------------------------
-  # Enable Neuron Device Plugin
-  #---------------------------------------------------------------
-  enable_aws_neuron_device_plugin = true
-
-  #---------------------------------------------------------------
-  # NVIDIA Device Plugin Add-on
-  #---------------------------------------------------------------
-  enable_nvidia_device_plugin = true
-  nvidia_device_plugin_helm_config = {
-    version = "v0.15.0"
-    name    = "nvidia-device-plugin"
-    values = [
-      <<-EOT
-        mixedStrategy: "mixed"
-        config:
-          map:
-            default: |-
-              version: v1
-              flags:
-                migStrategy: none
-              sharing:
-                timeSlicing:
-                  resources:
-                  - name: nvidia.com/gpu
-                    replicas: 4
-            nvidia-a100g: |-
-              version: v1
-              flags:
-                migStrategy: mixed
-              sharing:
-                timeSlicing:
-                  resources:
-                  - name: nvidia.com/gpu
-                    replicas: 8
-                  - name: nvidia.com/mig-1g.5gb
-                    replicas: 2
-                  - name: nvidia.com/mig-2g.10gb
-                    replicas: 2
-                  - name: nvidia.com/mig-3g.20gb
-                    replicas: 3
-                  - name: nvidia.com/mig-7g.40gb
-                    replicas: 7
-        gfd:
-          enabled: true
-        nfd:
-          worker:
-            tolerations:
-              - key: nvidia.com/gpu
-                operator: Exists
-                effect: NoSchedule
-              - operator: "Exists"
-              - key: "hub.jupyter.org/dedicated"
-                operator: "Equal"
-                value: "user"
-                effect: "NoSchedule"
-        tolerations:
-          - key: CriticalAddonsOnly
-            operator: Exists
-          - key: nvidia.com/gpu
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated"
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-      EOT
-    ]
-  }
-
-  #---------------------------------------------------------------
-  # JupyterHub Add-on
-  #---------------------------------------------------------------
-  enable_jupyterhub = true
-  jupyterhub_helm_config = {
-    values = [templatefile("${path.module}/helm/jupyterhub/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", {
-      ssl_cert_arn                = try(data.aws_acm_certificate.issued[0].arn, "")
-      jupyterdomain               = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "")
-      authorize_url               = var.oauth_domain != "" ? "${var.oauth_domain}/auth" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "")
-      token_url                   = var.oauth_domain != "" ? "${var.oauth_domain}/token" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/token", "")
-      userdata_url                = var.oauth_domain != "" ? "${var.oauth_domain}/userinfo" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "")
-      username_key                = try(var.oauth_username_key, "")
-      client_id                   = var.oauth_jupyter_client_id != "" ? var.oauth_jupyter_client_id : try(aws_cognito_user_pool_client.user_pool_client[0].id, "")
-      client_secret               = var.oauth_jupyter_client_secret != "" ? var.oauth_jupyter_client_secret : try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "")
-      user_pool_id                = try(aws_cognito_user_pool.pool[0].id, "")
-      identity_pool_id            = try(aws_cognito_identity_pool.identity_pool[0].id, "")
-      jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
-      region                      = var.region
-    })]
-    version = "3.2.1"
-  }
-
-  #---------------------------------------------------------------
-  # Kubecost Add-on
-  #---------------------------------------------------------------
-  enable_kubecost = true
-  kubecost_helm_config = {
-    values              = [templatefile("${path.module}/helm/kubecost/values.yaml", {})]
-    repository_username = data.aws_ecrpublic_authorization_token.token.user_name
-    repository_password = data.aws_ecrpublic_authorization_token.token.password
-  }
-
-  #---------------------------------------------------------------
-  # Karpenter Resources Add-on
-  #---------------------------------------------------------------
-  enable_karpenter_resources = true
-  karpenter_resources_helm_config = {
-    karpenter-resources-ts = {
-      values = [
-        <<-EOT
-      name: gpu-ts
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodePool: gpu-ts
-          - hub.jupyter.org/node-purpose: user
-        taints:
-          - key: hub.jupyter.org/dedicated
-            value: "user"
-            effect: "NoSchedule"
-          - key: nvidia.com/gpu
-            value: "Exists"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["g5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: ["2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 60s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    karpenter-resources-mig = {
-      values = [
-        <<-EOT
-      name: gpu-mig
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodePool: gpu-mig
-          - hub.jupyter.org/node-purpose: user
-        taints:
-          - key: hub.jupyter.org/dedicated
-            value: "user"
-            effect: "NoSchedule"
-          - key: nvidia.com/gpu
-            value: "Exists"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["p4d"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: ["24xlarge"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 60s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    karpenter-resources-inf = {
-      values = [
-        <<-EOT
-      name: inferentia
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodePool: inferentia
-          - hub.jupyter.org/node-purpose: user
-        taints:
-          - key: aws.amazon.com/neuroncore
-            value: "true"
-            effect: "NoSchedule"
-          - key: aws.amazon.com/neuron
-            value: "true"
-            effect: "NoSchedule"
-          - key: hub.jupyter.org/dedicated
-            value: "user"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["inf2"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: ["8xlarge", "24xlarge"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 60s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    karpenter-resources-trn = {
-      values = [
-        <<-EOT
-      name: trainium
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[2]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodePool: trainium
-          - hub.jupyter.org/node-purpose: user
-        taints:
-          - key: aws.amazon.com/neuroncore
-            value: "true"
-            effect: "NoSchedule"
-          - key: aws.amazon.com/neuron
-            value: "true"
-            effect: "NoSchedule"
-          - key: hub.jupyter.org/dedicated
-            value: "user"
-            effect: "NoSchedule"
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["trn1"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: ["32xlarge"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 60s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-    x86-cpu-karpenter = {
-      values = [
-        <<-EOT
-      name: x86-cpu-karpenter
-      clusterName: ${module.eks.cluster_name}
-      ec2NodeClass:
-        karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
-        subnetSelectorTerms:
-          id: ${module.vpc.private_subnets[3]}
-        securityGroupSelectorTerms:
-          tags:
-            Name: ${module.eks.cluster_name}-node
-        instanceStorePolicy: RAID0
-
-      nodePool:
-        labels:
-          - type: karpenter
-          - NodePool: default
-          - hub.jupyter.org/node-purpose: user
-        requirements:
-          - key: "karpenter.k8s.aws/instance-family"
-            operator: In
-            values: ["m5"]
-          - key: "karpenter.k8s.aws/instance-size"
-            operator: In
-            values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"]
-        limits:
-          cpu: 1000
-        disruption:
-          consolidationPolicy: WhenEmpty
-          consolidateAfter: 60s
-          expireAfter: 720h
-        weight: 100
-      EOT
-      ]
-    }
-  }
-}
-
-#---------------------------------------------------------------
-# Grafana Admin credentials resources
-#---------------------------------------------------------------
-data "aws_secretsmanager_secret_version" "admin_password_version" {
-  secret_id  = aws_secretsmanager_secret.grafana.id
-  depends_on = [aws_secretsmanager_secret_version.grafana]
-}
-
-resource "random_password" "grafana" {
-  length           = 16
-  special          = true
-  override_special = "@_"
-}
-
-#tfsec:ignore:aws-ssm-secret-use-customer-key
-resource "aws_secretsmanager_secret" "grafana" {
-  name_prefix             = "${local.name}-grafana-"
-  recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
-}
-
-resource "aws_secretsmanager_secret_version" "grafana" {
-  secret_id     = aws_secretsmanager_secret.grafana.id
-  secret_string = random_password.grafana.result
-}
diff --git a/ai-ml/jupyterhub/cleanup.sh b/ai-ml/jupyterhub/cleanup.sh
deleted file mode 100755
index 8438ddf84..000000000
--- a/ai-ml/jupyterhub/cleanup.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-set -o errexit
-set -o pipefail
-
-targets=(
-  "module.eks_data_addons"
-  "module.eks_blueprints_addons"
-  "module.eks"
-  "module.vpc"
-)
-
-#-------------------------------------------
-# Helpful to delete the stuck in "Terminating" namespaces
-# Rerun the cleanup.sh script to detect and delete the stuck resources
-#-------------------------------------------
-terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name')
-
-# If there are no terminating namespaces, exit the script
-if [[ -z $terminating_namespaces ]]; then
-    echo "No terminating namespaces found"
-fi
-
-for ns in $terminating_namespaces; do
-    echo "Terminating namespace: $ns"
-    kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f -
-done
-
-#-------------------------------------------
-# Terraform destroy per module target
-#-------------------------------------------
-for target in "${targets[@]}"
-do
-  destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty)
-  if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-    echo "SUCCESS: Terraform destroy of $target completed successfully"
-  else
-    echo "FAILED: Terraform destroy of $target failed"
-    exit 1
-  fi
-done
-
-#-------------------------------------------
-# Terraform destroy full
-#-------------------------------------------
-destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty)
-if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-  echo "SUCCESS: Terraform destroy of all targets completed successfully"
-else
-  echo "FAILED: Terraform destroy of all targets failed"
-  exit 1
-fi
diff --git a/ai-ml/jupyterhub/cognito.tf b/ai-ml/jupyterhub/cognito.tf
deleted file mode 100644
index 57338986b..000000000
--- a/ai-ml/jupyterhub/cognito.tf
+++ /dev/null
@@ -1,224 +0,0 @@
-#---------------------------------------------------------------
-# Lambda function for pre token generation
-#----------------------------------------------------------------
-
-data "aws_iam_policy_document" "assume_role" {
-  statement {
-    effect = "Allow"
-    principals {
-      type        = "Service"
-      identifiers = ["lambda.amazonaws.com", "cognito-idp.amazonaws.com"]
-    }
-    actions = ["sts:AssumeRole"]
-  }
-}
-
-data "aws_iam_policy" "lambda_execution_policy" {
-  arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
-}
-
-resource "aws_iam_role" "iam_for_lambda" {
-  count              = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  name               = "iam_for_lambda"
-  assume_role_policy = data.aws_iam_policy_document.assume_role.json
-}
-
-resource "aws_iam_role_policy_attachment" "lambda_policy_attachment" {
-  count      = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  role       = aws_iam_role.iam_for_lambda[0].name
-  policy_arn = data.aws_iam_policy.lambda_execution_policy.arn
-}
-
-data "archive_file" "lambda" {
-  type        = "zip"
-  output_path = "/tmp/lambda.zip"
-  source {
-    filename = "index.mjs"
-    content  = <<-EOF
-    export const handler = async (event) => {
-        event.response = {
-          claimsOverrideDetails: {
-            claimsToAddOrOverride: {
-              department: "engineering",
-            },
-          },
-        };
-
-        return event;
-    };
-
-    EOF
-  }
-}
-
-resource "aws_lambda_function" "pretoken_trigger" {
-  count            = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  function_name    = "pretoken-trigger-function"
-  filename         = data.archive_file.lambda.output_path
-  source_code_hash = data.archive_file.lambda.output_base64sha256
-
-  runtime = "nodejs18.x"
-  handler = "index.handler"
-
-  role = aws_iam_role.iam_for_lambda[0].arn
-}
-
-#---------------------------------------------------------------
-# Cognito pool, domain and client creation.
-# This can be used
-# Auth integration later.
-#----------------------------------------------------------------
-resource "aws_cognito_user_pool" "pool" {
-  count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  name  = "jupyterhub-userpool"
-
-  username_attributes      = ["email"]
-  auto_verified_attributes = ["email"]
-
-  password_policy {
-    minimum_length = 6
-  }
-
-  lambda_config {
-    pre_token_generation = aws_lambda_function.pretoken_trigger[0].arn
-  }
-}
-
-resource "aws_cognito_user_pool_domain" "domain" {
-  count        = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  domain       = local.cognito_custom_domain
-  user_pool_id = aws_cognito_user_pool.pool[0].id
-}
-
-resource "aws_cognito_user_pool_client" "user_pool_client" {
-  count                 = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  name                  = "jupyter-client"
-  access_token_validity = 1
-  token_validity_units {
-    access_token = "days"
-  }
-  callback_urls                        = ["https://${var.jupyterhub_domain}/hub/oauth_callback"]
-  user_pool_id                         = aws_cognito_user_pool.pool[0].id
-  allowed_oauth_flows_user_pool_client = true
-  allowed_oauth_flows                  = ["code"]
-  allowed_oauth_scopes                 = ["openid", "email"]
-  generate_secret                      = true
-  supported_identity_providers = [
-    "COGNITO"
-  ]
-
-  depends_on = [aws_cognito_user_pool_domain.domain]
-}
-
-#---------------------------------------------------------------
-# Cognito identity pool creation.
-#----------------------------------------------------------------
-resource "aws_cognito_identity_pool" "identity_pool" {
-  count                            = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  identity_pool_name               = "jupyterhub-identity-pool"
-  allow_unauthenticated_identities = false
-  cognito_identity_providers {
-    client_id               = aws_cognito_user_pool_client.user_pool_client[0].id
-    provider_name           = aws_cognito_user_pool.pool[0].endpoint
-    server_side_token_check = true
-  }
-
-  depends_on = [aws_cognito_user_pool_client.user_pool_client]
-}
-
-resource "aws_s3_bucket" "jupyterhub_bucket" {
-  count         = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  bucket_prefix = "jupyterhub-test-bucket-"
-}
-
-resource "aws_s3_object" "engineering_object" {
-  count  = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  bucket = aws_s3_bucket.jupyterhub_bucket[0].id
-  key    = "engineering/"
-}
-
-resource "aws_s3_object" "legal_object" {
-  count  = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  bucket = aws_s3_bucket.jupyterhub_bucket[0].id
-  key    = "legal/"
-}
-
-#---------------------------------------------------------------
-# IAM role for a team member from the engineering department
-# In theory there would be other departments such as "legal"
-#----------------------------------------------------------------
-resource "aws_iam_role" "cognito_authenticated_engineering_role" {
-  count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-
-  name = "EngineeringTeamRole"
-
-  assume_role_policy = jsonencode({
-    Version = "2012-10-17",
-    Statement = [
-      {
-        Action = ["sts:AssumeRoleWithWebIdentity", "sts:TagSession"],
-        Effect = "Allow",
-        Principal = {
-          Federated = "cognito-identity.amazonaws.com"
-        },
-        Condition = {
-          StringEquals = {
-            "cognito-identity.amazonaws.com:aud" = aws_cognito_identity_pool.identity_pool[0].id
-          },
-          "ForAnyValue:StringLike" : {
-            "cognito-identity.amazonaws.com:amr" : "authenticated"
-          }
-        }
-      }
-    ]
-  })
-}
-
-resource "aws_iam_role_policy" "s3_cognito_engineering_policy" {
-  count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  name  = "s3_cognito_engineering_policy"
-  role  = aws_iam_role.cognito_authenticated_engineering_role[0].id
-
-  policy = <<-EOF
-{
-  "Version": "2012-10-17",
-  "Statement": [
-    {
-      "Effect": "Allow",
-      "Action": ["s3:List*"],
-      "Resource": "*",
-      "Condition": {
-        "StringEquals": {
-          "s3:prefix": "$${aws:PrincipalTag/department}"
-        }
-      }
-    }
-  ]
-}
-EOF
-}
-
-resource "aws_cognito_identity_pool_provider_principal_tag" "example" {
-  count                  = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  identity_pool_id       = aws_cognito_identity_pool.identity_pool[0].id
-  identity_provider_name = aws_cognito_user_pool.pool[0].endpoint
-  use_defaults           = false
-  principal_tags = {
-    department = "department"
-  }
-}
-
-resource "aws_iam_policy_attachment" "s3_readonly_policy_attachment" {
-  count      = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  name       = "S3ReadOnlyAccessAttachment"
-  policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
-  roles      = [aws_iam_role.cognito_authenticated_engineering_role[0].name]
-}
-
-resource "aws_cognito_identity_pool_roles_attachment" "identity_pool_roles" {
-  count            = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
-  identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id
-  roles = {
-    authenticated = aws_iam_role.cognito_authenticated_engineering_role[0].arn
-  }
-}
diff --git a/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml b/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml
deleted file mode 100644
index 0f05a308b..000000000
--- a/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml
+++ /dev/null
@@ -1,80 +0,0 @@
-global:
-
-#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server
-# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata
-hostNetwork: true
-dnsPolicy: ClusterFirstWithHostNet
-
-service:
-  parsersFiles:
-    - /fluent-bit/parsers/parsers.conf
-  extraParsers: |
-    [PARSER]
-        Name    kubernetes
-        Format  regex
-        Regex   ^(?<namespace_name>[^_]+)\.(?<container_name>.+)\.(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?<docker_id>[a-z0-9]{64})-$
-
-input:
-  name: "tail"
-  enabled: true
-  tag: "systempods.<namespace_name>.<container_name>.<pod_name>.<docker_id>-"
-  path: "/var/log/containers/*.log"
-  db: "/var/log/flb_kube.db"
-  memBufLimit: 5MB
-  skipLongLines: "On"
-  refreshInterval: 10
-  extraInputs: |
-    multiline.parser  docker, cri
-    Tag_Regex         (?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
-
-
-# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters
-filter:
-  name: "kubernetes"
-  match: "systempods.*"
-  kubeURL: "https://kubernetes.default.svc.cluster.local:443"
-  mergeLog: "On"
-  mergeLogKey: "log_processed"
-  keepLog: "On"
-  k8sLoggingParser: "On"
-  k8sLoggingExclude: "Off"
-  bufferSize: "0"
-  extraFilters: |
-    Kube_Tag_Prefix     systempods.
-    Regex_Parser        kubernetes
-    Labels              On
-    Annotations         Off
-    Use_Kubelet         true
-    Kubelet_Port        10250
-    Kube_CA_File        /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-    Kube_Token_File     /var/run/secrets/kubernetes.io/serviceaccount/token
-
-# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance.
-# cloudWatch:
-#   enabled: false
-
-# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch
-cloudWatchLogs:
-  enabled: true
-  match: "systempods.*"
-  region: ${region}
-  logGroupName: ${cloudwatch_log_group}
-  autoCreateGroup: false
-  extraOutputs: |
-    log_key               log
-
-# Resource config for large clusters
-resources:
-  limits:
-    cpu: 1000m
-    memory: 1500Mi
-  requests:
-    cpu: 500m
-    memory: 500Mi
-
-## Assign a PriorityClassName to pods if set
-priorityClassName: system-node-critical
-
-# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints.
-tolerations:
-  - operator: Exists
diff --git a/ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml b/ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml
deleted file mode 100644
index 5a42794f2..000000000
--- a/ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-autoDiscovery:
-  clusterName: ${eks_cluster_id}
-
-awsRegion: ${aws_region}
-
-cloudProvider: aws
-
-extraArgs:
-  aws-use-static-instance-list: true
-
-# Best practice to update the resource requests and limits for each add-on
-resources:
-   limits:
-     cpu: 1000m
-     memory: 1G
-   requests:
-     cpu: 200m
-     memory: 512Mi
-
-# Best practice to updateStrategy for each add-on
-updateStrategy:
-  type: RollingUpdate
-  rollingUpdate:
-    maxSurge: 0
-    maxUnavailable: 1
diff --git a/ai-ml/jupyterhub/helm/efs/Chart.yaml b/ai-ml/jupyterhub/helm/efs/Chart.yaml
deleted file mode 100644
index e69ed7f3d..000000000
--- a/ai-ml/jupyterhub/helm/efs/Chart.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-apiVersion: v2
-name: efs
-description: Helm chart for efs options on the cluster
-version: 0.0.1
-appVersion: 0.0.1
diff --git a/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml b/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml
deleted file mode 100644
index c10646f80..000000000
--- a/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: v1
-kind: PersistentVolume
-metadata:
-  name: {{ .Values.pv.name }}
-spec:
-  capacity:
-    storage: 123Gi
-  accessModes:
-    - ReadWriteMany
-  nfs:
-    server: {{ .Values.pv.dnsName }}
-    path: "/"
diff --git a/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml b/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml
deleted file mode 100644
index cd0a962d9..000000000
--- a/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: {{ .Values.pvc.name }}
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: ""
-  resources:
-    requests:
-      storage: 1Gi
diff --git a/ai-ml/jupyterhub/helm/efs/values.yaml b/ai-ml/jupyterhub/helm/efs/values.yaml
deleted file mode 100644
index 703735ddd..000000000
--- a/ai-ml/jupyterhub/helm/efs/values.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-pv:
-  name: efs-persist
-  dnsName:
-pvc:
-  name: efs-persist
diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml
deleted file mode 100755
index 5088511c0..000000000
--- a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml
+++ /dev/null
@@ -1,304 +0,0 @@
-hub:
-  db:
-    pvc:
-      storage: 50Gi
-      storageClassName: gp3
-  authenticatePrometheus: false
-  command: ["sh", "-c", "pip install boto3 && jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py"]
-  config:
-    GenericOAuthenticator:
-      oauth_callback_url: ${jupyterdomain}
-      client_id: ${client_id}
-      client_secret: ${client_secret}
-      authorize_url: ${authorize_url}
-      token_url: ${token_url}
-      userdata_url: ${userdata_url}
-      scope:
-        - openid
-        - email
-      username_key: "username"
-      login_service : "AWS Cognito"
-      userdata_method: "POST"
-    JupyterHub:
-      authenticator_class: generic-oauth
-  extraConfig:
-    jupyterhub_config.py: |-
-      c.KubeSpawner.start_timeout = 1200
-      c.Authenticator.enable_auth_state = True
-
-    cognito_config.py: |-
-      import boto3
-      def auth_state_hook(spawner, auth_state):
-        client_idp = boto3.client('cognito-idp', region_name="${region}")
-        auth_response = client_idp.initiate_auth(
-          AuthFlow="REFRESH_TOKEN_AUTH",
-          AuthParameters={
-            "REFRESH_TOKEN": auth_state['refresh_token'],
-            "SECRET_HASH": "${client_secret}"
-          },
-          ClientId="${client_id}"
-        )
-        id_token = auth_response["AuthenticationResult"]["IdToken"]
-        client_identity = boto3.client("cognito-identity", region_name="${region}")
-        identity_response = client_identity.get_id(
-          IdentityPoolId="${identity_pool_id}",
-          Logins={
-            f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token
-          }
-        )
-        identity_id = identity_response['IdentityId']
-        credentials = client_identity.get_credentials_for_identity(
-          IdentityId=identity_id,
-          Logins={
-            f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token
-          }
-        )
-        key = credentials["Credentials"]["AccessKeyId"]
-        secret = credentials["Credentials"]["SecretKey"]
-        token = credentials["Credentials"]["SessionToken"]
-        spawner.environment['AWS_ACCESS_KEY_ID'] = key
-        spawner.environment['AWS_SECRET_ACCESS_KEY'] = secret
-        spawner.environment['AWS_SESSION_TOKEN'] = token
-
-      c.Spawner.auth_state_hook = auth_state_hook
-
-proxy:
-  https:
-    enabled: true
-    type: offload
-  service:
-    annotations:
-      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn}
-      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
-      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
-      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
-      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-      service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-      service.beta.kubernetes.io/aws-load-balancer-type: external
-      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
-      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
-
-singleuser:
-  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
-  profileList:
-    - display_name: Data Engineering (CPU)
-      description: "PySpark Notebooks | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pyspark350:
-              display_name: "PySpark 3.5.0 + Python 3.11"
-              default: true
-              kubespawner_override:
-                image: jupyter/pyspark-notebook:spark-3.5.0
-            pyspark341:
-              display_name: "PySpark 3.4.1 + Python 3.11"
-              kubespawner_override:
-                image: jupyter/pyspark-notebook:spark-3.4.1
-      kubespawner_override:
-        node_selector:
-          NodePool: default
-        cpu_guarantee: 2
-        mem_guarantee: 8G
-        cpu_limit: 4
-        mem_limit: 8G
-      cmd: null
-    # NOTE:
-    - display_name: Trainium (trn1)
-      description: "Trainium | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch 1.13.1 + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          NodePool: trainium
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "1"
-        cmd: "start-singleuser.sh"
-    - display_name: Inferentia (inf2)
-      description: "Inferentia | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          NodePool: inferentia
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        cpu_guarantee: 20
-        mem_guarantee: 100G
-        cpu_limit: 20
-        mem_limit: 100G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "1"
-        cmd: null
-    - display_name: Data Science (GPU + Time-Slicing - G5)
-      default: true
-      description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
-      kubespawner_override:
-        # namespace: data-team-a
-        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
-        node_selector:
-          NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_limits:
-          nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode
-        cpu_limit: 2
-        mem_limit: 4G
-        cpu_guarantee: 2
-        mem_guarantee: 4G
-        cmd: "start-singleuser.sh"
-    # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1,  or nvidia.com/mig-2g.20gb: 1 etc.
-    # Hence, this profile relies on Managed node groups with GPU MIG enabled
-    - display_name: Data Science (GPU + MIG on P4d.24xlarge)
-      description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
-      kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
-        node_selector:
-          provisioner: cluster-autoscaler
-          node.kubernetes.io/instance-type: p4d.24xlarge
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_guarantees:
-          nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb
-        # extra_resource_limits:
-        #   nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        cmd: "start-singleuser.sh"
-    - display_name: Data Science (GPU - P4d.24xlarge)
-      description: "GPU with P4d instances | Karpenter Autoscaler"
-      kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only
-        node_selector:
-          NodePool: gpu-mig
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_limits:
-          nvidia.com/gpu: "8"
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        cmd: "start-singleuser.sh"
-  storage:
-    type: "static"
-    static:
-      pvcName: "efs-persist"
-      subPath: "home/{username}"
-    extraVolumes:
-    - name: jupyterhub-shared
-      persistentVolumeClaim:
-        claimName: efs-persist-shared
-    extraVolumeMounts:
-    - name: jupyterhub-shared
-      mountPath: /home/shared
-      readOnly: false
-  serviceAccountName: ${jupyter_single_user_sa_name}
-  allowPrivilegeEscalation: true
-  extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account
-    securityContext:
-        fsGroup: 100
-  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
-    GRANT_SUDO: "yes"
-    NOTEBOOK_ARGS: "--allow-root"
-    CHOWN_HOME: "yes"
-    CHOWN_HOME_OPTS: "-R"
-    CHOWN_EXTRA: "/home/shared"
-  uid: 0
-  fsGid: 0
-  cmd: null
-
-# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-scheduling:
-  userScheduler:
-    enabled: true
-  podPriority:
-    enabled: true
-  userPlaceholder:
-    enabled: false
-    replicas: 1
-  userPods:
-    nodeAffinity:
-      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
-
-prePuller:
-  hook:
-    enabled: false
-  continuous:
-    # NOTE: if used with Karpenter, also add user-placeholders
-    enabled: false
-
-global:
-  safeToShowValues: false
diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml
deleted file mode 100755
index 24320e22a..000000000
--- a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml
+++ /dev/null
@@ -1,259 +0,0 @@
-hub:
-  db:
-    pvc:
-      storage: 50Gi
-      storageClassName: gp3
-  authenticatePrometheus: false
-
-proxy:
-  https:
-    enabled: false
-    type: offload
-  service:
-    type: ClusterIP
-    # Disabled LoadBalancer type
-#    annotations:
-#      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "ssl_cert_arn"
-#      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
-#      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
-#      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
-#      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-#      service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-#      service.beta.kubernetes.io/aws-load-balancer-type: external
-#      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
-#      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
-singleuser:
-  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
-  profileList:
-    - display_name: Elyra (CPU)
-      description: "Elyra Notebooks | Karpenter Autoscaling"
-      kubespawner_override:
-        image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0
-        node_selector:
-          NodePool: default
-        cpu_guarantee: 2
-        mem_guarantee: 8G
-        cpu_limit: 4
-        mem_limit: 8G
-      cmd: null
-    - display_name: Data Engineering (CPU)
-      description: "PySpark Notebooks | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pyspark350:
-              display_name: "PySpark 3.5.0 + Python 3.11"
-              default: true
-              kubespawner_override:
-                image: jupyter/pyspark-notebook:spark-3.5.0
-            pyspark341:
-              display_name: "PySpark 3.4.1 + Python 3.11"
-              kubespawner_override:
-                image: jupyter/pyspark-notebook:spark-3.4.1
-      kubespawner_override:
-        node_selector:
-          NodePool: default
-        cpu_guarantee: 2
-        mem_guarantee: 8G
-        cpu_limit: 4
-        mem_limit: 8G
-      cmd: null
-    # NOTE:
-    - display_name: Trainium (trn1)
-      description: "Trainium | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch 1.13.1 + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          NodePool: trainium
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "1"
-        cmd: "start-singleuser.sh"
-    - display_name: Inferentia (inf2)
-      description: "Inferentia | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          NodePool: inferentia
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        cpu_guarantee: 20
-        mem_guarantee: 100G
-        cpu_limit: 20
-        mem_limit: 100G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "1"
-        cmd: null
-    - display_name: Data Science (GPU + Time-Slicing - G5)
-      default: true
-      description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
-      kubespawner_override:
-        # namespace: data-team-a
-        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
-        node_selector:
-          NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_limits:
-          nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode
-        cpu_limit: 2
-        mem_limit: 4G
-        cpu_guarantee: 2
-        mem_guarantee: 4G
-        cmd: "start-singleuser.sh"
-    # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1,  or nvidia.com/mig-2g.20gb: 1 etc.
-    # Hence, this profile relies on Managed node groups with GPU MIG enabled
-    - display_name: Data Science (GPU + MIG on P4d.24xlarge)
-      description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
-      kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
-        node_selector:
-          provisioner: cluster-autoscaler
-          node.kubernetes.io/instance-type: p4d.24xlarge
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_guarantees:
-          nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb
-        # extra_resource_limits:
-        #   nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        cmd: "start-singleuser.sh"
-    - display_name: Data Science (GPU - P4d.24xlarge)
-      description: "GPU with P4d instances | Karpenter Autoscaler"
-      kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
-        node_selector:
-          NodePool: gpu-mig
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_limits:
-          nvidia.com/gpu: "8"
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        cmd: "start-singleuser.sh"
-  storage:
-    type: "static"
-    static:
-      pvcName: "efs-persist"
-      subPath: "home/{username}"
-    extraVolumes:
-    - name: jupyterhub-shared
-      persistentVolumeClaim:
-        claimName: efs-persist-shared
-    extraVolumeMounts:
-    - name: jupyterhub-shared
-      mountPath: /home/shared
-      readOnly: false
-  serviceAccountName: ${jupyter_single_user_sa_name}
-  allowPrivilegeEscalation: true
-  extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account
-    securityContext:
-        fsGroup: 100
-  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
-    GRANT_SUDO: "yes"
-    NOTEBOOK_ARGS: "--allow-root"
-    CHOWN_HOME: "yes"
-    CHOWN_HOME_OPTS: "-R"
-    CHOWN_EXTRA: "/home/shared"
-  uid: 0
-  fsGid: 0
-  cmd: null
-
-# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-scheduling:
-  userScheduler:
-    enabled: true
-  podPriority:
-    enabled: true
-  userPlaceholder:
-    enabled: false
-    replicas: 1
-  userPods:
-    nodeAffinity:
-      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
-
-prePuller:
-  hook:
-    enabled: false
-  continuous:
-    # NOTE: if used with Karpenter, also add user-placeholders
-    enabled: false
-
-global:
-  safeToShowValues: false
diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml
deleted file mode 100755
index 869163d22..000000000
--- a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml
+++ /dev/null
@@ -1,273 +0,0 @@
-hub:
-  db:
-    pvc:
-      storage: 50Gi
-      storageClassName: gp3
-  authenticatePrometheus: false
-  config:
-    GenericOAuthenticator:
-      oauth_callback_url: ${jupyterdomain}
-      client_id: ${client_id}
-      client_secret: ${client_secret}
-      authorize_url: ${authorize_url}
-      token_url: ${token_url}
-      userdata_url: ${userdata_url}
-      scope:
-        - openid
-        - profile
-      username_key: "${username_key}"
-      login_service: "oauth"
-      allow_all: true # Allows all oauth authenticated users to use Jupyterhub. For finer grained control, you can use `allowed_users`: https://jupyterhub.readthedocs.io/en/stable/tutorial/getting-started/authenticators-users-basics.html#deciding-who-is-allowed
-    JupyterHub:
-      authenticator_class: generic-oauth
-proxy:
-  https:
-    enabled: true
-    type: offload
-  service:
-    annotations:
-      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn}
-      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
-      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
-      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
-      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-      service.beta.kubernetes.io/aws-load-balancer-scheme: internal
-      service.beta.kubernetes.io/aws-load-balancer-type: external
-      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
-      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
-
-singleuser:
-  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
-  profileList:
-    - display_name: Elyra (CPU)
-      description: "Elyra Notebooks | Karpenter Autoscaling"
-      kubespawner_override:
-        image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0
-        node_selector:
-          NodePool: default
-        cpu_guarantee: 2
-        mem_guarantee: 8G
-        cpu_limit: 4
-        mem_limit: 8G
-      cmd: null
-    - display_name: Data Engineering (CPU)
-      description: "PySpark Notebooks | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pyspark350:
-              display_name: "PySpark 3.5.0 + Python 3.11"
-              default: true
-              kubespawner_override:
-                image: jupyter/pyspark-notebook:spark-3.5.0
-            pyspark341:
-              display_name: "PySpark 3.4.1 + Python 3.11"
-              kubespawner_override:
-                image: jupyter/pyspark-notebook:spark-3.4.1
-      kubespawner_override:
-        node_selector:
-          NodePool: default
-        cpu_guarantee: 2
-        mem_guarantee: 8G
-        cpu_limit: 4
-        mem_limit: 8G
-      cmd: null
-    # NOTE:
-    - display_name: Trainium (trn1)
-      description: "Trainium | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch 1.13.1 + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          NodePool: trainium
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "1"
-        cmd: "start-singleuser.sh"
-    - display_name: Inferentia (inf2)
-      description: "Inferentia | Karpenter AutoScaling"
-      profile_options:
-        image:
-          display_name: "Image"
-          choices:
-            pytorch1131:
-              display_name: "PyTorch + torch-neuronx"
-              default: true
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest
-            tflow2101:
-              display_name: "Tensorflow + tensorflow-neuronx"
-              kubespawner_override:
-                image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest
-      kubespawner_override:
-        node_selector:
-          NodePool: inferentia
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: aws.amazon.com/neuroncore
-            operator: Exists
-            effect: NoSchedule
-          - key: aws.amazon.com/neuron
-            operator: Exists
-            effect: NoSchedule
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        cpu_guarantee: 20
-        mem_guarantee: 100G
-        cpu_limit: 20
-        mem_limit: 100G
-        extra_resource_limits:
-          aws.amazon.com/neuron: "1"
-        cmd: null
-    - display_name: Data Science (GPU + Time-Slicing - G5)
-      default: true
-      description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
-      kubespawner_override:
-        # namespace: data-team-a
-        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
-        node_selector:
-          NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_limits:
-          nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode
-        cpu_limit: 2
-        mem_limit: 4G
-        cpu_guarantee: 2
-        mem_guarantee: 4G
-        cmd: "start-singleuser.sh"
-    # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1,  or nvidia.com/mig-2g.20gb: 1 etc.
-    # Hence, this profile relies on Managed node groups with GPU MIG enabled
-    - display_name: Data Science (GPU + MIG on P4d.24xlarge)
-      description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
-      kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
-        node_selector:
-          provisioner: cluster-autoscaler
-          node.kubernetes.io/instance-type: p4d.24xlarge
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_guarantees:
-          nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb
-        # extra_resource_limits:
-        #   nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        cmd: "start-singleuser.sh"
-    - display_name: Data Science (GPU - P4d.24xlarge)
-      description: "GPU with P4d instances | Karpenter Autoscaler"
-      kubespawner_override:
-        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
-        node_selector:
-          NodePool: gpu-mig
-          hub.jupyter.org/node-purpose: user
-        tolerations:
-          - key: "nvidia.com/gpu"
-            operator: "Exists"
-            effect: "NoSchedule"
-          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-            operator: "Equal"
-            value: "user"
-            effect: "NoSchedule"
-        extra_resource_limits:
-          nvidia.com/gpu: "8"
-        cpu_guarantee: 2
-        mem_guarantee: 10G
-        cpu_limit: 2
-        mem_limit: 10G
-        cmd: "start-singleuser.sh"
-  storage:
-    type: "static"
-    static:
-      pvcName: "efs-persist"
-      subPath: "home/{username}"
-    extraVolumes:
-      - name: jupyterhub-shared
-        persistentVolumeClaim:
-          claimName: efs-persist-shared
-    extraVolumeMounts:
-      - name: jupyterhub-shared
-        mountPath: /home/shared
-        readOnly: false
-  serviceAccountName: ${jupyter_single_user_sa_name}
-  allowPrivilegeEscalation: true
-  extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account
-    securityContext:
-      fsGroup: 100
-  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
-    GRANT_SUDO: "yes"
-    NOTEBOOK_ARGS: "--allow-root"
-    CHOWN_HOME: "yes"
-    CHOWN_HOME_OPTS: "-R"
-    CHOWN_EXTRA: "/home/shared"
-  uid: 0
-  fsGid: 0
-  cmd: null
-
-# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html
-scheduling:
-  userScheduler:
-    enabled: true
-  podPriority:
-    enabled: true
-  userPlaceholder:
-    enabled: false
-    replicas: 1
-  userPods:
-    nodeAffinity:
-      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
-
-prePuller:
-  hook:
-    enabled: false
-  continuous:
-    # NOTE: if used with Karpenter, also add user-placeholders
-    enabled: false
-
-global:
-  safeToShowValues: false
diff --git a/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml b/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml
deleted file mode 100644
index 1b13f6dec..000000000
--- a/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml
+++ /dev/null
@@ -1,80 +0,0 @@
-prometheus:
-  prometheusSpec:
-    resources:
-      requests:
-        memory: 4Gi
-        cpu: 2
-    retention: 5h
-    scrapeInterval: 30s
-    evaluationInterval: 30s
-    scrapeTimeout: 10s
-    storageSpec:
-      volumeClaimTemplate:
-        metadata:
-          name: data
-        spec:
-          storageClassName: gp3
-          accessModes:
-            - ReadWriteOnce
-          resources:
-            requests:
-              storage: 100Gi
-    # Scrape Cost metrics for Kubecost and JupyterHub add-ons
-    additionalScrapeConfigs:
-      - job_name: kubecost
-        honor_labels: true
-        scrape_interval: 1m
-        scrape_timeout: 10s
-        metrics_path: /metrics
-        scheme: http
-        dns_sd_configs:
-          - names:
-              - kubecost-cost-analyzer.kubecost.svc
-            type: 'A'
-            port: 9003
-      - job_name: jupyterhub
-        honor_labels: true
-        scrape_interval: 1m
-        scrape_timeout: 10s
-        metrics_path: /hub/metrics
-        scheme: http
-        dns_sd_configs:
-          - names:
-              - hub.jupyterhub.svc
-            type: 'A'
-            port: 8081
-      - job_name: gpu-metrics
-        scrape_interval: 1m
-        metrics_path: /metrics
-        scheme: http
-        kubernetes_sd_configs:
-        - role: endpoints
-          namespaces:
-            names:
-            - gpu-operator
-        relabel_configs:
-        - source_labels: [__meta_kubernetes_pod_node_name]
-          action: replace
-          target_label: kubernetes_node
-
-alertmanager:
-  enabled: false
-
-grafana:
-  enabled: true
-  defaultDashboardsEnabled: true
-  resources:
-    requests:
-      memory: 4Gi
-      cpu: 2
-  sidecar:
-    datasources:
-      alertmanager:
-        enabled: false
-
-kube-state-metrics:
-  metricLabelsAllowlist:
-    # to select jupyterhub component pods and get the hub usernames
-    - pods=[app,component,hub.jupyter.org/username]
-    # allowing all labels is probably fine for nodes, since they don't churn much, unlike pods
-    - nodes=[*]
diff --git a/ai-ml/jupyterhub/helm/kubecost/values.yaml b/ai-ml/jupyterhub/helm/kubecost/values.yaml
deleted file mode 100644
index 0f9441497..000000000
--- a/ai-ml/jupyterhub/helm/kubecost/values.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-
-# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090
-
-global:
-  # pricingCsv:
-  #   enabled: false
-  #   location:
-  #     provider: "AWS"
-  #     region: "us-east-1"
-  #     URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI
-  #     csvAccessCredentials: pricing-schema-access-secret
-
-  # This Prometheus setup is reusing the existing Prometheus deployment
-  # Check for more docs under https://guide.kubecost.com/hc/en-us/articles/4407595941015
-  prometheus:
-    fqdn: http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc:9090
-    enabled: false
-
-# If you have node-exporter and/or KSM running on your cluster, follow this step to disable the Kubecost included versions.
-prometheus:
-  nodeExporter:
-    enabled: false
-  serviceAccounts:
-    nodeExporter:
-      create: false
-  kubeStateMetrics:
-    enabled: false
-
-#imageVersion: prod-1.96.0 # commented to use the latest
-
-kubecostFrontend:
-  image: public.ecr.aws/kubecost/frontend
-  resources:
-    requests:
-      cpu: "200m"
-      memory: "512Mi"
-
-kubecostMetrics:
-  emitPodAnnotations: true
-  emitNamespaceAnnotations: true
-
-kubecostModel:
-  image: public.ecr.aws/kubecost/cost-model
-  resources:
-    requests:
-      cpu: "500m"
-      memory: "512Mi"
-
-# Set this to false if you're bringing your own service account.
-#serviceAccount:
-#  create: false
-#  name: kubecost-cost-analyzer
-#  annotations:
-#    eks.amazonaws.com/role-arn: <iam-role-arn>
-
-# Define persistence volume for cost-analyzer
-persistentVolume:
-  size: 32Gi
-  dbSize: 32.0Gi
-  enabled: true # Note that setting this to false means configurations will be wiped out on pod restart.
-  storageClass: gp3
-  # existingClaim: kubecost-cost-analyzer # a claim in the same namespace as kubecost
-
-grafana:
-  enabled: false
diff --git a/ai-ml/jupyterhub/helm/metrics-server/values.yaml b/ai-ml/jupyterhub/helm/metrics-server/values.yaml
deleted file mode 100644
index 026d97a6a..000000000
--- a/ai-ml/jupyterhub/helm/metrics-server/values.yaml
+++ /dev/null
@@ -1,52 +0,0 @@
-# HA config for metrics-server
-image:
-  repository: registry.k8s.io/metrics-server/metrics-server
-  pullPolicy: IfNotPresent
-
-serviceAccount:
-  create: true
-  name: metrics-server
-
-rbac:
-  create: true
-  pspEnabled: false
-
-apiService:
-  create: true
-
-podLabels:
-  k8s-app: metrics-server
-
-# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true
-replicas: 2
-
-updateStrategy:
-   type: RollingUpdate
-   rollingUpdate:
-     maxSurge: 0
-     maxUnavailable: 1
-
-podDisruptionBudget:
-  enabled: true
-  minAvailable: 1
-
-defaultArgs:
-  - --cert-dir=/tmp
-  - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
-  - --kubelet-use-node-status-port
-  - --metric-resolution=15s
-
-resources:
-  requests:
-    cpu: 200m
-    memory: 512Mi
-
-affinity:
-  podAntiAffinity:
-    requiredDuringSchedulingIgnoredDuringExecution:
-      - labelSelector:
-          matchLabels:
-            k8s-app: metrics-server
-        namespaces:
-          - kube-system
-        topologyKey: kubernetes.io/hostname
diff --git a/ai-ml/jupyterhub/install.sh b/ai-ml/jupyterhub/install.sh
index b87db5117..77838a56e 100755
--- a/ai-ml/jupyterhub/install.sh
+++ b/ai-ml/jupyterhub/install.sh
@@ -1,33 +1,6 @@
 #!/bin/bash
+# Copy the shared base infrastructure into ./terraform; abort if the copy fails
+cp -r ../infrastructure/terraform/* ./terraform || exit 1
 
-echo "Initializing ..."
-terraform init || echo "\"terraform init\" failed"
-
-# List of Terraform modules to apply in sequence
-targets=(
-  "module.vpc"
-  "module.eks"
-)
-
-# Apply modules in sequence
-for target in "${targets[@]}"
-do
-  echo "Applying module $target..."
-  apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty)
-  if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-    echo "SUCCESS: Terraform apply of $target completed successfully"
-  else
-    echo "FAILED: Terraform apply of $target failed"
-    exit 1
-  fi
-done
-
-# Final apply to catch any remaining resources
-echo "Applying remaining resources..."
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty)
-if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-  echo "SUCCESS: Terraform apply of all modules completed successfully"
-else
-  echo "FAILED: Terraform apply of all modules failed"
-  exit 1
-fi
+cd terraform || exit 1 # if cd fails, ./install.sh below would be this wrapper itself
+source ./install.sh
diff --git a/ai-ml/jupyterhub/jupyterhub.tf b/ai-ml/jupyterhub/jupyterhub.tf
deleted file mode 100644
index 30809aeef..000000000
--- a/ai-ml/jupyterhub/jupyterhub.tf
+++ /dev/null
@@ -1,143 +0,0 @@
-#-----------------------------------------------------------------------------------------
-# JupyterHub Sinlgle User IRSA, maybe that block could be incorporated in add-on registry
-#-----------------------------------------------------------------------------------------
-resource "kubernetes_namespace" "jupyterhub" {
-  metadata {
-    name = "jupyterhub"
-  }
-}
-
-module "jupyterhub_single_user_irsa" {
-  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
-
-  role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa"
-
-  role_policy_arns = {
-    policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based in what you need to give access to your notebook instances.
-  }
-
-  oidc_providers = {
-    main = {
-      provider_arn               = module.eks.oidc_provider_arn
-      namespace_service_accounts = ["${kubernetes_namespace.jupyterhub.metadata[0].name}:jupyterhub-single-user"]
-    }
-  }
-}
-
-resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" {
-  metadata {
-    name        = "${module.eks.cluster_name}-jupyterhub-single-user"
-    namespace   = kubernetes_namespace.jupyterhub.metadata[0].name
-    annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa.iam_role_arn }
-  }
-
-  automount_service_account_token = true
-}
-
-resource "kubernetes_secret_v1" "jupyterhub_single_user" {
-  metadata {
-    name      = "${module.eks.cluster_name}-jupyterhub-single-user-secret"
-    namespace = kubernetes_namespace.jupyterhub.metadata[0].name
-    annotations = {
-      "kubernetes.io/service-account.name"      = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
-      "kubernetes.io/service-account.namespace" = kubernetes_namespace.jupyterhub.metadata[0].name
-    }
-  }
-
-  type = "kubernetes.io/service-account-token"
-}
-
-#---------------------------------------------------------------
-# EFS Filesystem for private volumes per user
-# This will be replaced with Dynamic EFS provision using EFS CSI Driver
-#---------------------------------------------------------------
-resource "aws_efs_file_system" "efs" {
-  encrypted = true
-
-  tags = local.tags
-}
-
-#---------------------------------------------------------------
-# module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x]
-# We use index 2 and 3 to select the subnet in AZ1 with the 100.x CIDR:
-# Create EFS mount targets for the 3rd  subnet
-resource "aws_efs_mount_target" "efs_mt_1" {
-  file_system_id  = aws_efs_file_system.efs.id
-  subnet_id       = module.vpc.private_subnets[2]
-  security_groups = [aws_security_group.efs.id]
-}
-
-# Create EFS mount target for the 4th subnet
-resource "aws_efs_mount_target" "efs_mt_2" {
-  file_system_id  = aws_efs_file_system.efs.id
-  subnet_id       = module.vpc.private_subnets[3]
-  security_groups = [aws_security_group.efs.id]
-}
-
-resource "aws_security_group" "efs" {
-  name        = "${local.name}-efs"
-  description = "Allow inbound NFS traffic from private subnets of the VPC"
-  vpc_id      = module.vpc.vpc_id
-
-  ingress {
-    description = "Allow NFS 2049/tcp"
-    cidr_blocks = module.vpc.vpc_secondary_cidr_blocks
-    from_port   = 2049
-    to_port     = 2049
-    protocol    = "tcp"
-  }
-
-  tags = local.tags
-}
-
-#---------------------------------------
-# EFS Configuration
-#---------------------------------------
-module "efs_config" {
-  source  = "aws-ia/eks-blueprints-addons/aws"
-  version = "~> 1.2"
-
-  cluster_name      = module.eks.cluster_name
-  cluster_endpoint  = module.eks.cluster_endpoint
-  cluster_version   = module.eks.cluster_version
-  oidc_provider_arn = module.eks.oidc_provider_arn
-
-  helm_releases = {
-    efs = {
-      name             = "efs"
-      description      = "A Helm chart for storage configurations"
-      namespace        = "jupyterhub"
-      create_namespace = false
-      chart            = "${path.module}/helm/efs"
-      chart_version    = "0.0.1"
-      values = [
-        <<-EOT
-          pv:
-            name: efs-persist
-            dnsName: ${aws_efs_file_system.efs.dns_name}
-          pvc:
-            name: efs-persist
-        EOT
-      ]
-    }
-    efs-shared = {
-      name             = "efs-shared"
-      description      = "A Helm chart for shared storage configurations"
-      namespace        = "jupyterhub"
-      create_namespace = false
-      chart            = "${path.module}/helm/efs"
-      chart_version    = "0.0.1"
-      values = [
-        <<-EOT
-          pv:
-            name: efs-persist-shared
-            dnsName: ${aws_efs_file_system.efs.dns_name}
-          pvc:
-            name: efs-persist-shared
-        EOT
-      ]
-    }
-  }
-
-  depends_on = [kubernetes_namespace.jupyterhub]
-}
diff --git a/ai-ml/jupyterhub/main.tf b/ai-ml/jupyterhub/main.tf
deleted file mode 100755
index dcccf3de4..000000000
--- a/ai-ml/jupyterhub/main.tf
+++ /dev/null
@@ -1,157 +0,0 @@
-provider "aws" {
-  region = local.region
-}
-
-# Removed exec plugin as this doesn't work with Terraform Cloud and TOFU controller plugin with backstage
-provider "kubernetes" {
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  token                  = data.aws_eks_cluster_auth.this.token
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = module.eks.cluster_endpoint
-    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-    token                  = data.aws_eks_cluster_auth.this.token
-  }
-}
-
-# ECR always authenticates with `us-east-1` region
-# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
-provider "aws" {
-  alias  = "ecr"
-  region = "us-east-1"
-}
-
-data "aws_eks_cluster_auth" "this" {
-  name = module.eks.cluster_name
-}
-
-data "aws_availability_zones" "available" {}
-
-locals {
-  name   = var.name
-  region = var.region
-  azs    = slice(data.aws_availability_zones.available.names, 0, 2)
-  tags = {
-    Blueprint  = local.name
-    GithubRepo = "github.com/awslabs/data-on-eks"
-  }
-}
-
-#---------------------------------------------------------------
-# EKS Cluster
-#---------------------------------------------------------------
-module "eks" {
-  source  = "terraform-aws-modules/eks/aws"
-  version = "~> 19.15"
-
-  cluster_name    = local.name
-  cluster_version = var.eks_cluster_version
-
-  #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing.
-  cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint.
-
-  vpc_id = module.vpc.vpc_id
-  # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created
-  subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
-
-
-  manage_aws_auth_configmap = true
-  aws_auth_roles = [
-    # We need to add in the Karpenter node IAM role for nodes launched by Karpenter
-    {
-      rolearn  = module.eks_blueprints_addons.karpenter.node_iam_role_arn
-      username = "system:node:{{EC2PrivateDNSName}}"
-      groups = [
-        "system:bootstrappers",
-        "system:nodes",
-      ]
-    }
-  ]
-  #---------------------------------------
-  # Note: This can further restricted to specific required for each Add-on and your application
-  #---------------------------------------
-  # Extend cluster security group rules
-  cluster_security_group_additional_rules = {
-    ingress_nodes_ephemeral_ports_tcp = {
-      description                = "Nodes on ephemeral ports"
-      protocol                   = "tcp"
-      from_port                  = 1025
-      to_port                    = 65535
-      type                       = "ingress"
-      source_node_security_group = true
-    }
-  }
-
-  # Extend node-to-node security group rules
-  node_security_group_additional_rules = {
-    ingress_self_all = {
-      description = "Node to node all ports/protocols"
-      protocol    = "-1"
-      from_port   = 0
-      to_port     = 0
-      type        = "ingress"
-      self        = true
-    }
-
-    # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane.
-    # This can be restricted further to specific port based on the requirement for each Add-on e.g., coreDNS 53, metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
-    # Update this according to your security requirements if needed
-    ingress_cluster_to_node_all_traffic = {
-      description                   = "Cluster API to Nodegroup all traffic"
-      protocol                      = "-1"
-      from_port                     = 0
-      to_port                       = 0
-      type                          = "ingress"
-      source_cluster_security_group = true
-    }
-  }
-
-  eks_managed_node_group_defaults = {
-    iam_role_additional_policies = {
-      # Not required, but used in the example to access the nodes to inspect mounted volumes
-      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
-    }
-  }
-
-  eks_managed_node_groups = {
-    #  It's recommended to have a Managed Node group for hosting critical add-ons
-    #  It's recommended to use Karpenter to place your workloads instead of using Managed Node groups
-    #  You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes.
-    core_node_group = {
-      name        = "jupyterhub-node-group"
-      description = "EKS Core node group for hosting critical add-ons"
-      # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned
-      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
-
-      min_size     = 4
-      max_size     = 8
-      desired_size = 4
-
-      instance_types = ["m5.xlarge"]
-
-      ebs_optimized = true
-      block_device_mappings = {
-        xvda = {
-          device_name = "/dev/xvda"
-          ebs = {
-            volume_size = 100
-            volume_type = "gp3"
-          }
-        }
-      }
-
-      labels = {
-        WorkerType    = "ON_DEMAND"
-        NodeGroupType = "core"
-      }
-
-      tags = merge(local.tags, {
-        Name                     = "core-node-grp",
-        "karpenter.sh/discovery" = local.name
-      })
-    }
-  }
-}
diff --git a/ai-ml/jupyterhub/outputs.tf b/ai-ml/jupyterhub/outputs.tf
deleted file mode 100755
index f6444daab..000000000
--- a/ai-ml/jupyterhub/outputs.tf
+++ /dev/null
@@ -1,4 +0,0 @@
-output "configure_kubectl" {
-  description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig"
-  value       = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}"
-}
diff --git a/ai-ml/jupyterhub/variables.tf b/ai-ml/jupyterhub/variables.tf
deleted file mode 100755
index b2a2a0600..000000000
--- a/ai-ml/jupyterhub/variables.tf
+++ /dev/null
@@ -1,91 +0,0 @@
-variable "name" {
-  description = "Name of the VPC and EKS Cluster"
-  default     = "jupyterhub-on-eks"
-  type        = string
-}
-
-variable "region" {
-  description = "Region"
-  type        = string
-  default     = "us-west-2"
-}
-
-variable "eks_cluster_version" {
-  description = "EKS Cluster version"
-  default     = "1.30"
-  type        = string
-}
-
-# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs
-variable "vpc_cidr" {
-  description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range"
-  default     = "10.1.0.0/21"
-  type        = string
-}
-
-# RFC6598 range 100.64.0.0/10
-# Note you can only /16 range to VPC. You can add multiples of /16 if required
-variable "secondary_cidr_blocks" {
-  description = "Secondary CIDR blocks to be attached to VPC"
-  default     = ["100.64.0.0/16"]
-  type        = list(string)
-}
-
-# NOTE: You need to use private domain or public domain name with ACM certificate
-# Data-on-EKS website docs will show you how to create free public domain name with ACM certificate for testing purpose only
-# Example of public domain name(<subdomain-name>.<domain-name>.com): eks.jupyter-doeks.dynamic-dns.com
-variable "jupyter_hub_auth_mechanism" {
-  type        = string
-  description = "Allowed values: cognito, dummy, oauth"
-  default     = "dummy"
-}
-
-#  Domain name is public so make sure you use a unique while deploying, Only needed if auth mechanism is set to cognito
-variable "cognito_custom_domain" {
-  description = "Cognito domain prefix for Hosted UI authentication endpoints"
-  type        = string
-  default     = "eks"
-}
-
-# Only needed if auth mechanism is set to cognito
-variable "acm_certificate_domain" {
-  type        = string
-  description = "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com"
-  default     = ""
-}
-
-# Only needed if auth mechanism is set to cognito or oauth. This is the domain for jupyterhub
-variable "jupyterhub_domain" {
-  type        = string
-  description = "Enter domain name for jupyterhub to be hosted,  e.g. eks.example.com. Only needed if auth mechanism is set to cognito or oauth"
-  default     = ""
-}
-
-# Only needed if auth mechanism is set to oauth. This is the root path for the oidc endpoints
-variable "oauth_domain" {
-  type        = string
-  description = "Enter oauth domain and endpoint, e.g. https://keycloak.example.com/realms/master/protocol/openid-connect. Only needed if auth mechanism is set to oauth"
-  default     = ""
-}
-
-# Only needed if auth mechanism is set to oauth. This is the id of the client
-variable "oauth_jupyter_client_id" {
-  type        = string
-  description = "Enter oauth client id for jupyterhub, e.g. jupyterhub. Only needed if auth mechanism is set to oauth"
-  default     = ""
-}
-
-# Only needed if auth mechanism is set to oauth. This is the secret for the client
-variable "oauth_jupyter_client_secret" {
-  type        = string
-  description = "Enter oauth client secret. Only needed if auth mechanism is set to oauth"
-  default     = ""
-  sensitive   = true
-}
-
-# Only needed if auth mechanism is set to oauth. This is the key to use for looking up the username.
-variable "oauth_username_key" {
-  type        = string
-  description = "oauth field for the username. e.g. 'preferred_username' Only needed if auth mechanism is set to oauth"
-  default     = ""
-}
diff --git a/ai-ml/jupyterhub/versions.tf b/ai-ml/jupyterhub/versions.tf
deleted file mode 100755
index 9b6678a5f..000000000
--- a/ai-ml/jupyterhub/versions.tf
+++ /dev/null
@@ -1,27 +0,0 @@
-terraform {
-  required_version = ">= 1.0.0"
-
-  required_providers {
-    aws = {
-      source  = "hashicorp/aws"
-      version = ">= 3.72"
-    }
-    kubernetes = {
-      source  = "hashicorp/kubernetes"
-      version = ">= 2.10"
-    }
-    helm = {
-      source  = "hashicorp/helm"
-      version = ">= 2.12.1"
-    }
-    random = {
-      source  = "hashicorp/random"
-      version = "3.1.0" # Replace with the appropriate version of the random provider
-    }
-
-    archive = {
-      source  = "hashicorp/archive"
-      version = "2.4.0"
-    }
-  }
-}
diff --git a/ai-ml/jupyterhub/vpc.tf b/ai-ml/jupyterhub/vpc.tf
deleted file mode 100755
index 59c3da89c..000000000
--- a/ai-ml/jupyterhub/vpc.tf
+++ /dev/null
@@ -1,53 +0,0 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
-#---------------------------------------------------------------
-# VPC
-#---------------------------------------------------------------
-# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts.
-# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements
-
-module "vpc" {
-  source  = "terraform-aws-modules/vpc/aws"
-  version = "~> 5.0"
-
-  name = local.name
-  cidr = var.vpc_cidr
-  azs  = local.azs
-
-  # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods
-  secondary_cidr_blocks = var.secondary_cidr_blocks
-
-  # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc.
-  private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets)
-
-  # ------------------------------
-  # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments
-  # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW
-  public_subnets     = local.public_subnets
-  enable_nat_gateway = true
-  single_nat_gateway = true
-  #-------------------------------
-
-  public_subnet_tags = {
-    "kubernetes.io/role/elb" = 1
-  }
-
-  private_subnet_tags = {
-    "kubernetes.io/role/internal-elb" = 1
-    # Tags subnets for Karpenter auto-discovery
-    "karpenter.sh/discovery" = local.name
-  }
-
-  tags = local.tags
-}
diff --git a/ai-ml/mlflow/cleanup.sh b/ai-ml/mlflow/cleanup.sh
deleted file mode 100755
index 6f96c6ef5..000000000
--- a/ai-ml/mlflow/cleanup.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-set -o errexit
-set -o pipefail
-
-targets=(
-  "module.eks_data_addons"
-  "module.eks_blueprints_addons"
-)
-
-#-------------------------------------------
-# Helpful to delete the stuck in "Terminating" namespaces
-# Rerun the cleanup.sh script to detect and delete the stuck resources
-#-------------------------------------------
-terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name')
-
-# If there are no terminating namespaces, exit the script
-if [[ -z $terminating_namespaces ]]; then
-    echo "No terminating namespaces found"
-fi
-
-for ns in $terminating_namespaces; do
-    echo "Terminating namespace: $ns"
-    kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f -
-done
-
-for target in "${targets[@]}"
-do
-  terraform destroy -target="$target" -auto-approve
-  destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1)
-  if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-    echo "SUCCESS: Terraform destroy of $target completed successfully"
-  else
-    echo "FAILED: Terraform destroy of $target failed"
-    exit 1
-  fi
-done
-
-terraform destroy -auto-approve
-destroy_output=$(terraform destroy -auto-approve 2>&1)
-if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then
-  echo "SUCCESS: Terraform destroy of all targets completed successfully"
-else
-  echo "FAILED: Terraform destroy of all targets failed"
-  exit 1
-fi
diff --git a/ai-ml/mlflow/install.sh b/ai-ml/mlflow/install.sh
index 2832252fb..77838a56e 100755
--- a/ai-ml/mlflow/install.sh
+++ b/ai-ml/mlflow/install.sh
@@ -1,37 +1,6 @@
 #!/bin/bash
+# Copy the base infrastructure into the blueprint's terraform folder (created if it does not exist yet)
+mkdir -p ./terraform && cp -r ../infrastructure/terraform/* ./terraform
 
-# List of Terraform modules to apply in sequence
-targets=(
-  "module.vpc"
-  "module.eks"
-  "module.ebs_csi_driver_irsa"
-  "module.eks_blueprints_addons"
-  "module.db"
-)
-
-# Initialize Terraform
-echo "Initializing ..."
-terraform init --upgrade || echo "\"terraform init\" failed"
-
-# Apply modules in sequence
-for target in "${targets[@]}"
-do
-  echo "Applying module $target..."
-  apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty)
-  if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-    echo "SUCCESS: Terraform apply of $target completed successfully"
-  else
-    echo "FAILED: Terraform apply of $target failed"
-    exit 1
-  fi
-done
-
-# Final apply to catch any remaining resources
-echo "Applying remaining resources..."
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty)
-if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then
-  echo "SUCCESS: Terraform apply of all modules completed successfully"
-else
-  echo "FAILED: Terraform apply of all modules failed"
-  exit 1
-fi
+cd terraform || exit 1 # abort rather than sourcing install.sh from the wrong directory
+source ./install.sh
diff --git a/ai-ml/mlflow/variables.tf b/ai-ml/mlflow/variables.tf
deleted file mode 100644
index 1600e75b5..000000000
--- a/ai-ml/mlflow/variables.tf
+++ /dev/null
@@ -1,44 +0,0 @@
-variable "name" {
-  description = "Name of the VPC and EKS Cluster"
-  default     = "mlflow-on-eks"
-  type        = string
-}
-
-variable "region" {
-  description = "Region"
-  type        = string
-  default     = "us-west-2"
-}
-
-variable "eks_cluster_version" {
-  description = "EKS Cluster version"
-  default     = "1.27"
-  type        = string
-}
-
-# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs
-variable "vpc_cidr" {
-  description = "VPC CIDR"
-  default     = "10.1.0.0/21"
-  type        = string
-}
-
-# RFC6598 range 100.64.0.0/10
-# Note you can only /16 range to VPC. You can add multiples of /16 if required
-variable "secondary_cidr_blocks" {
-  description = "Secondary CIDR blocks to be attached to VPC"
-  default     = ["100.64.0.0/16"]
-  type        = list(string)
-}
-
-variable "enable_amazon_prometheus" {
-  description = "Enable AWS Managed Prometheus service"
-  type        = bool
-  default     = true
-}
-
-variable "enable_mlflow_tracking" {
-  description = "Enable MLflow Tracking"
-  type        = bool
-  default     = true
-}

From 43f2659c5497884d60f43ab06d9f4c2edb9de1f4 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:13:51 -0800
Subject: [PATCH 11/16] add missing blueprint tfvars

---
 ai-ml/bionemo/terraform/blueprint.tfvars          | 1 +
 ai-ml/emr-spark-rapids/terraform/blueprint.tfvars | 3 +++
 ai-ml/jark-stack/terraform/blueprint.tfvars       | 8 ++++++++
 ai-ml/jupyterhub/terraform/blueprint.tfvars       | 6 ++++++
 ai-ml/mlflow/terraform/blueprint.tfvars           | 4 ++++
 5 files changed, 22 insertions(+)
 create mode 100644 ai-ml/bionemo/terraform/blueprint.tfvars
 create mode 100644 ai-ml/emr-spark-rapids/terraform/blueprint.tfvars
 create mode 100644 ai-ml/jark-stack/terraform/blueprint.tfvars
 create mode 100644 ai-ml/jupyterhub/terraform/blueprint.tfvars
 create mode 100644 ai-ml/mlflow/terraform/blueprint.tfvars

diff --git a/ai-ml/bionemo/terraform/blueprint.tfvars b/ai-ml/bionemo/terraform/blueprint.tfvars
new file mode 100644
index 000000000..a1bde59d1
--- /dev/null
+++ b/ai-ml/bionemo/terraform/blueprint.tfvars
@@ -0,0 +1 @@
+name = "bionemo-on-eks"
diff --git a/ai-ml/emr-spark-rapids/terraform/blueprint.tfvars b/ai-ml/emr-spark-rapids/terraform/blueprint.tfvars
new file mode 100644
index 000000000..df273cd86
--- /dev/null
+++ b/ai-ml/emr-spark-rapids/terraform/blueprint.tfvars
@@ -0,0 +1,3 @@
+name = "emr-spark-rapids"
+enable_amazon_prometheus = true
+enable_amazon_emr = true
diff --git a/ai-ml/jark-stack/terraform/blueprint.tfvars b/ai-ml/jark-stack/terraform/blueprint.tfvars
new file mode 100644
index 000000000..125219743
--- /dev/null
+++ b/ai-ml/jark-stack/terraform/blueprint.tfvars
@@ -0,0 +1,8 @@
+name = "jark-stack"
+enable_aws_efs_csi_driver = true
+enable_aws_cloudwatch_metrics = true
+enable_jupyterhub = true
+enable_volcano = true
+enable_kuberay_operator = true
+enable_argo_workflows = true
+enable_argo_events = true
diff --git a/ai-ml/jupyterhub/terraform/blueprint.tfvars b/ai-ml/jupyterhub/terraform/blueprint.tfvars
new file mode 100644
index 000000000..4b9562b12
--- /dev/null
+++ b/ai-ml/jupyterhub/terraform/blueprint.tfvars
@@ -0,0 +1,6 @@
+name = "jark-stack"
+enable_aws_efs_csi_driver = true
+enable_aws_cloudwatch_metrics = true
+enable_jupyterhub = true
+enable_volcano = true
+enable_kuberay_operator = true
diff --git a/ai-ml/mlflow/terraform/blueprint.tfvars b/ai-ml/mlflow/terraform/blueprint.tfvars
new file mode 100644
index 000000000..41466c8a3
--- /dev/null
+++ b/ai-ml/mlflow/terraform/blueprint.tfvars
@@ -0,0 +1,4 @@
+name = "mlflow-on-eks"
+enable_aws_cloudwatch_metrics = true
+enable_amazon_prometheus = true
+enable_mlflow_tracking = true

From e4ed1cc1c03f01ffcd0b05f37a513422e8838a4f Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Wed, 12 Feb 2025 12:57:07 -0800
Subject: [PATCH 12/16] update fsx csi driver variable

---
 ai-ml/infrastructure/terraform/addons.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf
index 57e63d062..a15597aaf 100644
--- a/ai-ml/infrastructure/terraform/addons.tf
+++ b/ai-ml/infrastructure/terraform/addons.tf
@@ -186,7 +186,7 @@ module "eks_blueprints_addons" {
   #---------------------------------------
   # Enable FSx for Lustre CSI Driver
   #---------------------------------------
-  enable_aws_fsx_csi_driver = var.enable_aws_efa_k8s_device_plugin
+  enable_aws_fsx_csi_driver = var.enable_aws_fsx_csi_driver
 
   tags = local.tags
 

From 94b8576557f4718ea908cd5f85a5b4b02bd3b59d Mon Sep 17 00:00:00 2001
From: Divya Gupta <vishdivg@amazon.com>
Date: Mon, 17 Feb 2025 13:23:26 -0500
Subject: [PATCH 13/16] missing bionemo tfvar

---
 ai-ml/bionemo/terraform/blueprint.tfvars | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ai-ml/bionemo/terraform/blueprint.tfvars b/ai-ml/bionemo/terraform/blueprint.tfvars
index a1bde59d1..6d2b662ea 100644
--- a/ai-ml/bionemo/terraform/blueprint.tfvars
+++ b/ai-ml/bionemo/terraform/blueprint.tfvars
@@ -1 +1,3 @@
 name = "bionemo-on-eks"
+enable_aws_fsx_csi_driver= "true"
+deploy_fsx_volume= "true"

From 91eb0e1e499b97a7687584fbb3e12ff3e9560895 Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Tue, 18 Feb 2025 07:42:38 -0800
Subject: [PATCH 14/16] add missing redis ha, torchx etcd vars

---
 ai-ml/infrastructure/terraform/addons.tf      | 17 ++++++
 .../terraform/elastic-cache-redis.tf          | 57 +++++++++++++++++++
 ai-ml/infrastructure/terraform/variables.tf   | 11 ++++
 3 files changed, 85 insertions(+)
 create mode 100644 ai-ml/infrastructure/terraform/elastic-cache-redis.tf

diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf
index a15597aaf..89f8badf6 100644
--- a/ai-ml/infrastructure/terraform/addons.tf
+++ b/ai-ml/infrastructure/terraform/addons.tf
@@ -603,6 +603,23 @@ module "eks_data_addons" {
   }
 }
 
+#---------------------------------------------------------------
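+# ETCD for TorchX (NOTE: the manifest is downloaded from the torchx main branch on every plan, even when enable_torchx_etcd is false)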
+#---------------------------------------------------------------
+data "http" "torchx_etcd_yaml" {
+  url = "https://raw.githubusercontent.com/pytorch/torchx/main/resources/etcd.yaml"
+}
+
+data "kubectl_file_documents" "torchx_etcd_yaml" {
+  content = data.http.torchx_etcd_yaml.response_body
+}
+
+resource "kubectl_manifest" "torchx_etcd" {
+  for_each   = var.enable_torchx_etcd ? data.kubectl_file_documents.torchx_etcd_yaml.manifests : {}
+  yaml_body  = each.value
+  depends_on = [module.eks.eks_cluster_id]
+}
+
 #---------------------------------------------------------------
 # Grafana Admin credentials resources
 # Login to AWS secrets manager with the same role as Terraform to extract the Grafana admin password with the secret name as "grafana"
diff --git a/ai-ml/infrastructure/terraform/elastic-cache-redis.tf b/ai-ml/infrastructure/terraform/elastic-cache-redis.tf
new file mode 100644
index 000000000..df3c3c6a8
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/elastic-cache-redis.tf
@@ -0,0 +1,57 @@
+#-------------------------------------------
+# ElastiCache (Redis) cluster used for Ray head node high availability
+#-------------------------------------------
+module "elasticache" {
+  create  = var.enable_rayserve_ha_elastic_cache_redis
+  source  = "terraform-aws-modules/elasticache/aws"
+  version = "1.2.0"
+
+  cluster_id               = local.name
+  create_cluster           = true
+  create_replication_group = false
+
+  engine_version = "7.1"
+  node_type      = "cache.t4g.small"
+
+  apply_immediately = true
+
+  # Security Group
+  vpc_id = module.vpc.vpc_id
+  security_group_rules = {
+    ingress_vpc = {
+      # Default type is `ingress`
+      # Default port is based on the default engine port
+      description = "VPC traffic"
+      cidr_ipv4   = module.vpc.vpc_cidr_block
+    }
+
+    ingress_from_eks_worker_node_tcp = {
+      description                  = "Ingress rule to allow TCP on port 6379 from EKS Ray Head Node"
+      protocol                     = "tcp"
+      from_port                    = 6379
+      referenced_security_group_id = module.eks.node_security_group_id
+      to_port                      = 6379
+      type                         = "ingress"
+    }
+  }
+
+  # Subnet Group
+  subnet_group_name        = local.name
+  subnet_group_description = "${title(local.name)} subnet group"
+  subnet_ids               = module.vpc.private_subnets
+
+  # Parameter Group
+  create_parameter_group      = true
+  parameter_group_name        = local.name
+  parameter_group_family      = "redis7"
+  parameter_group_description = "${title(local.name)} parameter group"
+  parameters = [
+    {
+      name  = "latency-tracking"
+      value = "yes"
+    }
+  ]
+
+  tags = local.tags
+
+}
diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf
index d768410c7..79b891afe 100644
--- a/ai-ml/infrastructure/terraform/variables.tf
+++ b/ai-ml/infrastructure/terraform/variables.tf
@@ -119,6 +119,17 @@ variable "huggingface_token" {
   default     = "DUMMY_TOKEN_REPLACE_ME"
   sensitive   = true
 }
+variable "enable_rayserve_ha_elastic_cache_redis" {
+  description = "Flag to enable Ray Head High Availability with Amazon ElastiCache for Redis"
+  type        = bool
+  default     = false
+}
+
+variable "enable_torchx_etcd" {
+  description = "Flag to enable etcd deployment for torchx"
+  type        = bool
+  default     = false
+}
 
 # Jupyterhub Specific Variables
 

From 2a352da35dc2c93d75d25c3bcfae5392579c89c9 Mon Sep 17 00:00:00 2001
From: Divya Gupta <vishdivg@amazon.com>
Date: Tue, 18 Feb 2025 10:47:18 -0500
Subject: [PATCH 15/16] style fix

---
 ai-ml/bionemo/terraform/blueprint.tfvars | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ai-ml/bionemo/terraform/blueprint.tfvars b/ai-ml/bionemo/terraform/blueprint.tfvars
index 6d2b662ea..53f738698 100644
--- a/ai-ml/bionemo/terraform/blueprint.tfvars
+++ b/ai-ml/bionemo/terraform/blueprint.tfvars
@@ -1,3 +1,3 @@
 name = "bionemo-on-eks"
-enable_aws_fsx_csi_driver= "true"
-deploy_fsx_volume= "true"
+enable_aws_fsx_csi_driver = true
+deploy_fsx_volume = true

From 7898ffa50b2dcc0ffb2527a8c2d3ae253aadbb5c Mon Sep 17 00:00:00 2001
From: omrishiv <327609+omrishiv@users.noreply.github.com>
Date: Thu, 20 Feb 2025 12:38:53 -0800
Subject: [PATCH 16/16] addressing some review comments

---
 ai-ml/infrastructure/terraform/eks.tf         | 96 -------------------
 .../jupyterhub-values-cognito.yaml            |  3 -
 .../helm-values/jupyterhub-values-dummy.yaml  |  3 -
 .../helm-values/jupyterhub-values-oauth.yaml  |  3 -
 .../monitoring/neuron-monitor-daemonset.yaml  |  4 +
 ai-ml/jupyterhub/terraform/blueprint.tfvars   |  5 +-
 6 files changed, 6 insertions(+), 108 deletions(-)

diff --git a/ai-ml/infrastructure/terraform/eks.tf b/ai-ml/infrastructure/terraform/eks.tf
index 169b19bac..26ac6bffa 100644
--- a/ai-ml/infrastructure/terraform/eks.tf
+++ b/ai-ml/infrastructure/terraform/eks.tf
@@ -118,101 +118,5 @@ module "eks" {
         Name = "core-node-grp"
       })
     }
-
-#     # GPU Nodegroup for JupyterHub Notebook and Ray Service
-#     gpu1 = {
-#       name        = "gpu-node-grp"
-#       description = "EKS Node Group to run GPU workloads"
-#       # Filtering only Secondary CIDR private subnets starting with "100.".
-#       # Subnet IDs where the nodes/node groups will be provisioned
-#       subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-#         substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-#       )
-#
-#       ami_type     = "AL2_x86_64_GPU"
-#       min_size     = 0
-#       max_size     = 1
-#       desired_size = 0
-#
-#       instance_types = ["g5.12xlarge"]
-#
-#       labels = {
-#         WorkerType    = "ON_DEMAND"
-#         NodeGroupType = "gpu"
-#       }
-#
-#       taints = {
-#         gpu = {
-#           key      = "nvidia.com/gpu"
-#           effect   = "NO_SCHEDULE"
-#           operator = "EXISTS"
-#         }
-#       }
-#
-#       tags = merge(local.tags, {
-#         Name = "gpu-node-grp"
-#       })
-#     }
-
-    # # This nodegroup can be used for P4/P5 instances with, or without, a Capacity Reservation.
-    # #
-    # gpu_p5_node_group = {
-    #   name        = "p5-gpu-node-grp"
-    #   description = "EKS Node Group to run GPU workloads"
-
-    #   ami_type     = "AL2_x86_64_GPU"
-
-    #   instance_types = ["p5.48xlarge"]
-    #   capacity_type = "ON_DEMAND"
-
-    #   # Filtering only Secondary CIDR private subnets starting with "100.".
-    #   # Subnet IDs where the nodes/node groups will be provisioned
-    #   subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
-    #     substr(cidr_block, 0, 4) == "100." ? subnet_id : null]
-    #   )
-
-    #   # If you are using a Capacity Reservation, the Subnet for the instances must match AZ for the reservation.
-    #   # subnet_ids = ["subnet-01234567890fds"]
-    #   # capacity_reservation_specification = {
-    #   #   capacity_reservation_target = {
-    #   #     capacity_reservation_id = "cr-01234567890fds"
-    #   #   }
-    #   # }
-
-    #   min_size     = 1
-    #   max_size     = 1
-    #   desired_size = 1
-
-    #   # The P Series can leverage EFA devices, below we attach EFA interfaces to all of the available slots to the instance
-    #   # we assign the host interface device_index=0, and all other interfaces device_index=1
-    #   #   p5.48xlarge has 32 network card indexes so the range should be 31, we'll create net interfaces 0-31
-    #   #   p4 instances have 4 network card indexes so the range should be 4, we'll create Net interfaces 0-3
-    #   network_interfaces = [
-    #     for i in range(32) : {
-    #       associate_public_ip_address = false
-    #       delete_on_termination       = true
-    #       device_index                = i == 0 ? 0 : 1
-    #       network_card_index          = i
-    #       interface_type              = "efa"
-    #     }
-    #   ]
-
-    #   # add `--local-disks raid0` to use the NVMe devices underneath the Pods, kubelet, containerd, and logs: https://github.com/awslabs/amazon-eks-ami/pull/1171
-    #   bootstrap_extra_args = "--local-disks raid0"
-    #   taints = {
-    #     gpu = {
-    #       key      = "nvidia.com/gpu"
-    #       effect   = "NO_SCHEDULE"
-    #       operator = "EXISTS"
-    #     }
-    #   }
-    #   labels = {
-    #     WorkerType    = "ON_DEMAND"
-    #     NodeGroupType = "gpu"
-    #   }
-    #   tags = merge(local.tags, {
-    #     Name = "p5-gpu-node-grp"
-    #   })
-    # }
   }
 }
diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml
index 4e2073836..aab0e8b9c 100755
--- a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml
+++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml
@@ -249,9 +249,6 @@ scheduling:
   userPlaceholder:
     enabled: false
     replicas: 1
-#  userPods:
-#    nodeAffinity:
-#      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
 
 prePuller:
   hook:
diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml
index 0d1fcdc4e..d13fa4126 100755
--- a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml
+++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml
@@ -204,9 +204,6 @@ scheduling:
   userPlaceholder:
     enabled: false
     replicas: 1
-#  userPods:
-#    nodeAffinity:
-#      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
 
 prePuller:
   hook:
diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml
index 486a750a8..bf53eca66 100755
--- a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml
+++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml
@@ -217,9 +217,6 @@ scheduling:
   userPlaceholder:
     enabled: false
     replicas: 1
-  userPods:
-    nodeAffinity:
-      matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner
 
 prePuller:
   hook:
diff --git a/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml
index 2ed065546..bffd6558a 100644
--- a/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml
+++ b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml
@@ -38,5 +38,9 @@ spec:
             value: 160MiB
           securityContext:
             privileged: true
+      tolerations:
+        - key: aws.amazon.com/neuron
+          operator: Exists
+          effect: NoSchedule
       nodeSelector:
         accelerator: neuron
diff --git a/ai-ml/jupyterhub/terraform/blueprint.tfvars b/ai-ml/jupyterhub/terraform/blueprint.tfvars
index 4b9562b12..177a52b35 100644
--- a/ai-ml/jupyterhub/terraform/blueprint.tfvars
+++ b/ai-ml/jupyterhub/terraform/blueprint.tfvars
@@ -1,6 +1,5 @@
-name = "jark-stack"
+name = "jupyterhub"
 enable_aws_efs_csi_driver = true
 enable_aws_cloudwatch_metrics = true
 enable_jupyterhub = true
-enable_volcano = true
-enable_kuberay_operator = true
+