From 2a6c6db5dd66107407d75881a2db755d65829935 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 24 Jan 2025 15:31:00 -0800 Subject: [PATCH 01/16] consolidate ai blueprints --- ai-ml/infrastructure/terraform/addons.tf | 653 +++ ai-ml/infrastructure/terraform/cleanup.sh | 71 + ai-ml/infrastructure/terraform/eks.tf | 212 + .../terraform/fsx-for-lustre.tf | 138 + .../fsx-for-lustre/fsxlustre-static-pv.yaml | 21 + .../fsx-for-lustre/fsxlustre-static-pvc.yaml | 12 + .../fsxlustre-storage-class.yaml | 9 + .../helm-values/argo-events-values.yaml | 4 + .../helm-values/argo-workflows-values.yaml | 4 + .../aws-cloudwatch-metrics-values.yaml | 11 + .../aws-efa-k8s-device-plugin-values.yaml | 5 + .../helm-values/ingress-nginx-values.yaml | 11 + .../helm-values/jupyterhub-values.yaml | 54 + .../helm-values/kube-prometheus.yaml | 48 + .../helm-values/kubecost-values.yaml | 69 + .../helm-values/mlflow-tracking-values.yaml | 88 + ai-ml/infrastructure/terraform/install.sh | 33 + ai-ml/infrastructure/terraform/karpenter.tf | 0 ai-ml/infrastructure/terraform/main.tf | 60 + ai-ml/infrastructure/terraform/mlflow-core.tf | 245 + .../terraform/monitoring/dcgm.yaml | 82 + .../monitoring/neuron-monitor-daemonset.yaml | 42 + .../terraform/monitoring/podMonitor.yaml | 21 + .../data_grafana_dashboard.json | 4535 +++++++++++++++++ .../default_grafana_dashboard.json | 2836 +++++++++++ .../serve_deployment_grafana_dashboard.json | 2115 ++++++++ .../serve_grafana_dashboard.json | 3098 +++++++++++ .../terraform/monitoring/serviceMonitor.yaml | 25 + ai-ml/infrastructure/terraform/outputs.tf | 9 + ai-ml/infrastructure/terraform/variables.tf | 107 + ai-ml/infrastructure/terraform/versions.tf | 33 + ai-ml/infrastructure/terraform/vpc.tf | 62 + 32 files changed, 14713 insertions(+) create mode 100644 ai-ml/infrastructure/terraform/addons.tf create mode 100755 ai-ml/infrastructure/terraform/cleanup.sh create mode 100644 ai-ml/infrastructure/terraform/eks.tf 
create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre.tf create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml create mode 100644 ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml create mode 100755 ai-ml/infrastructure/terraform/install.sh create mode 100644 ai-ml/infrastructure/terraform/karpenter.tf create mode 100644 ai-ml/infrastructure/terraform/main.tf create mode 100644 ai-ml/infrastructure/terraform/mlflow-core.tf create mode 100644 ai-ml/infrastructure/terraform/monitoring/dcgm.yaml create mode 100644 ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml create mode 100644 ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml create mode 100644 ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json create mode 100644 ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json create mode 100644 ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json create mode 100644 
ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json create mode 100644 ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml create mode 100644 ai-ml/infrastructure/terraform/outputs.tf create mode 100644 ai-ml/infrastructure/terraform/variables.tf create mode 100644 ai-ml/infrastructure/terraform/versions.tf create mode 100644 ai-ml/infrastructure/terraform/vpc.tf diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf new file mode 100644 index 000000000..f8ae300cc --- /dev/null +++ b/ai-ml/infrastructure/terraform/addons.tf @@ -0,0 +1,653 @@ +#--------------------------------------------------------------- +# GP3 Encrypted Storage Class +#--------------------------------------------------------------- +resource "kubernetes_annotations" "disable_gp2" { + annotations = { + "storageclass.kubernetes.io/is-default-class" : "false" + } + api_version = "storage.k8s.io/v1" + kind = "StorageClass" + metadata { + name = "gp2" + } + force = true + + depends_on = [module.eks.eks_cluster_id] +} + +resource "kubernetes_storage_class" "default_gp3" { + metadata { + name = "gp3" + annotations = { + "storageclass.kubernetes.io/is-default-class" : "true" + } + } + + storage_provisioner = "ebs.csi.aws.com" + reclaim_policy = "Delete" + allow_volume_expansion = true + volume_binding_mode = "WaitForFirstConsumer" + parameters = { + fsType = "ext4" + encrypted = true + type = "gp3" + } + + depends_on = [kubernetes_annotations.disable_gp2] +} + +#--------------------------------------------------------------- +# IRSA for EBS CSI Driver +#--------------------------------------------------------------- +module "ebs_csi_driver_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "~> 5.20" + role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") + attach_ebs_csi_policy = true + oidc_providers = { + main = { + provider_arn = 
module.eks.oidc_provider_arn + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + tags = local.tags +} + +#--------------------------------------------------------------- +# EKS Blueprints Addons +#--------------------------------------------------------------- +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.2" + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + #--------------------------------------- + # Amazon EKS Managed Add-ons + #--------------------------------------- + eks_addons = { + aws-ebs-csi-driver = { + service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn + } + coredns = { + preserve = true + } + kube-proxy = { + preserve = true + } + # VPC CNI uses worker node IAM role policies + vpc-cni = { + preserve = true + } + } + + #--------------------------------------- + # AWS Load Balancer Controller Add-on + #--------------------------------------- + enable_aws_load_balancer_controller = true + # turn off the mutating webhook for services because we are using + # service.beta.kubernetes.io/aws-load-balancer-type: external + aws_load_balancer_controller = { + set = [{ + name = "enableServiceMutatorWebhook" + value = "false" + }] + } + + #--------------------------------------- + # Ingress Nginx Add-on + #--------------------------------------- + enable_ingress_nginx = true + ingress_nginx = { + values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] + } + + #--------------------------------------- + # Karpenter Autoscaler for EKS Cluster + #--------------------------------------- + enable_karpenter = true + karpenter_enable_spot_termination = true + karpenter_node = { + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + 
karpenter = { + chart_version = "0.37.0" + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + source_policy_documents = [ + data.aws_iam_policy_document.karpenter_controller_policy.json + ] + } + + #--------------------------------------- + # Argo Workflows & Argo Events + #--------------------------------------- + enable_argo_workflows = var.enable_argo_workflows + argo_workflows = { + name = "argo-workflows" + namespace = "argo-workflows" + repository = "https://argoproj.github.io/argo-helm" + values = [templatefile("${path.module}/helm-values/argo-workflows-values.yaml", {})] + } + + enable_argo_events = var.enable_argo_events + argo_events = { + name = "argo-events" + namespace = "argo-events" + repository = "https://argoproj.github.io/argo-helm" + values = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})] + } + + #--------------------------------------- + # Prometheus and Grafana stack + #--------------------------------------- + #--------------------------------------------------------------- + # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` + # 2- Grafana Admin user: admin + # 3- Get secret name from Terraform output: `terraform output grafana_secret_name` + # 4- Get admin user password: `aws secretsmanager get-secret-value --secret-id "$(terraform output -raw grafana_secret_name)" --region $AWS_REGION --query "SecretString" --output text` + #--------------------------------------------------------------- + enable_kube_prometheus_stack = var.enable_kube_prometheus_stack + kube_prometheus_stack = { + values = [ + templatefile("${path.module}/helm-values/kube-prometheus.yaml", { + storage_class_type = kubernetes_storage_class.default_gp3.id + }) + ] + chart_version = "48.1.1" + set_sensitive = [ + { + name = "grafana.adminPassword" + value = 
data.aws_secretsmanager_secret_version.admin_password_version.secret_string + } + ], + } + + #--------------------------------------- + # Enable FSx for Lustre CSI Driver (required whenever the FSx volume is deployed; the EFA flag is kept as a trigger for backward compatibility) + #--------------------------------------- + enable_aws_fsx_csi_driver = var.deploy_fsx_volume || var.enable_aws_efa_k8s_device_plugin + + tags = local.tags + + #--------------------------------------- + # CloudWatch metrics for EKS + #--------------------------------------- + enable_aws_cloudwatch_metrics = var.enable_aws_cloudwatch_metrics + aws_cloudwatch_metrics = { + values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})] + } + +} + +#--------------------------------------------------------------- +# Data on EKS Kubernetes Addons +#--------------------------------------------------------------- + +module "data_addons" { + source = "aws-ia/eks-data-addons/aws" + version = "1.33.0" + + oidc_provider_arn = module.eks.oidc_provider_arn + + #--------------------------------------------------------------- + # JupyterHub Add-on + #--------------------------------------------------------------- + enable_jupyterhub = var.enable_jupyterhub + jupyterhub_helm_config = { + namespace = kubernetes_namespace_v1.jupyterhub.id + create_namespace = false + values = [file("${path.module}/helm-values/jupyterhub-values.yaml")] + } + + enable_volcano = var.enable_volcano + #--------------------------------------- + # Kuberay Operator + #--------------------------------------- + enable_kuberay_operator = var.enable_kuberay_operator + kuberay_operator_helm_config = { + version = "1.1.1" + # Enabling Volcano as Batch scheduler for KubeRay Operator + values = [ + <<-EOT + batchScheduler: + enabled: true + EOT + ] + } + + #--------------------------------------------------------------- + # NVIDIA Device Plugin Add-on + #--------------------------------------------------------------- + enable_nvidia_device_plugin = true + nvidia_device_plugin_helm_config = { + version = "v0.16.1" + name = 
"nvidia-device-plugin" + values = [ + <<-EOT + nodeSelector: + accelerator: nvidia + gfd: + enabled: true + nfd: + gc: + nodeSelector: + accelerator: nvidia + topologyUpdater: + nodeSelector: + accelerator: nvidia + worker: + nodeSelector: + accelerator: nvidia + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - operator: "Exists" + EOT + ] + } + + #--------------------------------------- + # EFA Device Plugin Add-on + #--------------------------------------- + # IMPORTANT: Enable EFA only on nodes with EFA devices attached. + # Otherwise, you'll encounter the "No devices found..." error. Restart the pod after attaching an EFA device, or use a node selector to prevent incompatible scheduling. + enable_aws_efa_k8s_device_plugin = var.enable_aws_efa_k8s_device_plugin + aws_efa_k8s_device_plugin_helm_config = { + values = [file("${path.module}/helm-values/aws-efa-k8s-device-plugin-values.yaml")] + } + + #--------------------------------------------------------------- + # Kubecost Add-on + #--------------------------------------------------------------- + enable_kubecost = var.enable_kubecost + kubecost_helm_config = { + values = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})] + version = "2.2.2" + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + + #--------------------------------------------------------------- + # Neuron Add-on + #--------------------------------------------------------------- + enable_aws_neuron_device_plugin = true + + aws_neuron_device_plugin_helm_config = { + # Enable default scheduler + values = [ + <<-EOT + devicePlugin: + tolerations: + - key: CriticalAddonsOnly + operator: Exists + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + - key: hub.jupyter.org/dedicated + operator: Exists + effect: NoSchedule + scheduler: + enabled: true + npd: + enabled: false + EOT 
+ ] + } + + #--------------------------------------------------------------- + # Karpenter Resources Add-on + #--------------------------------------------------------------- + enable_karpenter_resources = true + karpenter_resources_helm_config = { + + g5-gpu-karpenter = { + values = [ + <<-EOT + name: g5-gpu-karpenter + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + amiFamily: Bottlerocket + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[2]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + instanceStorePolicy: RAID0 + blockDeviceMappings: + # Root device + - deviceName: /dev/xvda + ebs: + volumeSize: 50Gi + volumeType: gp3 + encrypted: true + # Data device: Container resources such as images and logs + - deviceName: /dev/xvdb + ebs: + volumeSize: 300Gi + volumeType: gp3 + encrypted: true + ${var.bottlerocket_data_disk_snapshot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""} + + nodePool: + labels: + - instanceType: g5-gpu-karpenter + - type: karpenter + - accelerator: nvidia + taints: + - key: nvidia.com/gpu + value: "Exists" + effect: "NoSchedule" + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["g5"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: [ "2xlarge", "4xlarge", "8xlarge" ] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 300s + expireAfter: 720h + weight: 100 + EOT + ] + } + x86-cpu-karpenter = { + values = [ + <<-EOT + name: x86-cpu-karpenter + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + amiFamily: Bottlerocket + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + 
id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + blockDeviceMappings: + # Root device + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + # Data device: Container resources such as images and logs + - deviceName: /dev/xvdb + ebs: + volumeSize: 300Gi + volumeType: gp3 + encrypted: true + ${var.bottlerocket_data_disk_snapshot_id != null ? "snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""} + + nodePool: + labels: + - type: karpenter + - instanceType: x86-cpu-karpenter + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["m5"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 300s + expireAfter: 720h + weight: 100 + EOT + ] + } + trainium-trn1 = { + values = [ + <<-EOT + name: trainium-trn1 + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + amiSelectorTerms: + - alias: al2023@v20241024 + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[2]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + instanceStorePolicy: RAID0 + blockDeviceMappings: + # Root device + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + # Data device: Container resources such as images and logs + - deviceName: /dev/xvdb + ebs: + volumeSize: 300Gi + volumeType: gp3 + encrypted: true + ${var.bottlerocket_data_disk_snapshot_id != null ? 
"snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""} + + nodePool: + labels: + - type: karpenter + - instanceType: trainium-trn1 + - accelerator: neuron + taints: + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["trn1"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["on-demand"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 300s + expireAfter: 720h + weight: 100 + EOT + ] + } + inferentia-inf2 = { + values = [ + <<-EOT + name: inferentia-inf2 + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + amiSelectorTerms: + - alias: al2023@v20241024 + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[2]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + blockDeviceMappings: + # Root device + - deviceName: /dev/xvda + ebs: + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + # Data device: Container resources such as images and logs + - deviceName: /dev/xvdb + ebs: + volumeSize: 300Gi + volumeType: gp3 + encrypted: true + ${var.bottlerocket_data_disk_snapshot_id != null ? 
"snapshotID: ${var.bottlerocket_data_disk_snapshot_id}" : ""} + nodePool: + labels: + - instanceType: inferentia-inf2 + - type: karpenter + - accelerator: neuron + taints: + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["inf2"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: [ "on-demand"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 300s + expireAfter: 720h + weight: 100 + EOT + ] + } + } + + depends_on = [ + kubernetes_secret_v1.huggingface_token, + kubernetes_config_map_v1.notebook + ] +} + + +#--------------------------------------------------------------- +# Additional Resources +#--------------------------------------------------------------- + +resource "kubernetes_namespace_v1" "jupyterhub" { + metadata { + name = "jupyterhub" + } +} + + +resource "kubernetes_secret_v1" "huggingface_token" { + metadata { + name = "hf-token" + namespace = kubernetes_namespace_v1.jupyterhub.id + } + + data = { + token = var.huggingface_token + } +} + +resource "kubernetes_config_map_v1" "notebook" { + metadata { + name = "notebook" + namespace = kubernetes_namespace_v1.jupyterhub.id + } +} + +#--------------------------------------------------------------- +# MLflow Tracking Add-on +#--------------------------------------------------------------- +module "eks_data_addons" { + source = "aws-ia/eks-data-addons/aws" + version = "1.33.0" # ensure to update this to the latest/desired version + + oidc_provider_arn = module.eks.oidc_provider_arn + enable_mlflow_tracking = var.enable_mlflow_tracking + + mlflow_tracking_helm_config = { + mlflow_namespace = try(kubernetes_namespace_v1.mlflow[0].metadata[0].name, local.mlflow_namespace) + + values = [ + templatefile("${path.module}/helm-values/mlflow-tracking-values.yaml", { + mlflow_sa = 
local.mlflow_service_account + mlflow_irsa = try(module.mlflow_irsa[0].iam_role_arn, "") + # MLflow Postgres RDS Config + mlflow_db_username = local.mlflow_name + mlflow_db_password = try(sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string), "") + mlflow_db_name = try(module.db[0].db_instance_name, "") + mlflow_db_host = try(element(split(":", module.db[0].db_instance_endpoint), 0), "") + # S3 bucket config for artifacts + s3_bucket_name = try(module.mlflow_s3_bucket[0].s3_bucket_id, "") + }) + ] + } +} + +#--------------------------------------------------------------- +# Grafana Admin credentials resources +# Login to AWS secrets manager with the same role as Terraform to extract the Grafana admin password with the secret name as "grafana" +#--------------------------------------------------------------- +data "aws_secretsmanager_secret_version" "admin_password_version" { + secret_id = aws_secretsmanager_secret.grafana.id + depends_on = [aws_secretsmanager_secret_version.grafana] +} + +resource "random_password" "grafana" { + length = 16 + special = true + override_special = "@_" +} + +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "grafana" { + name_prefix = "${local.name}-oss-grafana" + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "grafana" { + secret_id = aws_secretsmanager_secret.grafana.id + secret_string = random_password.grafana.result +} + +resource "kubectl_manifest" "neuron_monitor" { + yaml_body = file("${path.module}/monitoring/neuron-monitor-daemonset.yaml") +} + +resource "kubectl_manifest" "dcgm" { + yaml_body = file("${path.module}/monitoring/dcgm.yaml") +} + +data "aws_iam_policy_document" "karpenter_controller_policy" { + statement { + actions = [ + "ec2:RunInstances", + "ec2:CreateLaunchTemplate", + ] + resources = ["*"] + effect = "Allow" + sid = "KarpenterControllerAdditionalPolicy" + 
} +} diff --git a/ai-ml/infrastructure/terraform/cleanup.sh b/ai-ml/infrastructure/terraform/cleanup.sh new file mode 100755 index 000000000..b09efd384 --- /dev/null +++ b/ai-ml/infrastructure/terraform/cleanup.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +echo "Destroying RayService..." + +# Cluster name matched against the elbv2.k8s.aws/cluster tag; override by exporting CLUSTER_NAME +CLUSTER_NAME="${CLUSTER_NAME:-jark-stack}" + +# Delete the Ingress/SVC before removing the addons +TMPFILE=$(mktemp) +# Only talk to the cluster when terraform actually returned a kubeconfig command +if terraform output -raw configure_kubectl > "$TMPFILE" && [[ -s "$TMPFILE" && $(cat "$TMPFILE") != *"No outputs found"* ]]; then + source "$TMPFILE" + kubectl delete -f src/service/ray-service.yaml +else + echo "No outputs found, skipping kubectl delete" +fi +rm -f "$TMPFILE" + + +# List of Terraform modules to destroy in sequence +targets=( + "module.data_addons" + "module.eks_blueprints_addons" + "module.eks" + "module.vpc" +) + +# Destroy modules in sequence +for target in "${targets[@]}" +do + echo "Destroying module $target..." + destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1 | tee /dev/tty) + if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then + echo "SUCCESS: Terraform destroy of $target completed successfully" + else + echo "FAILED: Terraform destroy of $target failed" + exit 1 + fi +done + +echo "Destroying Load Balancers..." + +for arn in $(aws resourcegroupstaggingapi get-resources \ + --resource-type-filters elasticloadbalancing:loadbalancer \ + --tag-filters "Key=elbv2.k8s.aws/cluster,Values=${CLUSTER_NAME}" \ + --query 'ResourceTagMappingList[].ResourceARN' \ + --output text); do \ + aws elbv2 delete-load-balancer --load-balancer-arn "$arn"; \ + done + +echo "Destroying Target Groups..." 
+for arn in $(aws resourcegroupstaggingapi get-resources \ + --resource-type-filters elasticloadbalancing:targetgroup \ + --tag-filters "Key=elbv2.k8s.aws/cluster,Values=${CLUSTER_NAME}" \ + --query 'ResourceTagMappingList[].ResourceARN' \ + --output text); do \ + aws elbv2 delete-target-group --target-group-arn "$arn"; \ + done + +echo "Destroying Security Groups..." +for sg in $(aws ec2 describe-security-groups \ + --filters "Name=tag:elbv2.k8s.aws/cluster,Values=${CLUSTER_NAME}" \ + --query 'SecurityGroups[].GroupId' --output text); do \ + aws ec2 delete-security-group --group-id "$sg"; \ + done + +## Final destroy to catch any remaining resources +echo "Destroying remaining resources..." +# Pass -var only when $region is set; an empty value would override the variable's default +destroy_output=$(terraform destroy ${region:+-var="region=$region"} -auto-approve 2>&1 | tee /dev/tty) +if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then + echo "SUCCESS: Terraform destroy of all modules completed successfully" +else + echo "FAILED: Terraform destroy of all modules failed" + exit 1 +fi diff --git a/ai-ml/infrastructure/terraform/eks.tf b/ai-ml/infrastructure/terraform/eks.tf new file mode 100644 index 000000000..3543232ec --- /dev/null +++ b/ai-ml/infrastructure/terraform/eks.tf @@ -0,0 +1,212 @@ +#--------------------------------------------------------------- +# EKS Cluster +#--------------------------------------------------------------- +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 19.15" + + cluster_name = local.name + cluster_version = var.eks_cluster_version + + # if true, Your cluster API server is accessible from the internet. + # You can, optionally, limit the CIDR blocks that can access the public endpoint. + #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. 
+ cluster_endpoint_public_access = true + + vpc_id = module.vpc.vpc_id + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the EKS Control Plane ENIs will be created + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) + + manage_aws_auth_configmap = true + aws_auth_roles = [ + # We need to add in the Karpenter node IAM role for nodes launched by Karpenter + { + rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn + username = "system:node:{{EC2PrivateDNSName}}" + groups = [ + "system:bootstrappers", + "system:nodes", + ] + } + ] + #--------------------------------------- + # Note: This can further restricted to specific required for each Add-on and your application + #--------------------------------------- + # Extend cluster security group rules + cluster_security_group_additional_rules = { + ingress_nodes_ephemeral_ports_tcp = { + description = "Nodes on ephemeral ports" + protocol = "tcp" + from_port = 0 + to_port = 65535 + type = "ingress" + source_node_security_group = true + } + } + + node_security_group_additional_rules = { + # Allows Control Plane Nodes to talk to Worker nodes on all ports. + # Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. + # This can be restricted further to specific port based on the requirement for each Add-on + # e.g., coreDNS 53, metrics-server 4443. 
+ # Update this according to your security requirements if needed + ingress_cluster_to_node_all_traffic = { + description = "Cluster API to Nodegroup all traffic" + protocol = "-1" + from_port = 0 + to_port = 0 + type = "ingress" + source_cluster_security_group = true + } + } + + eks_managed_node_group_defaults = { + iam_role_additional_policies = { + # Not required, but used in the example to access the nodes to inspect mounted volumes + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + + ebs_optimized = true + # This block device is used only for root volume. Adjust volume according to your size. + # NOTE: Don't use this volume for ML workloads + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 100 + volume_type = "gp3" + } + } + } + } + + eks_managed_node_groups = { + # It's recommended to have a Managed Node group for hosting critical add-ons + # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups + # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes. + core_node_group = { + name = "core-node-group" + description = "EKS Core node group for hosting system add-ons" + # Filtering only Secondary CIDR private subnets starting with "100.". + # Subnet IDs where the nodes/node groups will be provisioned + subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null] + ) + + # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2 + ami_type = "AL2_x86_64" # Use this for Graviton AL2_ARM_64 + min_size = 2 + max_size = 8 + desired_size = 2 + + instance_types = ["m5.xlarge"] + + labels = { + WorkerType = "ON_DEMAND" + NodeGroupType = "core" + } + + tags = merge(local.tags, { + Name = "core-node-grp" + }) + } + +# # GPU Nodegroup for JupyterHub Notebook and Ray Service +# gpu1 = { +# name = "gpu-node-grp" +# description = "EKS Node Group to run GPU workloads" +# # Filtering only Secondary CIDR private subnets starting with "100.". +# # Subnet IDs where the nodes/node groups will be provisioned +# subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : +# substr(cidr_block, 0, 4) == "100." ? subnet_id : null] +# ) +# +# ami_type = "AL2_x86_64_GPU" +# min_size = 0 +# max_size = 1 +# desired_size = 0 +# +# instance_types = ["g5.12xlarge"] +# +# labels = { +# WorkerType = "ON_DEMAND" +# NodeGroupType = "gpu" +# } +# +# taints = { +# gpu = { +# key = "nvidia.com/gpu" +# effect = "NO_SCHEDULE" +# operator = "EXISTS" +# } +# } +# +# tags = merge(local.tags, { +# Name = "gpu-node-grp" +# }) +# } + + # # This nodegroup can be used for P4/P5 instances with, or without, a Capacity Reservation. + # # + # gpu_p5_node_group = { + # name = "p5-gpu-node-grp" + # description = "EKS Node Group to run GPU workloads" + + # ami_type = "AL2_x86_64_GPU" + + # instance_types = ["p5.48xlarge"] + # capacity_type = "ON_DEMAND" + + # # Filtering only Secondary CIDR private subnets starting with "100.". + # # Subnet IDs where the nodes/node groups will be provisioned + # subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : + # substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null] + # ) + + # # If you are using a Capacity Reservation, the Subnet for the instances must match AZ for the reservation. + # # subnet_ids = ["subnet-01234567890fds"] + # # capacity_reservation_specification = { + # # capacity_reservation_target = { + # # capacity_reservation_id = "cr-01234567890fds" + # # } + # # } + + # min_size = 1 + # max_size = 1 + # desired_size = 1 + + # # The P Series can leverage EFA devices, below we attach EFA interfaces to all of the available slots to the instance + # # we assign the host interface device_index=0, and all other interfaces device_index=1 + # # p5.48xlarge has 32 network card indexes so the range should be 31, we'll create net interfaces 0-31 + # # p4 instances have 4 network card indexes so the range should be 4, we'll create Net interfaces 0-3 + # network_interfaces = [ + # for i in range(32) : { + # associate_public_ip_address = false + # delete_on_termination = true + # device_index = i == 0 ? 0 : 1 + # network_card_index = i + # interface_type = "efa" + # } + # ] + + # # add `--local-disks raid0` to use the NVMe devices underneath the Pods, kubelet, containerd, and logs: https://github.com/awslabs/amazon-eks-ami/pull/1171 + # bootstrap_extra_args = "--local-disks raid0" + # taints = { + # gpu = { + # key = "nvidia.com/gpu" + # effect = "NO_SCHEDULE" + # operator = "EXISTS" + # } + # } + # labels = { + # WorkerType = "ON_DEMAND" + # NodeGroupType = "gpu" + # } + # tags = merge(local.tags, { + # Name = "p5-gpu-node-grp" + # }) + # } + } +} diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre.tf b/ai-ml/infrastructure/terraform/fsx-for-lustre.tf new file mode 100644 index 000000000..d97eef9b0 --- /dev/null +++ b/ai-ml/infrastructure/terraform/fsx-for-lustre.tf @@ -0,0 +1,138 @@ +#--------------------------------------------------------------- +# FSx for Lustre File system Static provisioning +# 1> Create Fsx for Lustre filesystem (Lustre FS storage capacity must be 1200, 2400, or a multiple of 
3600) +# 2> Create Storage Class for Filesystem (Cluster scoped) +# 3> Persistent Volume with Hardcoded reference to Fsx for Lustre filesystem with filesystem_id and dns_name (Cluster scoped) +# 4> Persistent Volume claim for this persistent volume will always use the same file system (Namespace scoped) +#--------------------------------------------------------------- +# NOTE: FSx for Lustre file system creation can take up to 10 mins +resource "aws_fsx_lustre_file_system" "this" { + count = var.deploy_fsx_volume ? 1 : 0 + deployment_type = "PERSISTENT_2" + storage_type = "SSD" + per_unit_storage_throughput = "500" # 125, 250, 500, 1000 + storage_capacity = 2400 + + subnet_ids = [module.vpc.private_subnets[0]] + security_group_ids = [aws_security_group.fsx[0].id] + log_configuration { + level = "WARN_ERROR" + } + tags = merge({ "Name" : "${local.name}-static" }, local.tags) +} + +# This process can take up to 7 mins +resource "aws_fsx_data_repository_association" "this" { + count = var.deploy_fsx_volume ? 1 : 0 + file_system_id = aws_fsx_lustre_file_system.this[0].id + data_repository_path = "s3://${module.fsx_s3_bucket[0].s3_bucket_id}" + file_system_path = "/data" # This directory will be used in Spark podTemplates under volumeMounts as subPath + + s3 { + auto_export_policy { + events = ["NEW", "CHANGED", "DELETED"] + } + + auto_import_policy { + events = ["NEW", "CHANGED", "DELETED"] + } + } +} + +#--------------------------------------------------------------- +# Sec group for FSx for Lustre +#--------------------------------------------------------------- +resource "aws_security_group" "fsx" { + count = var.deploy_fsx_volume ? 
1 : 0 + name = "${local.name}-fsx" + description = "Allow inbound traffic from private subnets of the VPC to FSx filesystem" + vpc_id = module.vpc.vpc_id + + ingress { + description = "Allows Lustre traffic between Lustre clients" + cidr_blocks = module.vpc.private_subnets_cidr_blocks + from_port = 1021 + to_port = 1023 + protocol = "tcp" + } + ingress { + description = "Allows Lustre traffic between Lustre clients" + cidr_blocks = module.vpc.private_subnets_cidr_blocks + from_port = 988 + to_port = 988 + protocol = "tcp" + } + tags = local.tags +} + +#--------------------------------------------------------------- +# S3 bucket for DataSync between FSx for Lustre and S3 Bucket +#--------------------------------------------------------------- +#tfsec:ignore:aws-s3-enable-bucket-logging tfsec:ignore:aws-s3-enable-versioning +module "fsx_s3_bucket" { + count = var.deploy_fsx_volume ? 1 : 0 + source = "terraform-aws-modules/s3-bucket/aws" + version = "~> 3.0" + + create_bucket = true + + bucket_prefix = "${local.name}-fsx-" + # For example only - please evaluate for your environment + force_destroy = true + + server_side_encryption_configuration = { + rule = { + apply_server_side_encryption_by_default = { + sse_algorithm = "AES256" + } + } + } +} + +#--------------------------------------------------------------- +# Storage Class - FSx for Lustre +#--------------------------------------------------------------- +resource "kubectl_manifest" "storage_class" { + count = var.deploy_fsx_volume ? 
1 : 0 + yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-storage-class.yaml", { + subnet_id = module.vpc.private_subnets[0], + security_group_id = aws_security_group.fsx[0].id + }) + + depends_on = [ + module.eks_blueprints_addons + ] +} + +#--------------------------------------------------------------- +# FSx for Lustre Persistent Volume - Static provisioning +#--------------------------------------------------------------- +resource "kubectl_manifest" "static_pv" { + count = var.deploy_fsx_volume ? 1 : 0 + yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pv.yaml", { + filesystem_id = aws_fsx_lustre_file_system.this[0].id, + dns_name = aws_fsx_lustre_file_system.this[0].dns_name + mount_name = aws_fsx_lustre_file_system.this[0].mount_name, + }) + + depends_on = [ + module.eks_blueprints_addons, + kubectl_manifest.storage_class, + aws_fsx_lustre_file_system.this + ] +} + +#--------------------------------------------------------------- +# FSx for Lustre Persistent Volume Claim +#--------------------------------------------------------------- +resource "kubectl_manifest" "static_pvc" { + count = var.deploy_fsx_volume ? 
1 : 0
+  yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pvc.yaml", {})
+
+  depends_on = [
+    module.eks_blueprints_addons,
+    kubectl_manifest.storage_class,
+    kubectl_manifest.static_pv,
+    aws_fsx_lustre_file_system.this
+  ]
+}
diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml
new file mode 100644
index 000000000..857bdcf3a
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pv.yaml
@@ -0,0 +1,23 @@
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: fsx-static-pv
+spec:
+  capacity:
+    storage: 1000Gi
+  volumeMode: Filesystem
+  storageClassName: fsx
+  accessModes:
+    - ReadWriteMany
+  mountOptions:
+    - flock
+  # Retain leaves the pre-provisioned FSx file system untouched when the claim is released.
+  # "Recycle" is deprecated in Kubernetes and was only ever supported for NFS and hostPath volumes.
+  persistentVolumeReclaimPolicy: Retain
+  csi:
+    driver: fsx.csi.aws.com
+    volumeHandle: ${filesystem_id}
+    volumeAttributes:
+      dnsname: ${dns_name}
+      mountname: ${mount_name}
diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml
new file mode 100644
index 000000000..dddebd66c
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-static-pvc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: fsx-static-pvc
+spec:
+  accessModes:
+    - ReadWriteMany
+  storageClassName: fsx
+  resources:
+    requests:
+      storage: 1000Gi
+  volumeName: fsx-static-pv
diff --git a/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml
new file mode 100644
index 000000000..125fb2478
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/fsx-for-lustre/fsxlustre-storage-class.yaml
@@ -0,0 +1,9 @@
+---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: fsx
+provisioner: fsx.csi.aws.com
+parameters:
+  subnetId: ${subnet_id}
+  securityGroupIds:
${security_group_id} diff --git a/ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml b/ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml new file mode 100644 index 000000000..de495c16a --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/argo-events-values.yaml @@ -0,0 +1,4 @@ +## Argo Events admission webhook +webhook: + # -- Enable admission webhook. Applies only for cluster-wide installation + enabled: true diff --git a/ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml b/ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml new file mode 100644 index 000000000..86c764042 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/argo-workflows-values.yaml @@ -0,0 +1,4 @@ +server: + autoscaling: + enabled: true + minReplicas: 1 diff --git a/ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml new file mode 100644 index 000000000..ae3c41d44 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/aws-cloudwatch-metrics-values.yaml @@ -0,0 +1,11 @@ +resources: + limits: + cpu: 500m + memory: 2Gi + requests: + cpu: 200m + memory: 1Gi + +# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. 
+tolerations: + - operator: Exists diff --git a/ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml b/ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml new file mode 100644 index 000000000..c214e10ba --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml @@ -0,0 +1,5 @@ +tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - operator: "Exists" diff --git a/ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml b/ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml new file mode 100644 index 000000000..c8b1a5d74 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/ingress-nginx-values.yaml @@ -0,0 +1,11 @@ +controller: + service: + externalTrafficPolicy: "Local" + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC + targetPorts: + http: http + https: http diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml new file mode 100644 index 000000000..03ce4b4be --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values.yaml @@ -0,0 +1,54 @@ +hub: + config: + Authenticator: + admin_users: + - admin1 + allowed_users: + - user1 + # testing only - do not do this for production + DummyAuthenticator: + password: never-do-this + JupyterHub: + authenticator_class: dummy +proxy: + service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC + 
service.beta.kubernetes.io/aws-load-balancer-type: external
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true'
+      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4
+singleuser:
+  image:
+    name: public.ecr.aws/h3o5n2r0/gpu-jupyter
+    tag: v1.5_cuda-11.6_ubuntu-20.04_python-only
+    pullPolicy: Always
+  cmd: null
+  startTimeout: 600
+  memory:
+    guarantee: 24G
+  extraResource:
+    limits:
+      nvidia.com/gpu: "1"
+  extraEnv:
+    HUGGING_FACE_HUB_TOKEN:
+      valueFrom:
+        secretKeyRef:
+          name: hf-token
+          key: token
+  storage:
+    capacity: 100Gi
+    extraVolumes:
+      - name: shm-volume
+        emptyDir:
+          medium: Memory
+    extraVolumeMounts:
+      - name: shm-volume
+        mountPath: /dev/shm
+  extraTolerations:
+    - key: nvidia.com/gpu
+      operator: Exists
+      effect: NoSchedule
+scheduling:
+  userScheduler:
+    enabled: false
diff --git a/ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml
new file mode 100644
index 000000000..47e090743
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus.yaml
@@ -0,0 +1,26 @@
+# Values for the kube-prometheus stack (Prometheus, Alertmanager, Grafana).
+# ${storage_class_type} is a template placeholder that must be substituted before use.
+prometheus:
+  prometheusSpec:
+    retention: 5h
+    scrapeInterval: 30s
+    evaluationInterval: 30s
+    scrapeTimeout: 10s
+    serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
+    storageSpec:
+      volumeClaimTemplate:
+        metadata:
+          name: data
+        spec:
+          storageClassName: ${storage_class_type}
+          accessModes:
+            - ReadWriteOnce
+          resources:
+            requests:
+              storage: 50Gi
+alertmanager:
+  enabled: false
+
+grafana:
+  enabled: true
+  defaultDashboardsEnabled: true
diff --git a/ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml b/ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml
new file mode 100644
index 000000000..178eb68cf
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/helm-values/kubecost-values.yaml
@@ -0,0 +1,69 @@
+
+# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090
+
+global:
+  # pricingCsv:
+  #   enabled: false
+  #   location:
+  #     provider: "AWS"
+  #     region: "us-east-1"
+  #     URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI
+  #     csvAccessCredentials: pricing-schema-access-secret
+
+  prometheus:
+    enabled: true # Kubecost depends on Prometheus data, it is not optional. When enabled: false, Prometheus will not be installed and you must configure your own Prometheus to scrape kubecost as well as provide the fqdn below. -- Warning: Before changing this setting, please read to understand the risks https://docs.kubecost.com/install-and-configure/install/custom-prom
+    fqdn: http://cost-analyzer-prometheus-server.default.svc # example address of a prometheus to connect to. Include protocol (http:// or https://) Ignored if enabled: true
+
+  grafana:
+    enabled: true # If false, Grafana will not be installed
+    domainName: cost-analyzer-grafana.default.svc # example grafana domain Ignored if enabled: true
+    scheme: "http" # http or https, for the domain name above.
+ proxy: true # If true, the kubecost frontend will route to your grafana through its service endpoint + +kubecostFrontend: + image: public.ecr.aws/kubecost/frontend + resources: + requests: + cpu: "200m" + memory: "512Mi" + +kubecostMetrics: + emitPodAnnotations: true + emitNamespaceAnnotations: true + +kubecostModel: + image: public.ecr.aws/kubecost/cost-model + resources: + requests: + cpu: "500m" + memory: "512Mi" + +forecasting: + fullImageName: public.ecr.aws/kubecost/kubecost-modeling:v0.1.6 + +networkCosts: + image: + repository: public.ecr.aws/kubecost/kubecost-network-costs + +clusterController: + image: + repository: public.ecr.aws/kubecost/cluster-controller + +prometheus: + server: + image: + repository: public.ecr.aws/kubecost/prometheus + + configmapReload: + prometheus: + image: + repository: public.ecr.aws/kubecost/prometheus-config-reloader + +reporting: + productAnalytics: false + +# Define persistence volume for cost-analyzer +persistentVolume: + size: 32Gi + dbSize: 32.0Gi + enabled: true # Note that setting this to false means configurations will be wiped out on pod restart. diff --git a/ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml b/ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml new file mode 100644 index 000000000..1f604f610 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/mlflow-tracking-values.yaml @@ -0,0 +1,88 @@ +# Default values for mlflow-tracking-server. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +image: + repository: public.ecr.aws/data-on-eks/mlflow + pullPolicy: Always + tag: 2.7.1 + +imagePullSecrets: [] + +nameOverride: mlflow-tracking-server + +fullnameOverride: mlflow-tracking-server + +podAnnotations: {} + +replicaCount: 1 + +service: + type: ClusterIP + port: 5000 + +serviceAccount: + # Specifies whether a service account should be created + create: false + # Annotations to add to the service account + annotations: + eks.amazonaws.com/role-arn: ${mlflow_irsa} + labels: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: ${mlflow_sa} + +ingress: + enabled: true + className: nginx + annotations: + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/use-regex: "true" + hosts: + - host: + paths: + - path: / + pathType: Prefix + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +mlflow: + artifacts: + bucketName: ${s3_bucket_name} + database: + name: ${mlflow_db_name} + username: ${mlflow_db_username} + password: ${mlflow_db_password} + host: ${mlflow_db_host} + port: 5432 + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+  # limits:
+  #   cpu: 100m
+  #   memory: 128Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 128Mi
+
+nodeSelector: {}
+
+tolerations: []
+
+affinity: {}
diff --git a/ai-ml/infrastructure/terraform/install.sh b/ai-ml/infrastructure/terraform/install.sh
new file mode 100755
index 000000000..1814a9044
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/install.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Applies the Terraform configuration in stages: the listed modules first,
+# then a final untargeted apply for everything else.
+
+# Propagate a failure from any stage of a pipeline, so that the status of
+# "terraform apply ... | tee" (also inside $(...)) reflects Terraform itself.
+set -o pipefail
+
+# List of Terraform modules to apply in sequence
+targets=(
+  "module.vpc"
+  "module.eks"
+)
+
+# Run "terraform apply -auto-approve" with any extra arguments, streaming the
+# output while capturing it. Succeeds only if Terraform exits 0 and reports
+# "Apply complete".
+apply_terraform() {
+  local apply_output
+  # tee to stderr instead of /dev/tty so the script also runs without a
+  # controlling terminal (for example in CI).
+  apply_output=$(terraform apply "$@" -auto-approve 2>&1 | tee /dev/stderr) || return 1
+  [[ $apply_output == *"Apply complete"* ]]
+}
+
+# Initialize Terraform
+if ! terraform init -upgrade; then
+  echo "FAILED: Terraform init failed"
+  exit 1
+fi
+
+# Apply modules in sequence
+for target in "${targets[@]}"
+do
+  echo "Applying module $target..."
+  if apply_terraform -target="$target"; then
+    echo "SUCCESS: Terraform apply of $target completed successfully"
+  else
+    echo "FAILED: Terraform apply of $target failed"
+    exit 1
+  fi
+done
+
+# Final apply to catch any remaining resources
+echo "Applying remaining resources..."
+if apply_terraform; then
+  echo "SUCCESS: Terraform apply of all modules completed successfully"
+else
+  echo "FAILED: Terraform apply of all modules failed"
+  exit 1
+fi
diff --git a/ai-ml/infrastructure/terraform/karpenter.tf b/ai-ml/infrastructure/terraform/karpenter.tf
new file mode 100644
index 000000000..e69de29bb
diff --git a/ai-ml/infrastructure/terraform/main.tf b/ai-ml/infrastructure/terraform/main.tf
new file mode 100644
index 000000000..938dc4b74
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/main.tf
@@ -0,0 +1,60 @@
+provider "aws" {
+  region = local.region
+}
+
+# ECR always authenticates with `us-east-1` region
+# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
+provider "aws" {
+  alias  = "ecr"
+  region = "us-east-1"
+}
+
+provider "kubernetes" {
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  token                  = data.aws_eks_cluster_auth.this.token
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = module.eks.cluster_endpoint
+    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+    token                  = data.aws_eks_cluster_auth.this.token
+  }
+}
+provider "kubectl" {
+  apply_retry_count      = 30
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  token                  = data.aws_eks_cluster_auth.this.token
+  load_config_file       = false
+}
+
+data "aws_eks_cluster_auth" "this" {
+  name = module.eks.cluster_name
+}
+
+data "aws_availability_zones" "available" {}
+
+data "aws_ecrpublic_authorization_token" "token" {
+  provider = aws.ecr
+}
+
+data "aws_caller_identity" "current" {}
+data "aws_partition" "current" {}
+
+locals {
+  name   = var.name
+  region = var.region
+  azs    = slice(data.aws_availability_zones.available.names, 0, 2)
+  partition =
data.aws_partition.current.partition + account_id = data.aws_caller_identity.current.account_id + mlflow_name = "mlflow" + mlflow_namespace = "mlflow" + mlflow_service_account = "mlflow" + + tags = { + Blueprint = local.name + GithubRepo = "github.com/awslabs/data-on-eks" + } +} diff --git a/ai-ml/infrastructure/terraform/mlflow-core.tf b/ai-ml/infrastructure/terraform/mlflow-core.tf new file mode 100644 index 000000000..55dafeff1 --- /dev/null +++ b/ai-ml/infrastructure/terraform/mlflow-core.tf @@ -0,0 +1,245 @@ +#--------------------------------------------------------------- +# RDS Postgres Database for MLflow Backend +#--------------------------------------------------------------- +module "db" { + count = var.enable_mlflow_tracking ? 1 : 0 + source = "terraform-aws-modules/rds/aws" + version = "~> 5.0" + + identifier = local.mlflow_name + + engine = "postgres" + engine_version = "14.13" + family = "postgres14" + major_engine_version = "14" + instance_class = "db.m6i.xlarge" + + storage_type = "io1" + allocated_storage = 100 + iops = 3000 + + db_name = local.mlflow_name + username = local.mlflow_name + create_random_password = false + password = sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string) + port = 5432 + + multi_az = true + db_subnet_group_name = module.vpc.database_subnet_group + vpc_security_group_ids = [module.security_group[0].security_group_id] + + maintenance_window = "Mon:00:00-Mon:03:00" + backup_window = "03:00-06:00" + enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] + create_cloudwatch_log_group = true + + backup_retention_period = 5 + skip_final_snapshot = true + deletion_protection = false + + performance_insights_enabled = true + performance_insights_retention_period = 7 + create_monitoring_role = true + monitoring_interval = 60 + monitoring_role_name = "mlflow-backend" + monitoring_role_use_name_prefix = true + monitoring_role_description = "MLflow Postgres Backend for monitoring role" + + parameters = [ + { 
+ name = "autovacuum" + value = 1 + }, + { + name = "client_encoding" + value = "utf8" + } + ] + + tags = local.tags +} + +#--------------------------------------------------------------- +# MLflow Postgres Backend DB Master password +#--------------------------------------------------------------- +resource "random_password" "postgres" { + count = var.enable_mlflow_tracking ? 1 : 0 + length = 16 + special = false +} +#tfsec:ignore:aws-ssm-secret-use-customer-key +resource "aws_secretsmanager_secret" "postgres" { + count = var.enable_mlflow_tracking ? 1 : 0 + name = local.mlflow_name + recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy +} + +resource "aws_secretsmanager_secret_version" "postgres" { + count = var.enable_mlflow_tracking ? 1 : 0 + secret_id = aws_secretsmanager_secret.postgres[0].id + secret_string = random_password.postgres[0].result +} + +#--------------------------------------------------------------- +# PostgreSQL RDS security group +#--------------------------------------------------------------- +module "security_group" { + count = var.enable_mlflow_tracking ? 1 : 0 + source = "terraform-aws-modules/security-group/aws" + version = "~> 5.0" + + name = local.name + description = "Complete PostgreSQL example security group" + vpc_id = module.vpc.vpc_id + + # ingress + ingress_with_cidr_blocks = [ + { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + description = "PostgreSQL access from within VPC" + cidr_blocks = "${module.vpc.vpc_cidr_block},${module.vpc.vpc_secondary_cidr_blocks[0]}" + }, + ] + + tags = local.tags +} + + +#--------------------------------------------------------------- +# S3 bucket for MLflow artifacts +#--------------------------------------------------------------- + +#tfsec:ignore:* +module "mlflow_s3_bucket" { + count = var.enable_mlflow_tracking ? 
1 : 0 + source = "terraform-aws-modules/s3-bucket/aws" + version = "~> 3.0" + + bucket_prefix = "${local.name}-artifacts-" + + # For example only - please evaluate for your environment + force_destroy = true + + server_side_encryption_configuration = { + rule = { + apply_server_side_encryption_by_default = { + sse_algorithm = "AES256" + } + } + } + + tags = local.tags +} + +#--------------------------------------------------------------- +# MLflow Namespace +#--------------------------------------------------------------- +resource "kubernetes_namespace_v1" "mlflow" { + count = var.enable_mlflow_tracking ? 1 : 0 + metadata { + name = local.mlflow_namespace + } + timeouts { + delete = "15m" + } +} + +resource "kubernetes_service_account_v1" "mlflow" { + count = var.enable_mlflow_tracking ? 1 : 0 + metadata { + name = local.mlflow_service_account + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + annotations = { "eks.amazonaws.com/role-arn" : module.mlflow_irsa[0].iam_role_arn } + } + + automount_service_account_token = true +} + +resource "kubernetes_secret_v1" "mlflow" { + count = var.enable_mlflow_tracking ? 1 : 0 + metadata { + name = "${local.mlflow_service_account}-secret" + namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account_v1.mlflow[0].metadata[0].name + "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.mlflow[0].metadata[0].name + } + } + + type = "kubernetes.io/service-account-token" +} + +# Create IAM Role for Service Account (IRSA) Only if MLflow is enabled +module "mlflow_irsa" { + count = var.enable_mlflow_tracking ? 
1 : 0
+
+  source  = "aws-ia/eks-blueprints-addon/aws"
+  version = "~> 1.0" #ensure to update this to the latest/desired version
+
+  # Disable helm release
+  create_release = false
+
+  # IAM role for service account (IRSA)
+  create_role   = true
+  create_policy = false # Policy is created in the next resource
+
+  role_name     = local.mlflow_service_account
+  role_policies = { mlflow_policy = aws_iam_policy.mlflow[0].arn }
+
+  oidc_providers = {
+    this = {
+      provider_arn    = module.eks.oidc_provider_arn
+      namespace       = kubernetes_namespace_v1.mlflow[0].metadata[0].name
+      service_account = local.mlflow_service_account
+    }
+  }
+
+  tags = local.tags
+}
+
+#--------------------------------------------------------------------------
+# IAM policy for MLflow for accessing S3 artifacts and RDS Postgres backend
+#--------------------------------------------------------------------------
+resource "aws_iam_policy" "mlflow" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+
+  description = "IAM policy for MLflow"
+  name_prefix = format("%s-%s-", local.name, "mlflow")
+  path        = "/"
+  policy      = data.aws_iam_policy_document.mlflow[0].json
+}
+
+data "aws_iam_policy_document" "mlflow" {
+  count = var.enable_mlflow_tracking ? 1 : 0
+  statement { # List the artifact bucket itself
+    sid       = ""
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}"]
+
+    actions = [
+      "s3:ListBucket"
+    ]
+  }
+  statement { # Read, write and delete objects inside the artifact bucket
+    sid       = ""
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}/*"]
+
+    actions = [
+      "s3:GetObject",
+      "s3:PutObject",
+      "s3:DeleteObject"
+    ]
+  }
+  statement { # rds-db ARNs are keyed by the instance's DbiResourceId (db-XXXX), not by the database name
+    sid       = ""
+    effect    = "Allow"
+    resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_resource_id}/${local.mlflow_name}"]
+
+    actions = [
+      "rds-db:connect",
+    ]
+  }
+}
diff --git a/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
new file mode 100644
index 000000000..c3ffe67d3
--- /dev/null
+++ b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml
@@ -0,0 +1,82 @@
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: "dcgm-exporter" + namespace: kube-system + labels: + app.kubernetes.io/name: "dcgm-exporter" + app.kubernetes.io/version: "3.6.1" +spec: + updateStrategy: + type: RollingUpdate + selector: + matchLabels: + app.kubernetes.io/name: "dcgm-exporter" + app.kubernetes.io/version: "3.6.1" + template: + metadata: + labels: + app.kubernetes.io/name: "dcgm-exporter" + app.kubernetes.io/version: "3.6.1" + name: "dcgm-exporter" + spec: + containers: + - image: "nvcr.io/nvidia/k8s/dcgm-exporter:3.3.9-3.6.1-ubuntu22.04" + env: + - name: "DCGM_EXPORTER_LISTEN" + value: ":9400" + - name: "DCGM_EXPORTER_KUBERNETES" + value: "true" + name: "dcgm-exporter" + ports: + - name: "metrics" + containerPort: 9400 + securityContext: + runAsNonRoot: false + runAsUser: 0 + capabilities: + add: ["SYS_ADMIN"] + volumeMounts: + - name: "pod-gpu-resources" + readOnly: true + mountPath: "/var/lib/kubelet/pod-resources" + volumes: + - name: "pod-gpu-resources" + hostPath: + path: "/var/lib/kubelet/pod-resources" + nodeSelector: + accelerator: nvidia + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule +--- +kind: Service +apiVersion: v1 +metadata: + name: "dcgm-exporter" + namespace: kube-system + labels: + app.kubernetes.io/name: "dcgm-exporter" + app.kubernetes.io/version: "3.6.1" +spec: + selector: + app.kubernetes.io/name: "dcgm-exporter" + app.kubernetes.io/version: "3.6.1" + ports: + - name: "metrics" + port: 9400 diff --git a/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml new file mode 100644 index 000000000..2ed065546 --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: neuron-monitor + namespace: kube-system + labels: + app: neuron-monitor + version: v1 +spec: + selector: + matchLabels: + app: 
neuron-monitor + template: + metadata: + labels: + app: neuron-monitor + version: v1 + spec: + containers: + - name: neuron-monitor + image: public.ecr.aws/neuron/neuron-monitor:1.1.0 + ports: + - containerPort: 8000 + command: + - "/opt/bin/entrypoint.sh" + args: + - "--port" + - "8000" + resources: + limits: + cpu: 500m + memory: 256Mi + requests: + cpu: 256m + memory: 128Mi + env: + - name: GOMEMLIMIT + value: 160MiB + securityContext: + privileged: true + nodeSelector: + accelerator: neuron diff --git a/ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml b/ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml new file mode 100644 index 000000000..8ade99739 --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/podMonitor.yaml @@ -0,0 +1,21 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: ray-workers-monitor + namespace: kube-prometheus-stack + labels: + # `release: $HELM_RELEASE`: Prometheus can only detect PodMonitor with this label. + release: kube-prometheus-stack +spec: + jobLabel: ray-workers + # Only select Kubernetes Pods in the "rayserve-vllm" namespace. + namespaceSelector: + matchNames: + - rayserve-vllm + # Only select Kubernetes Pods with "matchLabels". + selector: + matchLabels: + ray.io/node-type: worker + # A list of endpoints allowed as part of this PodMonitor.
+ podMetricsEndpoints: + - port: metrics diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json new file mode 100644 index 000000000..26d11b3f1 --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json @@ -0,0 +1,4535 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": 
"MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Spilled", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount allocated by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + 
"$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Allocated", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount freed by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": 
false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Freed: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Freed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Amount of memory store used by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 
1 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Current Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Object Store Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical CPUs allocated to dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, 
+ "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical GPUs allocated to dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 2 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPUs (logical slots)", + "tooltip": 
{ + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total bytes outputted by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by 
(dataset, operator)", + "interval": "", + "legendFormat": "Bytes Outputted: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Outputted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Total rows outputted by dataset operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + 
PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Rows Outputted: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows Outputted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "rows", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of input blocks received by operator.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + 
"$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Received: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Blocks Received by Operator", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks received by operator.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": 
"current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Received: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Blocks Received by Operator", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of input blocks that operator's tasks have finished processing.", + "fieldConfig": { + "defaults": {}, + 
"overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Processed: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Blocks Processed by Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks that operator's tasks have finished processing.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Processed: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Bytes Processed by Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks passed to submitted tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 21, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": 
"Bytes Submitted: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Input Bytes Submitted to Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of output blocks generated by tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": 
"#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Generated: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Blocks Generated by Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks generated by tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": 
"object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Generated: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes Generated by Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of rows in generated output blocks from finished tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": 
"current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Rows Generated: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows Generated by Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "rows", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of output blocks that are already taken by downstream operators.", + "fieldConfig": { + "defaults": {}, + "overrides": 
[] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Blocks Taken: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Output Blocks Taken by Downstream Operators", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, 
+ "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks that are already taken by downstream operators.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Taken: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Output Bytes Taken by Downstream Operators", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + 
"buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of submitted tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 29, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}", + 
"queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Submitted Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of running tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, 
+ "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Running Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Running Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of tasks that already have output.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + 
"$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Tasks with output blocks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of finished tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Finished Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of failed tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + 
"hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Failed Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent generating blocks 
in tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Block Generation Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Time spent in task submission backpressure.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Submission Backpressure Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, 
+ "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of blocks in operator's internal input queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + 
"interval": "", + "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Inqueue Size (Blocks)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of input blocks in the operator's internal input queue.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": 
"object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Inqueue Size (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of blocks in operator's internal output queue", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, 
+ "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Outqueue Size (Blocks)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "blocks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of output blocks in the operator's internal output queue.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + 
"hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Operator Internal Outqueue Size (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + 
"description": "Byte size of input blocks used by pending tasks.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Size of Blocks used in Pending Tasks (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + 
"show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of freed memory in object store.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Freed Memory in Object Store (Bytes)", + "tooltip": { + 
"shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Byte size of spilled memory in object store.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", + "interval": "", + "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Spilled Memory in Object Store (Bytes)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds spent in iterator initialization code", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}, {{operator}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration Initialization Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds user thread is blocked by iter_batches()", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + 
"percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration Blocked Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Seconds spent in user code", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + 
"max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", + "interval": "", + "legendFormat": "Seconds: {{dataset}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Iteration User Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.24.0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a 
specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", + "description": "Filter queries to specific ray sessions.", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "SessionName", + "options": [], + "query": { + "query": "label_values(ray_data_allocated_bytes{}, SessionName)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_data_allocated_bytes{}, dataset)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "DatasetID", + "options": [], + "query": { + "query": "label_values(ray_data_allocated_bytes{}, dataset)", + "refId": "Prometheus-Dataset-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Data Dashboard", + "uid": "rayDataDashboard", + "version": 1 +} diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json new file 
mode 100644 index 000000000..7814395f5 --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json @@ -0,0 +1,2836 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + 
"steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", + "interval": "", + "legendFormat": "{{State}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", + "interval": "", + "legendFormat": "{{State}} (retry)", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduler Task State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current number of (live) tasks with a particular name. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", + "interval": "", + "legendFormat": "{{Name}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", + "interval": "", + "legendFormat": "{{Name}} (retry)", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Active Tasks by Name", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_actors{SessionName=~\"$SessionName\",}) by (State)", + "interval": "", + "legendFormat": "{{State}}", + "queryType": 
"randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduler Actor State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "actors", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current number of (live) actors with a particular name.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + 
"spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=~\"$SessionName\",}) by (Name)", + "interval": "", + "legendFormat": "{{Name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Active Actors by Name", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "actors", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. 
Ray's logical CPU is allocated by `num_cpus` arguments.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 2 + }, + "hiddenSeries": false, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=~\"$SessionName\",}) by (instance)", + "interval": "", + "legendFormat": "CPU Usage: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_resources{Name=\"CPU\",SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + 
(sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)))", + "interval": "", + "legendFormat": "MAX + PENDING", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduler CPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. 
Refer to metric_defs.cc for more information.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 2 + }, + "hiddenSeries": false, + "id": 29, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) by (Location)", + "interval": "", + "legendFormat": "{{Location}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Object Store Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": 
"object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=~\"$SessionName\",}", + 
"interval": "", + "legendFormat": "GPU Usage: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_resources{Name=\"GPU\",SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)))", + "interval": "", + "legendFormat": "MAX + PENDING", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduler GPUs (logical slots)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "GPUs", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + 
"fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",}) by (State)", + "interval": "", + "legendFormat": "{{State}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduler Placement Groups", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "placement groups", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", + "interval": "", + "legendFormat": "CPU Usage: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node CPU (hardware utilization)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + 
}, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", + "interval": "", + "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gpus_available{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node GPU (hardware utilization)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "GPUs", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. 
", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", + "interval": "", + "legendFormat": "Disk Used: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Disk", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + 
"format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Disk IO per node.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "hiddenSeries": false, + "id": 32, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", + "interval": "", + "legendFormat": "Write: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", + "interval": "", + 
"legendFormat": "Read: {{instance}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Disk IO Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. 
Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", + "interval": "", + "legendFormat": "Memory Used: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Memory (heap + object store)", + "tooltip": { + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "hiddenSeries": false, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + 
"steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", + "interval": "", + "legendFormat": "OOM Killed: {{Name}}, {{instance}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Out of Memory Failures by Name", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "failures", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "(sum(ray_component_rss_mb{SessionName=~\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=~\"$SessionName\",}) by (Component))", + "interval": "", + "legendFormat": "{{Component}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_shared_bytes{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "shared_memory", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": 
"randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Memory by Component", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + 
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_component_cpu_percentage{SessionName=~\"$SessionName\",}) by (Component) / 100", + "interval": "", + "legendFormat": "{{Component}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node CPU by Component", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "cores", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The physical (hardware) GPU memory usage for each node. 
The dotted line means the total amount of GPU memory from the cluster.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * 1024 * 1024", + "interval": "", + "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "(sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 1024 * 1024", + "interval": "", + "legendFormat": "MAX", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node GPU Memory (GRAM)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", 
+ "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Network speed per node", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", + "interval": "", + "legendFormat": "Recv: {{instance}}", + "queryType": "randomWalk", + "refId": "A" 
+ }, + { + "exemplar": true, + "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", + "interval": "", + "legendFormat": "Send: {{instance}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Network", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The total number of active, failed, and pending nodes in the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provisioned (e.g., there's no available node from the cloud provider) or fails to set up (e.g., setup_commands have errors). \n\nPENDING: A node is being started by the Ray cluster launcher. 
The node is unavailable now because it is being provisioned and initialized.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", + "interval": "", + "legendFormat": "Active Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", + "interval": "", + "legendFormat": "Failed Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", + "interval": "", + "legendFormat": "Pending Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": 
[], + "timeShift": null, + "title": "Node Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "nodes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, etc.) across the cluster.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "hiddenSeries": false, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + 
"targets": [ + { + "exemplar": true, + "expr": "avg(ray_node_cpu_utilization{SessionName=~\"$SessionName\",})", + "interval": "", + "legendFormat": "CPU (physical)", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gpus_utilization{SessionName=~\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=~\"$SessionName\",}) or vector(0))", + "interval": "", + "legendFormat": "GPU (physical)", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=~\"$SessionName\",})) * 100", + "interval": "", + "legendFormat": "Memory (RAM)", + "queryType": "randomWalk", + "refId": "C" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gram_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 100", + "interval": "", + "legendFormat": "GRAM", + "queryType": "randomWalk", + "refId": "D" + }, + { + "exemplar": true, + "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",}) * 100", + "interval": "", + "legendFormat": "Object Store Memory", + "queryType": "randomWalk", + "refId": "E" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_usage{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})) * 100", + "interval": "", + "legendFormat": "Disk", + "queryType": "randomWalk", + "refId": "F" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": 
null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.24.0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_node_network_receive_speed{}, SessionName)", + "description": "Filter queries to specific ray sessions.", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "SessionName", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{}, SessionName)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Instance", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, 
instance)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Default Dashboard", + "uid": "rayDefaultDashboard", + "version": 4, + "rayMeta": [ + "supportsGlobalFilterOverride" + ] +} diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json new file mode 100644 index 000000000..8648e308a --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json @@ -0,0 +1,2115 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of replicas per deployment. 
Ignores \"Route\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Replicas per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "replicas", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": 
"object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "QPS for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_deployment_request_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "QPS per replica", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Error QPS for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 0, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(rate(ray_serve_deployment_error_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error QPS per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P50 latency per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 1, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", 
+ "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P50 latency per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P90 latency per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 1, + 
"w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P90 latency per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { 
+ "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 1, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) 
by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of requests queued per deployment. 
Ignores \"Replica\" and \"Route\" variables.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 2, + "w": 12, + "h": 8 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue size per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Current running requests for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 12, + "y": 2, + "w": 12, + "h": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Running requests per replica", + "tooltip": { + "shared": true, 
+ "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of multiplexed models for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed models per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "models", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of times multiplexed models were loaded on each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": 
"object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model loads per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "times", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of times multiplexed models were unloaded on each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + 
"linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_multiplexed_models_unload_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model unloads per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "times", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency of multiplexed model load per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y":
4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency of multiplexed model loads per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency of multiplexed model unload per replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency of multiplexed model unloads per replica", + "tooltip": { + "shared": true, + "sort": 0, +
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The ids of multiplexed models for each replica.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}", + "interval": "", + "legendFormat": "{{replica}}:{{model_id}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model ids per replica", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "model", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The cache hit rate of multiplexed models for the deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 5, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])))", + "interval": "", + "legendFormat": "{{replica}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Multiplexed model cache hit rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.24.0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries to specific prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": 
"${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Deployment", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Replica", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + 
"skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Route", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Serve Deployment Dashboard", + "uid": "rayServeDeploymentDashboard", + "version": 1 +} diff --git a/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json new file mode 100644 index 000000000..4d1ec6e8e --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json @@ -0,0 +1,3098 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Aggregated 
utilization of all physical resources (CPU, GPU, memory, disk, etc.) across the cluster. Ignores application variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 0, + "w": 12, + "h": 8 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "avg(ray_node_cpu_utilization{})", + "interval": "", + "legendFormat": "CPU (physical)", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gpus_utilization{}) / on() (sum(autoscaler_cluster_resources{resource='GPU',}) or vector(0))", + "interval": "", + "legendFormat": "GPU (physical)", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(ray_node_mem_used{}) / on() (sum(ray_node_mem_total{})) * 100", + "interval": "", + "legendFormat": "Memory (RAM)", + "queryType": "randomWalk", + "refId": "C" + }, + { + "exemplar": true, + "expr": "sum(ray_node_gram_used{}) / on()
(sum(ray_node_gram_available{}) + sum(ray_node_gram_used{})) * 100", + "interval": "", + "legendFormat": "GRAM", + "queryType": "randomWalk", + "refId": "D" + }, + { + "exemplar": true, + "expr": "sum(ray_object_store_memory{}) / on() sum(ray_resources{Name=\"object_store_memory\",}) * 100", + "interval": "", + "legendFormat": "Object Store Memory", + "queryType": "randomWalk", + "refId": "E" + }, + { + "exemplar": true, + "expr": "sum(ray_node_disk_usage{}) / on() (sum(ray_node_disk_free{}) + sum(ray_node_disk_usage{})) * 100", + "interval": "", + "legendFormat": "Disk", + "queryType": "randomWalk", + "refId": "F" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "%", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "QPS for each selected application.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 12, + "y": 0, + "w": 12, + "h": 8 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_http_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_grpc_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "QPS per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${datasource}", + "description": "Error QPS for each selected application.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 1, + "w": 12, + "h": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error QPS per 
application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Error QPS for each selected application, broken down by error code.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 12, + "y": 1, + "w": 12, + "h": 8 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr":
"sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, error_code)", + "interval": "", + "legendFormat": "{{application, route, error_code}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, error_code)", + "interval": "", + "legendFormat": "{{application, method, error_code}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error QPS per application per error code", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P50 latency for selected applications.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + 
"nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P50 latency per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + 
"$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P90 latency for selected applications.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + 
"refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P90 latency per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency for selected applications.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 2, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + 
"lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", + "interval": "", + "legendFormat": "{{application, route}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", + "interval": "", + "legendFormat": "{{application, method}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency per application", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of replicas per deployment. Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_replica_healthy{}) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + 
], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Replicas per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "replicas", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "QPS for each deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ 
+ { + "exemplar": true, + "expr": "sum(rate(ray_serve_deployment_request_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "QPS per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Error QPS for each deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 3, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(ray_serve_deployment_error_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Error QPS per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "qps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P50 latency per deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": 
false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P50 latency per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P90 
latency per deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P90 latency per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + 
}, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "P99 latency per deployment.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 4, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, 
deployment, le))", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", + "interval": "", + "legendFormat": "Total", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 latency per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of requests queued per deployment. 
Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 5, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_serve_deployment_queued_queries{}) by (application, deployment)", + "interval": "", + "legendFormat": "{{application, deployment}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue size per deployment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Number of nodes in this cluster. Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 5, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(autoscaler_active_nodes{}) by (NodeType)", + "interval": "", + "legendFormat": "Active Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_recently_failed_nodes{}) by (NodeType)", + "interval": "", + "legendFormat": "Failed Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(autoscaler_pending_nodes{}) by (NodeType)", + "interval": "", + "legendFormat": 
"Pending Nodes: {{NodeType}}", + "queryType": "randomWalk", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "nodes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "Network speed per node. Ignores \"Application\" variable.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 5, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + 
"stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(ray_node_network_receive_speed{}) by (instance)", + "interval": "", + "legendFormat": "Recv: {{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(ray_node_network_send_speed{}) by (instance)", + "interval": "", + "legendFormat": "Send: {{instance}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node network", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of ongoing requests in the HTTP Proxy.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 6, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 20, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_ongoing_http_requests{}", + "interval": "", + "legendFormat": "Ongoing HTTP Requests", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ongoing HTTP Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of ongoing requests in the gRPC Proxy.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 6, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 21, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": 
true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_ongoing_grpc_requests{}", + "interval": "", + "legendFormat": "Ongoing gRPC Requests", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Ongoing gRPC Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "requests", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of request scheduling tasks in the router.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 6, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + 
"avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_scheduling_tasks{}", + "interval": "", + "legendFormat": "Scheduling Tasks", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduling Tasks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of request scheduling tasks in the router that are undergoing backoff.", + 
"fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 0, + "y": 7, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_num_scheduling_tasks_in_backoff{}", + "interval": "", + "legendFormat": "Scheduling Tasks in Backoff", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduling Tasks in Backoff", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "tasks", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": 
false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The duration of the last control loop.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 8, + "y": 7, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_controller_control_loop_duration_s{}", + "interval": "", + "legendFormat": "Control Loop Duration", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Controller Control Loop Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "seconds", + "label": "", + "logBase": 
1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${datasource}", + "description": "The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 10, + "fillGradient": 0, + "gridPos": { + "x": 16, + "y": 7, + "w": 8, + "h": 8 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": true, + "color": "#1F60C4", + "fill": 0, + "stack": false + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": true + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": true, + "color": "#777777", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "ray_serve_controller_num_control_loops{}", + "interval": "", + "legendFormat": "Control Loops", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Number of Control Loops", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "loops", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [ + "rayVersion:2.24.0" + ], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries of a specific Prometheus type.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_num_http_requests_total{}, route)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "HTTP Route", + 
"multi": true, + "name": "HTTP_Route", + "options": [], + "query": { + "query": "label_values(ray_serve_num_http_requests_total{}, route)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_num_grpc_requests{}, method)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "gRPC Service Method", + "multi": true, + "name": "gRPC_Method", + "options": [], + "query": { + "query": "label_values(ray_serve_num_grpc_requests{}, method)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": [ + "excludesSystemRoutes", + "supportsGlobalFilterOverride" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Serve Dashboard", + "uid": "rayServeDashboard", + "version": 1 +} diff --git a/ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml new file mode 100644 index 000000000..dbda70c40 --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor.yaml @@ -0,0 +1,25 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: ray-head-monitor + namespace: kube-prometheus-stack + labels: + # `release: $HELM_RELEASE`: Prometheus can only detect ServiceMonitor with this label. + release: kube-prometheus-stack +spec: + jobLabel: ray-head + # Only select Kubernetes Services in the "rayserve-vllm" namespace.
+ namespaceSelector: + matchNames: + - rayserve-vllm + # Only select Kubernetes Services with "matchLabels". + selector: + matchLabels: + ray.io/node-type: head + # A list of endpoints allowed as part of this ServiceMonitor. + endpoints: + - port: metrics + - port: as-metrics # autoscaler metrics + - port: dash-metrics # dashboard metrics + targetLabels: + - ray.io/cluster diff --git a/ai-ml/infrastructure/terraform/outputs.tf b/ai-ml/infrastructure/terraform/outputs.tf new file mode 100644 index 000000000..5771ae141 --- /dev/null +++ b/ai-ml/infrastructure/terraform/outputs.tf @@ -0,0 +1,9 @@ +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}" +} + +output "grafana_secret_name" { + description = "The name of the secret containing the Grafana admin password." + value = aws_secretsmanager_secret.grafana.name +} diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf new file mode 100644 index 000000000..465f9c152 --- /dev/null +++ b/ai-ml/infrastructure/terraform/variables.tf @@ -0,0 +1,107 @@ +variable "name" { + description = "Name of the VPC and EKS Cluster" + default = "ai-stack" + type = string +} + +variable "region" { + description = "region" + default = "us-east-1" + type = string +} + +variable "eks_cluster_version" { + description = "EKS Cluster version" + default = "1.30" + type = string +} + +# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs +variable "vpc_cidr" { + description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range" + default = "10.1.0.0/21" + type = string +} + +# RFC6598 range 100.64.0.0/10 +# Note you can only add a /16 range to the VPC.
You can add multiples of /16 if required +variable "secondary_cidr_blocks" { + description = "Secondary CIDR blocks to be attached to VPC" + default = ["100.64.0.0/16"] + type = list(string) +} + +# Infrastructure Variables +variable "enable_aws_cloudwatch_metrics" { + description = "Enable AWS Cloudwatch Metrics addon" + type = bool + default = true +} +variable "bottlerocket_data_disk_snapshot_id" { + description = "Bottlerocket Data Disk Snapshot ID" + type = string + default = "" +} +variable "enable_aws_efa_k8s_device_plugin" { + description = "Enable AWS EFA K8s Device Plugin" + type = bool + default = false +} +variable "enable_aws_fsx_csi_driver"{ + description = "Whether or not to deploy the Fsx Driver" + type = bool + default = false +} +variable "deploy_fsx_volume" { + description = "Whether or not to deploy the example Fsx Volume" + type = bool + default = false +} + +# Addon Variables +variable "enable_kube_prometheus_stack" { + description = "Enable Kube Prometheus addon" + type = bool + default = false +} +variable "enable_kubecost" { + description = "Enable Kubecost addon" + type = bool + default = false +} +variable "enable_argo_workflows" { + description = "Enable Argo Workflows addon" + type = bool + default = false +} +variable "enable_argo_events" { + description = "Enable Argo Events addon" + type = bool + default = false +} +variable "enable_mlflow_tracking" { + description = "Enable MLFlow Tracking" + type = bool + default = false +} +variable "enable_jupyterhub" { + description = "Enable JupyterHub" + type = bool + default = false +} +variable "enable_volcano" { + description = "Enable Volcano" + type = bool + default = false +} +variable "enable_kuberay_operator" { + description = "Enable KubeRay Operator" + type = bool + default = true +} +variable "huggingface_token" { + description = "Hugging Face Secret Token" + type = string + default = "DUMMY_TOKEN_REPLACE_ME" + sensitive = true +} diff --git 
a/ai-ml/infrastructure/terraform/versions.tf b/ai-ml/infrastructure/terraform/versions.tf new file mode 100644 index 000000000..e24e99c1f --- /dev/null +++ b/ai-ml/infrastructure/terraform/versions.tf @@ -0,0 +1,33 @@ +terraform { + required_version = ">= 1.0.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 3.72" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.4.1" + } + kubectl = { + source = "gavinbunney/kubectl" + version = ">= 1.14" + } + random = { + source = "hashicorp/random" + version = ">= 3.6.0" # Replace with the appropriate version of the random provider + } + } + + # ## Used for end-to-end testing on project; update to suit your needs + # backend "s3" { + # bucket = "doeks-github-actions-e2e-test-state" + # region = "us-west-2" + # key = "e2e/jark/terraform.tfstate" + # } +} diff --git a/ai-ml/infrastructure/terraform/vpc.tf b/ai-ml/infrastructure/terraform/vpc.tf new file mode 100644 index 000000000..b12f63d59 --- /dev/null +++ b/ai-ml/infrastructure/terraform/vpc.tf @@ -0,0 +1,62 @@ +locals { + # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-5 = 251 usable IPs per subnet/AZ + private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] + # Routable Public subnets with NAT Gateway and Internet Gateway + # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-5 = 59 usable IPs per subnet/AZ + public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] + + database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)] + + # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods + # e.g., var.secondary_cidr_blocks =
"100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-5 = 32763 usable IPs per subnet/AZ + secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] +} + +#--------------------------------------------------------------- +# VPC +#--------------------------------------------------------------- +# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. +# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = local.name + cidr = var.vpc_cidr + azs = local.azs + + # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods + secondary_cidr_blocks = var.secondary_cidr_blocks + + # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods + # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc.
+ private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) + + # ------------------------------ + # Private Subnets for MLflow backend store + database_subnets = local.database_private_subnets + create_database_subnet_group = true + create_database_subnet_route_table = true + + # ------------------------------ + # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments + # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW + public_subnets = local.public_subnets + enable_nat_gateway = true + single_nat_gateway = true + #------------------------------- + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + # Tags subnets for Karpenter auto-discovery + "karpenter.sh/discovery" = local.name + } + + tags = local.tags +} From a148ef574f1271dea1172a8748f3ae0e9380e1c5 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Tue, 28 Jan 2025 14:42:17 -0800 Subject: [PATCH 02/16] split dcgm and enable volcano to fix kuberay startup --- ai-ml/bionemo/addons.tf | 93 - ai-ml/bionemo/cleanup.sh | 45 - ai-ml/bionemo/eks.tf | 145 - ai-ml/bionemo/fsx-for-lustre.tf | 136 - .../fsx-for-lustre/fsxlustre-static-pv.yaml | 21 - .../fsx-for-lustre/fsxlustre-static-pvc.yaml | 12 - .../fsxlustre-storage-class.yaml | 9 - ai-ml/bionemo/main.tf | 53 - ai-ml/bionemo/outputs.tf | 9 - ai-ml/bionemo/versions.tf | 29 - ai-ml/bionemo/vpc.tf | 57 - ai-ml/infrastructure/terraform/addons.tf | 4 + ai-ml/infrastructure/terraform/variables.tf | 2 +- ai-ml/jark-stack/terraform/addons.tf | 460 -- ai-ml/jark-stack/terraform/eks.tf | 212 - .../helm-values/argo-events-values.yaml | 4 - .../helm-values/argo-workflows-values.yaml | 5 - .../aws-cloudwatch-metrics-values.yaml | 11 - .../aws-efa-k8s-device-plugin-values.yaml | 5 - .../helm-values/ingress-nginx-values.yaml | 11 - .../helm-values/jupyterhub-values.yaml | 59 - 
.../helm-values/kube-prometheus.yaml | 48 - .../helm-values/kubecost-values.yaml | 69 - ai-ml/jark-stack/terraform/karpenter.tf | 0 ai-ml/jark-stack/terraform/main.tf | 51 - .../terraform/monitoring/podMonitor.yaml | 21 - .../data_grafana_dashboard.json | 4535 ----------------- .../default_grafana_dashboard.json | 2836 ----------- .../serve_deployment_grafana_dashboard.json | 2115 -------- .../serve_grafana_dashboard.json | 3098 ----------- .../terraform/monitoring/serviceMonitor.yaml | 25 - ai-ml/jark-stack/terraform/outputs.tf | 9 - ai-ml/jark-stack/terraform/versions.tf | 33 - ai-ml/jark-stack/terraform/vpc.tf | 53 - ai-ml/mlflow/addons.tf | 431 -- ai-ml/mlflow/amp.tf | 136 - ai-ml/mlflow/eks.tf | 118 - .../helm-values/aws-for-fluentbit-values.yaml | 102 - .../cluster-autoscaler-values.yaml | 25 - .../coredns-autoscaler-values.yaml | 40 - .../helm-values/ingress-nginx-values.yaml | 11 - .../kube-prometheus-amp-enable.yaml | 65 - ai-ml/mlflow/helm-values/kube-prometheus.yaml | 36 - .../helm-values/metrics-server-values.yaml | 52 - .../helm-values/mlflow-tracking-values.yaml | 88 - ai-ml/mlflow/helm-values/nvidia-values.yaml | 97 - .../00-karpenter-provisioner-cpu.yaml | 57 - ai-ml/mlflow/main.tf | 65 - ai-ml/mlflow/mlflow-core.tf | 245 - ai-ml/mlflow/outputs.tf | 24 - ai-ml/mlflow/versions.tf | 33 - ai-ml/mlflow/vpc.tf | 59 - ai-ml/trainium-inferentia/addons.tf | 536 -- ai-ml/trainium-inferentia/eks.tf | 410 -- .../elastic-cache-redis.tf | 57 - ai-ml/trainium-inferentia/fsx-for-lustre.tf | 118 - .../aws-cloudwatch-metrics-values.yaml | 11 - .../helm-values/aws-for-fluentbit-values.yaml | 102 - .../cluster-autoscaler-values.yaml | 15 - .../helm-values/ingress-nginx-values.yaml | 11 - .../helm-values/jupyterhub-values.yaml | 139 - .../helm-values/kube-prometheus.yaml | 23 - .../helm-values/metrics-server-values.yaml | 52 - ai-ml/trainium-inferentia/jupyterhub.tf | 181 - ai-ml/trainium-inferentia/main.tf | 75 - ai-ml/trainium-inferentia/outputs.tf | 9 - 
ai-ml/trainium-inferentia/versions.tf | 37 - ai-ml/trainium-inferentia/vpc.tf | 53 - 68 files changed, 5 insertions(+), 17783 deletions(-) delete mode 100644 ai-ml/bionemo/addons.tf delete mode 100755 ai-ml/bionemo/cleanup.sh delete mode 100644 ai-ml/bionemo/eks.tf delete mode 100644 ai-ml/bionemo/fsx-for-lustre.tf delete mode 100644 ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml delete mode 100644 ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml delete mode 100644 ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml delete mode 100644 ai-ml/bionemo/main.tf delete mode 100644 ai-ml/bionemo/outputs.tf delete mode 100644 ai-ml/bionemo/versions.tf delete mode 100644 ai-ml/bionemo/vpc.tf delete mode 100644 ai-ml/jark-stack/terraform/addons.tf delete mode 100644 ai-ml/jark-stack/terraform/eks.tf delete mode 100644 ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml delete mode 100644 ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml delete mode 100644 ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml delete mode 100644 ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml delete mode 100644 ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml delete mode 100644 ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml delete mode 100644 ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml delete mode 100644 ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml delete mode 100644 ai-ml/jark-stack/terraform/karpenter.tf delete mode 100644 ai-ml/jark-stack/terraform/main.tf delete mode 100644 ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json 
delete mode 100644 ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json delete mode 100644 ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml delete mode 100644 ai-ml/jark-stack/terraform/outputs.tf delete mode 100644 ai-ml/jark-stack/terraform/versions.tf delete mode 100644 ai-ml/jark-stack/terraform/vpc.tf delete mode 100644 ai-ml/mlflow/addons.tf delete mode 100644 ai-ml/mlflow/amp.tf delete mode 100644 ai-ml/mlflow/eks.tf delete mode 100644 ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml delete mode 100644 ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml delete mode 100644 ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml delete mode 100644 ai-ml/mlflow/helm-values/ingress-nginx-values.yaml delete mode 100644 ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml delete mode 100644 ai-ml/mlflow/helm-values/kube-prometheus.yaml delete mode 100644 ai-ml/mlflow/helm-values/metrics-server-values.yaml delete mode 100644 ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml delete mode 100644 ai-ml/mlflow/helm-values/nvidia-values.yaml delete mode 100644 ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml delete mode 100644 ai-ml/mlflow/main.tf delete mode 100644 ai-ml/mlflow/mlflow-core.tf delete mode 100644 ai-ml/mlflow/outputs.tf delete mode 100644 ai-ml/mlflow/versions.tf delete mode 100644 ai-ml/mlflow/vpc.tf delete mode 100644 ai-ml/trainium-inferentia/addons.tf delete mode 100644 ai-ml/trainium-inferentia/eks.tf delete mode 100644 ai-ml/trainium-inferentia/elastic-cache-redis.tf delete mode 100644 ai-ml/trainium-inferentia/fsx-for-lustre.tf delete mode 100644 ai-ml/trainium-inferentia/helm-values/aws-cloudwatch-metrics-values.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/aws-for-fluentbit-values.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml 
delete mode 100644 ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml delete mode 100644 ai-ml/trainium-inferentia/jupyterhub.tf delete mode 100755 ai-ml/trainium-inferentia/main.tf delete mode 100755 ai-ml/trainium-inferentia/outputs.tf delete mode 100755 ai-ml/trainium-inferentia/versions.tf delete mode 100755 ai-ml/trainium-inferentia/vpc.tf diff --git a/ai-ml/bionemo/addons.tf b/ai-ml/bionemo/addons.tf deleted file mode 100644 index 6b47a5ffe..000000000 --- a/ai-ml/bionemo/addons.tf +++ /dev/null @@ -1,93 +0,0 @@ -#--------------------------------------------------------------- -# EKS Blueprints Kubernetes Addons -#--------------------------------------------------------------- -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.3" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - #--------------------------------------- - # Amazon EKS Managed Add-ons - #--------------------------------------- - eks_addons = { - coredns = { - preserve = true - } - vpc-cni = { - preserve = true - } - kube-proxy = { - preserve = true - } - amazon-cloudwatch-observability = { - preserve = true - service_account_role_arn = aws_iam_role.cloudwatch_observability_role.arn - } - } - - #--------------------------------------- - # ALB Controller - #--------------------------------------- - enable_aws_load_balancer_controller = true - - #--------------------------------------- - # Kubernetes Metrics Server - #--------------------------------------- - enable_metrics_server = true - - - #--------------------------------------- - # Enable FSx for Lustre CSI Driver - #--------------------------------------- - enable_aws_fsx_csi_driver = 
true - - tags = local.tags - -} - -#--------------------------------------------------------------- -# Data on EKS Kubernetes Addons -#--------------------------------------------------------------- -module "eks_data_addons" { - source = "aws-ia/eks-data-addons/aws" - version = "~> 1.30" # ensure to update this to the latest/desired version - - oidc_provider_arn = module.eks.oidc_provider_arn - enable_nvidia_device_plugin = true - -} - -#--------------------------------------------------------------- -# EKS Amazon CloudWatch Observability Role -#--------------------------------------------------------------- -resource "aws_iam_role" "cloudwatch_observability_role" { - name = "eks-cloudwatch-agent-role" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Action = "sts:AssumeRoleWithWebIdentity" - Effect = "Allow" - Principal = { - Federated = module.eks.oidc_provider_arn - } - Condition = { - StringEquals = { - "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" : "system:serviceaccount:amazon-cloudwatch:cloudwatch-agent", - "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" : "sts.amazonaws.com" - } - } - } - ] - }) -} - -resource "aws_iam_role_policy_attachment" "cloudwatch_observability_policy_attachment" { - policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" - role = aws_iam_role.cloudwatch_observability_role.name -} diff --git a/ai-ml/bionemo/cleanup.sh b/ai-ml/bionemo/cleanup.sh deleted file mode 100755 index da1fb7c16..000000000 --- a/ai-ml/bionemo/cleanup.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -set -o errexit -set -o pipefail - -targets=( - "module.eks" - "module.vpc" -) - -#------------------------------------------- -# Helpful to delete the stuck in "Terminating" namespaces -# Rerun the cleanup.sh script to detect and delete the stuck resources -#------------------------------------------- -terminating_namespaces=$(kubectl get namespaces --field-selector 
status.phase=Terminating -o json | jq -r '.items[].metadata.name') - -# If there are no terminating namespaces, exit the script -if [[ -z $terminating_namespaces ]]; then - echo "No terminating namespaces found" -fi - -for ns in $terminating_namespaces; do - echo "Terminating namespace: $ns" - kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - -done - -for target in "${targets[@]}" -do - terraform destroy -target="$target" -auto-approve - destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1) - if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of $target completed successfully" - else - echo "FAILED: Terraform destroy of $target failed" - exit 1 - fi -done - -terraform destroy -auto-approve -destroy_output=$(terraform destroy -auto-approve 2>&1) -if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of all targets completed successfully" -else - echo "FAILED: Terraform destroy of all targets failed" - exit 1 -fi diff --git a/ai-ml/bionemo/eks.tf b/ai-ml/bionemo/eks.tf deleted file mode 100644 index e45e5a816..000000000 --- a/ai-ml/bionemo/eks.tf +++ /dev/null @@ -1,145 +0,0 @@ -#--------------------------------------------------------------- -# EKS Cluster -#--------------------------------------------------------------- -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.15" - - cluster_name = local.name - cluster_version = var.eks_cluster_version - cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. 
- vpc_id = module.vpc.vpc_id - subnet_ids = module.vpc.private_subnets - manage_aws_auth_configmap = true - - #--------------------------------------- - # Note: This can further restricted to specific required for each Add-on and your application - #--------------------------------------- - # Extend cluster security group rules - cluster_security_group_additional_rules = { - ingress_nodes_ephemeral_ports_tcp = { - description = "Nodes on ephemeral ports" - protocol = "tcp" - from_port = 1025 - to_port = 65535 - type = "ingress" - source_node_security_group = true - } - } - - # Extend node-to-node security group rules - node_security_group_additional_rules = { - ingress_self_all = { - description = "Node to node all ports/protocols" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - self = true - } - # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. - # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc. 
- # Change this according to your security requirements if needed - ingress_cluster_to_node_all_traffic = { - description = "Cluster API to Nodegroup all traffic" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - source_cluster_security_group = true - } - } - - eks_managed_node_group_defaults = { - iam_role_additional_policies = { - # Not required, but used in the example to access the nodes to inspect mounted volumes - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - - eks_managed_node_groups = { - # We recommend to have a MNG to place your critical workloads and add-ons - # Then rely on Karpenter to scale your workloads - # You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners - - core_node_group = { - name = "core-node-group" - description = "EKS Core node group for hosting critical add-ons" - # Filtering only Secondary CIDR private subnets starting with "100.". - # Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? subnet_id : null] - ) - - min_size = 3 - max_size = 9 - desired_size = 3 - - instance_types = ["m5.xlarge"] - - ebs_optimized = true - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "core" - } - - tags = merge(local.tags, { - Name = "core-node-grp", - "karpenter.sh/discovery" = local.name - }) - } - - gpu1 = { - name = "gpu-node-grp" - description = "EKS Node Group to run GPU workloads" - # Filtering only Secondary CIDR private subnets starting with "100.". 
- # Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? subnet_id : null] - ) - - ami_type = "AL2_x86_64_GPU" - min_size = 2 - max_size = 3 - desired_size = 2 - - instance_types = ["g5.12xlarge"] - ebs_optimized = true - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 200 - volume_type = "gp3" - } - } - } - taints = { - gpu = { - key = "nvidia.com/gpu" - effect = "NO_SCHEDULE" - operator = "EXISTS" - } - } - labels = { - WorkerType = "ON_DEMAND" - eks-node = "gpu" - } - - tags = merge(local.tags, { - Name = "gpu-node-grp", - "karpenter.sh/discovery" = local.name - }) - } - } -} diff --git a/ai-ml/bionemo/fsx-for-lustre.tf b/ai-ml/bionemo/fsx-for-lustre.tf deleted file mode 100644 index 2175461f0..000000000 --- a/ai-ml/bionemo/fsx-for-lustre.tf +++ /dev/null @@ -1,136 +0,0 @@ -#--------------------------------------------------------------- -# FSx for Lustre File system Static provisioning -# 1> Create Fsx for Lustre filesystem (Lustre FS storage capacity must be 1200, 2400, or a multiple of 3600) -# 2> Create Storage Class for Filesystem (Cluster scoped) -# 3> Persistent Volume with Hardcoded reference to Fsx for Lustre filesystem with filesystem_id and dns_name (Cluster scoped) -# 4> Persistent Volume claim for this persistent volume will always use the same file system (Namespace scoped) -#--------------------------------------------------------------- -# NOTE: FSx for Lustre file system creation can take up to 10 mins -resource "aws_fsx_lustre_file_system" "this" { - deployment_type = "PERSISTENT_2" - storage_type = "SSD" - per_unit_storage_throughput = "500" # 125, 250, 500, 1000 - storage_capacity = 2400 - - subnet_ids = [module.vpc.private_subnets[0]] - security_group_ids = [aws_security_group.fsx.id] - log_configuration { - level = 
"WARN_ERROR" - } - tags = merge({ "Name" : "${local.name}-static" }, local.tags) -} - -# This process can take upto 7 mins -resource "aws_fsx_data_repository_association" "this" { - - file_system_id = aws_fsx_lustre_file_system.this.id - data_repository_path = "s3://${module.fsx_s3_bucket.s3_bucket_id}" - file_system_path = "/data" # This directory will be used in Spark podTemplates under volumeMounts as subPath - - s3 { - auto_export_policy { - events = ["NEW", "CHANGED", "DELETED"] - } - - auto_import_policy { - events = ["NEW", "CHANGED", "DELETED"] - } - } -} - -#--------------------------------------------------------------- -# Sec group for FSx for Lustre -#--------------------------------------------------------------- -resource "aws_security_group" "fsx" { - - name = "${local.name}-fsx" - description = "Allow inbound traffic from private subnets of the VPC to FSx filesystem" - vpc_id = module.vpc.vpc_id - - ingress { - description = "Allows Lustre traffic between Lustre clients" - cidr_blocks = module.vpc.private_subnets_cidr_blocks - from_port = 1021 - to_port = 1023 - protocol = "tcp" - } - ingress { - description = "Allows Lustre traffic between Lustre clients" - cidr_blocks = module.vpc.private_subnets_cidr_blocks - from_port = 988 - to_port = 988 - protocol = "tcp" - } - tags = local.tags -} - -#--------------------------------------------------------------- -# S3 bucket for DataSync between FSx for Lustre and S3 Bucket -#--------------------------------------------------------------- -#tfsec:ignore:aws-s3-enable-bucket-logging tfsec:ignore:aws-s3-enable-versioning -module "fsx_s3_bucket" { - source = "terraform-aws-modules/s3-bucket/aws" - version = "~> 3.0" - - create_bucket = true - - bucket_prefix = "${local.name}-fsx-" - # For example only - please evaluate for your environment - force_destroy = true - - server_side_encryption_configuration = { - rule = { - apply_server_side_encryption_by_default = { - sse_algorithm = "AES256" - } - } - } -} - 
-#--------------------------------------------------------------- -# Storage Class - FSx for Lustre -#--------------------------------------------------------------- -resource "kubectl_manifest" "storage_class" { - - yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-storage-class.yaml", { - subnet_id = module.vpc.private_subnets[0], - security_group_id = aws_security_group.fsx.id - }) - - depends_on = [ - module.eks_blueprints_addons - ] -} - -#--------------------------------------------------------------- -# FSx for Lustre Persistent Volume - Static provisioning -#--------------------------------------------------------------- -resource "kubectl_manifest" "static_pv" { - - yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pv.yaml", { - filesystem_id = aws_fsx_lustre_file_system.this.id, - dns_name = aws_fsx_lustre_file_system.this.dns_name - mount_name = aws_fsx_lustre_file_system.this.mount_name, - }) - - depends_on = [ - module.eks_blueprints_addons, - kubectl_manifest.storage_class, - aws_fsx_lustre_file_system.this - ] -} - -#--------------------------------------------------------------- -# FSx for Lustre Persistent Volume Claim -#--------------------------------------------------------------- -resource "kubectl_manifest" "static_pvc" { - - yaml_body = templatefile("${path.module}/fsx-for-lustre/fsxlustre-static-pvc.yaml", {}) - - depends_on = [ - module.eks_blueprints_addons, - kubectl_manifest.storage_class, - kubectl_manifest.static_pv, - aws_fsx_lustre_file_system.this - ] -} diff --git a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml b/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml deleted file mode 100644 index 857bdcf3a..000000000 --- a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pv.yaml +++ /dev/null @@ -1,21 +0,0 @@ ---- -apiVersion: v1 -kind: PersistentVolume -metadata: - name: fsx-static-pv -spec: - capacity: - storage: 1000Gi - volumeMode: Filesystem - storageClassName: fsx - accessModes: - - 
ReadWriteMany - mountOptions: - - flock - persistentVolumeReclaimPolicy: Recycle - csi: - driver: fsx.csi.aws.com - volumeHandle: ${filesystem_id} - volumeAttributes: - dnsname: ${dns_name} - mountname: ${mount_name} diff --git a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml b/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml deleted file mode 100644 index dddebd66c..000000000 --- a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-static-pvc.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: fsx-static-pvc -spec: - accessModes: - - ReadWriteMany - storageClassName: fsx - resources: - requests: - storage: 1000Gi - volumeName: fsx-static-pv diff --git a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml b/ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml deleted file mode 100644 index 125fb2478..000000000 --- a/ai-ml/bionemo/fsx-for-lustre/fsxlustre-storage-class.yaml +++ /dev/null @@ -1,9 +0,0 @@ ---- -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: fsx -provisioner: fsx.csi.aws.com -parameters: - subnetId: ${subnet_id} - securityGroupIds: ${security_group_id} diff --git a/ai-ml/bionemo/main.tf b/ai-ml/bionemo/main.tf deleted file mode 100644 index dd7d220a0..000000000 --- a/ai-ml/bionemo/main.tf +++ /dev/null @@ -1,53 +0,0 @@ -provider "aws" { - region = local.region -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} - 
-provider "kubectl" { - apply_retry_count = 10 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - token = data.aws_eks_cluster_auth.this.token -} - -data "aws_availability_zones" "available" {} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -#--------------------------------------------------------------- -# Local variables -#--------------------------------------------------------------- -locals { - name = var.name - region = var.region - vpc_cidr = var.vpc_cidr - azs = slice(data.aws_availability_zones.available.names, 0, 2) - - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git a/ai-ml/bionemo/outputs.tf b/ai-ml/bionemo/outputs.tf deleted file mode 100644 index 0f7edf2c1..000000000 --- a/ai-ml/bionemo/outputs.tf +++ /dev/null @@ -1,9 +0,0 @@ -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --alias ${module.eks.cluster_name} --name ${module.eks.cluster_name}" -} - -output "eks_api_server_url" { - description = "Your eks API server endpoint" - value = module.eks.cluster_endpoint -} diff --git a/ai-ml/bionemo/versions.tf b/ai-ml/bionemo/versions.tf deleted file mode 100644 index a62c02b66..000000000 --- a/ai-ml/bionemo/versions.tf +++ /dev/null @@ -1,29 +0,0 @@ -terraform { - required_version = ">= 1.0.0" - - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 3.72" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.10" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.4.1" - } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.14" - } - } - - # ## Used for end-to-end testing on project; update to suit your needs - # backend "s3" 
{ - # bucket = "doeks-github-actions-e2e-test-state" - # region = "us-west-2" - # key = "e2e/bionemo/terraform.tfstate" - # } -} diff --git a/ai-ml/bionemo/vpc.tf b/ai-ml/bionemo/vpc.tf deleted file mode 100644 index f63ccbe0c..000000000 --- a/ai-ml/bionemo/vpc.tf +++ /dev/null @@ -1,57 +0,0 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - - database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - -#--------------------------------------------------------------- -# VPC -#--------------------------------------------------------------- - -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" - - name = local.name - cidr = local.vpc_cidr - azs = local.azs - - # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods - secondary_cidr_blocks = var.secondary_cidr_blocks - - # Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB - private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) - - # 
------------------------------ - # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments - # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = local.public_subnets - - # ------------------------------ - # Private Subnets for MLflow backend store - database_subnets = local.database_private_subnets - create_database_subnet_group = true - create_database_subnet_route_table = true - - enable_nat_gateway = true - single_nat_gateway = true - enable_dns_hostnames = true - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - } - - tags = local.tags -} diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf index f8ae300cc..93cc471cb 100644 --- a/ai-ml/infrastructure/terraform/addons.tf +++ b/ai-ml/infrastructure/terraform/addons.tf @@ -640,6 +640,10 @@ resource "kubectl_manifest" "dcgm" { yaml_body = file("${path.module}/monitoring/dcgm.yaml") } +resource "kubectl_manifest" "dcgm_service" { + yaml_body = file("${path.module}/monitoring/dcgm-service.yaml") +} + data "aws_iam_policy_document" "karpenter_controller_policy" { statement { actions = [ diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf index 465f9c152..f0606cd0e 100644 --- a/ai-ml/infrastructure/terraform/variables.tf +++ b/ai-ml/infrastructure/terraform/variables.tf @@ -92,7 +92,7 @@ variable "enable_jupyterhub" { variable "enable_volcano" { description = "Enable Volcano" type = bool - default = false + default = true } variable "enable_kuberay_operator" { description = "Enable KubeRay Operator" diff --git a/ai-ml/jark-stack/terraform/addons.tf b/ai-ml/jark-stack/terraform/addons.tf deleted file mode 100644 index 4ec1171cf..000000000 --- a/ai-ml/jark-stack/terraform/addons.tf +++ /dev/null @@ -1,460 +0,0 @@ -#--------------------------------------------------------------- -# GP3
Encrypted Storage Class -#--------------------------------------------------------------- -resource "kubernetes_annotations" "disable_gp2" { - annotations = { - "storageclass.kubernetes.io/is-default-class" : "false" - } - api_version = "storage.k8s.io/v1" - kind = "StorageClass" - metadata { - name = "gp2" - } - force = true - - depends_on = [module.eks.eks_cluster_id] -} - -resource "kubernetes_storage_class" "default_gp3" { - metadata { - name = "gp3" - annotations = { - "storageclass.kubernetes.io/is-default-class" : "true" - } - } - - storage_provisioner = "ebs.csi.aws.com" - reclaim_policy = "Delete" - allow_volume_expansion = true - volume_binding_mode = "WaitForFirstConsumer" - parameters = { - fsType = "ext4" - encrypted = true - type = "gp3" - } - - depends_on = [kubernetes_annotations.disable_gp2] -} - -#--------------------------------------------------------------- -# IRSA for EBS CSI Driver -#--------------------------------------------------------------- -module "ebs_csi_driver_irsa" { - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.20" - role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") - attach_ebs_csi_policy = true - oidc_providers = { - main = { - provider_arn = module.eks.oidc_provider_arn - namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - tags = local.tags -} - -#--------------------------------------------------------------- -# EKS Blueprints Addons -#--------------------------------------------------------------- -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.2" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - #--------------------------------------- - # Amazon EKS Managed Add-ons - #--------------------------------------- - eks_addons = { - 
aws-ebs-csi-driver = { - service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn - } - coredns = { - preserve = true - } - kube-proxy = { - preserve = true - } - # VPC CNI uses worker node IAM role policies - vpc-cni = { - preserve = true - } - } - - #--------------------------------------- - # AWS Load Balancer Controller Add-on - #--------------------------------------- - enable_aws_load_balancer_controller = true - # turn off the mutating webhook for services because we are using - # service.beta.kubernetes.io/aws-load-balancer-type: external - aws_load_balancer_controller = { - set = [{ - name = "enableServiceMutatorWebhook" - value = "false" - }] - } - - #--------------------------------------- - # Ingress Nginx Add-on - #--------------------------------------- - enable_ingress_nginx = true - ingress_nginx = { - values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] - } - - #--------------------------------------- - # Karpenter Autoscaler for EKS Cluster - #--------------------------------------- - enable_karpenter = true - karpenter_enable_spot_termination = true - karpenter_node = { - iam_role_additional_policies = { - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - karpenter = { - chart_version = "0.37.0" - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - source_policy_documents = [ - data.aws_iam_policy_document.karpenter_controller_policy.json - ] - } - - #--------------------------------------- - # Argo Workflows & Argo Events - #--------------------------------------- - enable_argo_workflows = true - argo_workflows = { - name = "argo-workflows" - namespace = "argo-workflows" - repository = "https://argoproj.github.io/argo-helm" - values = [templatefile("${path.module}/helm-values/argo-workflows-values.yaml", {})] - } - - enable_argo_events = true - argo_events 
= { - name = "argo-events" - namespace = "argo-events" - repository = "https://argoproj.github.io/argo-helm" - values = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})] - } - - #--------------------------------------- - # Prommetheus and Grafana stack - #--------------------------------------- - #--------------------------------------------------------------- - # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` - # 2- Grafana Admin user: admin - # 3- Get sexret name from Terrafrom output: `terraform output grafana_secret_name` - # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` - #--------------------------------------------------------------- - enable_kube_prometheus_stack = true - kube_prometheus_stack = { - values = [ - templatefile("${path.module}/helm-values/kube-prometheus.yaml", { - storage_class_type = kubernetes_storage_class.default_gp3.id - }) - ] - chart_version = "48.1.1" - set_sensitive = [ - { - name = "grafana.adminPassword" - value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string - } - ], - } - - #--------------------------------------- - # CloudWatch metrics for EKS - #--------------------------------------- - enable_aws_cloudwatch_metrics = true - aws_cloudwatch_metrics = { - values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})] - } - -} - -#--------------------------------------------------------------- -# Data on EKS Kubernetes Addons -#--------------------------------------------------------------- - -module "data_addons" { - source = "aws-ia/eks-data-addons/aws" - version = "1.33.0" - - oidc_provider_arn = module.eks.oidc_provider_arn - - #--------------------------------------------------------------- - # JupyterHub Add-on - #--------------------------------------------------------------- - 
enable_jupyterhub = true - jupyterhub_helm_config = { - namespace = kubernetes_namespace_v1.jupyterhub.id - create_namespace = false - values = [file("${path.module}/helm-values/jupyterhub-values.yaml")] - } - - enable_volcano = true - #--------------------------------------- - # Kuberay Operator - #--------------------------------------- - enable_kuberay_operator = true - kuberay_operator_helm_config = { - version = "1.1.1" - # Enabling Volcano as Batch scheduler for KubeRay Operator - values = [ - <<-EOT - batchScheduler: - enabled: true - EOT - ] - } - - #--------------------------------------------------------------- - # NVIDIA Device Plugin Add-on - #--------------------------------------------------------------- - enable_nvidia_device_plugin = true - nvidia_device_plugin_helm_config = { - version = "v0.16.1" - name = "nvidia-device-plugin" - values = [ - <<-EOT - gfd: - enabled: true - nfd: - worker: - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - operator: "Exists" - EOT - ] - } - - #--------------------------------------- - # EFA Device Plugin Add-on - #--------------------------------------- - # IMPORTANT: Enable EFA only on nodes with EFA devices attached. - # Otherwise, you'll encounter the "No devices found..." error. Restart the pod after attaching an EFA device, or use a node selector to prevent incompatible scheduling. 
- enable_aws_efa_k8s_device_plugin = var.enable_aws_efa_k8s_device_plugin - aws_efa_k8s_device_plugin_helm_config = { - values = [file("${path.module}/helm-values/aws-efa-k8s-device-plugin-values.yaml")] - } - - #--------------------------------------------------------------- - # Kubecost Add-on - #--------------------------------------------------------------- - enable_kubecost = var.enable_kubecost - kubecost_helm_config = { - values = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})] - version = "2.2.2" - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - #--------------------------------------------------------------- - # Karpenter Resources Add-on - #--------------------------------------------------------------- - enable_karpenter_resources = true - karpenter_resources_helm_config = { - - g5-gpu-karpenter = { - values = [ - <<-EOT - name: g5-gpu-karpenter - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - amiFamily: Bottlerocket - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - blockDeviceMappings: - # Root device - - deviceName: /dev/xvda - ebs: - volumeSize: 50Gi - volumeType: gp3 - encrypted: true - # Data device: Container resources such as images and logs - - deviceName: /dev/xvdb - ebs: - volumeSize: 300Gi - volumeType: gp3 - encrypted: true - ${var.bottlerocket_data_disk_snpashot_id != null ? 
"snapshotID: ${var.bottlerocket_data_disk_snpashot_id}" : ""} - - nodePool: - labels: - - type: karpenter - - NodeGroupType: g5-gpu-karpenter - taints: - - key: nvidia.com/gpu - value: "Exists" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["g5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: [ "2xlarge", "4xlarge", "8xlarge" ] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 300s - expireAfter: 720h - weight: 100 - EOT - ] - } - x86-cpu-karpenter = { - values = [ - <<-EOT - name: x86-cpu-karpenter - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - amiFamily: Bottlerocket - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[3]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - # instanceStorePolicy: RAID0 - blockDeviceMappings: - # Root device - - deviceName: /dev/xvda - ebs: - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - # Data device: Container resources such as images and logs - - deviceName: /dev/xvdb - ebs: - volumeSize: 300Gi - volumeType: gp3 - encrypted: true - ${var.bottlerocket_data_disk_snpashot_id != null ? 
"snapshotID: ${var.bottlerocket_data_disk_snpashot_id}" : ""} - - nodePool: - labels: - - type: karpenter - - NodeGroupType: x86-cpu-karpenter - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["m5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 300s - expireAfter: 720h - weight: 100 - EOT - ] - } - } - - depends_on = [ - kubernetes_secret_v1.huggingface_token, - kubernetes_config_map_v1.notebook - ] -} - - -#--------------------------------------------------------------- -# Additional Resources -#--------------------------------------------------------------- - -resource "kubernetes_namespace_v1" "jupyterhub" { - metadata { - name = "jupyterhub" - } -} - - -resource "kubernetes_secret_v1" "huggingface_token" { - metadata { - name = "hf-token" - namespace = kubernetes_namespace_v1.jupyterhub.id - } - - data = { - token = var.huggingface_token - } -} - -resource "kubernetes_config_map_v1" "notebook" { - metadata { - name = "notebook" - namespace = kubernetes_namespace_v1.jupyterhub.id - } - - data = { - "dogbooth.ipynb" = file("${path.module}/src/notebook/dogbooth.ipynb") - } -} - -#--------------------------------------------------------------- -# Grafana Admin credentials resources -# Login to AWS secrets manager with the same role as Terraform to extract the Grafana admin password with the secret name as "grafana" -#--------------------------------------------------------------- -data "aws_secretsmanager_secret_version" "admin_password_version" { - secret_id = aws_secretsmanager_secret.grafana.id - depends_on = [aws_secretsmanager_secret_version.grafana] -} - -resource "random_password" "grafana" { - length = 16 - special = 
true - override_special = "@_" -} - -#tfsec:ignore:aws-ssm-secret-use-customer-key -resource "aws_secretsmanager_secret" "grafana" { - name_prefix = "${local.name}-oss-grafana" - recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy -} - -resource "aws_secretsmanager_secret_version" "grafana" { - secret_id = aws_secretsmanager_secret.grafana.id - secret_string = random_password.grafana.result -} - -data "aws_iam_policy_document" "karpenter_controller_policy" { - statement { - actions = [ - "ec2:RunInstances", - "ec2:CreateLaunchTemplate", - ] - resources = ["*"] - effect = "Allow" - sid = "KarpenterControllerAdditionalPolicy" - } -} diff --git a/ai-ml/jark-stack/terraform/eks.tf b/ai-ml/jark-stack/terraform/eks.tf deleted file mode 100644 index aaf11a9e7..000000000 --- a/ai-ml/jark-stack/terraform/eks.tf +++ /dev/null @@ -1,212 +0,0 @@ -#--------------------------------------------------------------- -# EKS Cluster -#--------------------------------------------------------------- -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.15" - - cluster_name = local.name - cluster_version = var.eks_cluster_version - - # if true, Your cluster API server is accessible from the internet. - # You can, optionally, limit the CIDR blocks that can access the public endpoint. - #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. - cluster_endpoint_public_access = true - - vpc_id = module.vpc.vpc_id - # Filtering only Secondary CIDR private subnets starting with "100.". - # Subnet IDs where the EKS Control Plane ENIs will be created - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) - - manage_aws_auth_configmap = true - aws_auth_roles = [ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - { - rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - } - ] - #--------------------------------------- - # Note: This can further restricted to specific required for each Add-on and your application - #--------------------------------------- - # Extend cluster security group rules - cluster_security_group_additional_rules = { - ingress_nodes_ephemeral_ports_tcp = { - description = "Nodes on ephemeral ports" - protocol = "tcp" - from_port = 0 - to_port = 65535 - type = "ingress" - source_node_security_group = true - } - } - - node_security_group_additional_rules = { - # Allows Control Plane Nodes to talk to Worker nodes on all ports. - # Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. - # This can be restricted further to specific port based on the requirement for each Add-on - # e.g., coreDNS 53, metrics-server 4443. - # Update this according to your security requirements if needed - ingress_cluster_to_node_all_traffic = { - description = "Cluster API to Nodegroup all traffic" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - source_cluster_security_group = true - } - } - - eks_managed_node_group_defaults = { - iam_role_additional_policies = { - # Not required, but used in the example to access the nodes to inspect mounted volumes - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - - ebs_optimized = true - # This block device is used only for root volume. Adjust volume according to your size. 
- # NOTE: Don't use this volume for ML workloads - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - } - - eks_managed_node_groups = { - # It's recommended to have a Managed Node group for hosting critical add-ons - # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups - # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes. - core_node_group = { - name = "core-node-group" - description = "EKS Core node group for hosting system add-ons" - # Filtering only Secondary CIDR private subnets starting with "100.". - # Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? subnet_id : null] - ) - - # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2 - ami_type = "AL2_x86_64" # Use this for Graviton AL2_ARM_64 - min_size = 2 - max_size = 8 - desired_size = 2 - - instance_types = ["m5.xlarge"] - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "core" - } - - tags = merge(local.tags, { - Name = "core-node-grp" - }) - } - - # GPU Nodegroup for JupyterHub Notebook and Ray Service - gpu1 = { - name = "gpu-node-grp" - description = "EKS Node Group to run GPU workloads" - # Filtering only Secondary CIDR private subnets starting with "100.". - # Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null] - ) - - ami_type = "AL2_x86_64_GPU" - min_size = 0 - max_size = 1 - desired_size = 0 - - instance_types = ["g5.12xlarge"] - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "gpu" - } - - taints = { - gpu = { - key = "nvidia.com/gpu" - effect = "NO_SCHEDULE" - operator = "EXISTS" - } - } - - tags = merge(local.tags, { - Name = "gpu-node-grp" - }) - } - - # # This nodegroup can be used for P4/P5 instances with, or without, a Capacity Reservation. - # # - # gpu_p5_node_group = { - # name = "p5-gpu-node-grp" - # description = "EKS Node Group to run GPU workloads" - - # ami_type = "AL2_x86_64_GPU" - - # instance_types = ["p5.48xlarge"] - # capacity_type = "ON_DEMAND" - - # # Filtering only Secondary CIDR private subnets starting with "100.". - # # Subnet IDs where the nodes/node groups will be provisioned - # subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - # substr(cidr_block, 0, 4) == "100." ? subnet_id : null] - # ) - - # # If you are using a Capacity Reservation, the Subnet for the instances must match AZ for the reservation. - # # subnet_ids = ["subnet-01234567890fds"] - # # capacity_reservation_specification = { - # # capacity_reservation_target = { - # # capacity_reservation_id = "cr-01234567890fds" - # # } - # # } - - # min_size = 1 - # max_size = 1 - # desired_size = 1 - - # # The P Series can leverage EFA devices, below we attach EFA interfaces to all of the available slots to the instance - # # we assign the host interface device_index=0, and all other interfaces device_index=1 - # # p5.48xlarge has 32 network card indexes so the range should be 31, we'll create net interfaces 0-31 - # # p4 instances have 4 network card indexes so the range should be 4, we'll create Net interfaces 0-3 - # network_interfaces = [ - # for i in range(32) : { - # associate_public_ip_address = false - # delete_on_termination = true - # device_index = i == 0 ? 
0 : 1 - # network_card_index = i - # interface_type = "efa" - # } - # ] - - # # add `--local-disks raid0` to use the NVMe devices underneath the Pods, kubelet, containerd, and logs: https://github.com/awslabs/amazon-eks-ami/pull/1171 - # bootstrap_extra_args = "--local-disks raid0" - # taints = { - # gpu = { - # key = "nvidia.com/gpu" - # effect = "NO_SCHEDULE" - # operator = "EXISTS" - # } - # } - # labels = { - # WorkerType = "ON_DEMAND" - # NodeGroupType = "gpu" - # } - # tags = merge(local.tags, { - # Name = "p5-gpu-node-grp" - # }) - # } - } -} diff --git a/ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml b/ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml deleted file mode 100644 index de495c16a..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/argo-events-values.yaml +++ /dev/null @@ -1,4 +0,0 @@ -## Argo Events admission webhook -webhook: - # -- Enable admission webhook. Applies only for cluster-wide installation - enabled: true diff --git a/ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml b/ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml deleted file mode 100644 index 2f6c9e729..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/argo-workflows-values.yaml +++ /dev/null @@ -1,5 +0,0 @@ -server: - autoscaling: - enabled: true - minReplicas: 1 - serviceType: LoadBalancer diff --git a/ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml deleted file mode 100644 index ae3c41d44..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/aws-cloudwatch-metrics-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - limits: - cpu: 500m - memory: 2Gi - requests: - cpu: 200m - memory: 1Gi - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. 
-tolerations: - - operator: Exists diff --git a/ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml b/ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml deleted file mode 100644 index c214e10ba..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/aws-efa-k8s-device-plugin-values.yaml +++ /dev/null @@ -1,5 +0,0 @@ -tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - operator: "Exists" diff --git a/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml b/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml deleted file mode 100644 index c8b1a5d74..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/ingress-nginx-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -controller: - service: - externalTrafficPolicy: "Local" - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC - targetPorts: - http: http - https: http diff --git a/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml b/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml deleted file mode 100644 index fcad06b62..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/jupyterhub-values.yaml +++ /dev/null @@ -1,59 +0,0 @@ -hub: - config: - Authenticator: - admin_users: - - admin1 - allowed_users: - - user1 - # testing only - do not do this for production - DummyAuthenticator: - password: never-do-this - JupyterHub: - authenticator_class: dummy -proxy: - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC - service.beta.kubernetes.io/aws-load-balancer-type: 
external - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' - service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 -singleuser: - image: - name: public.ecr.aws/h3o5n2r0/gpu-jupyter - tag: v1.5_cuda-11.6_ubuntu-20.04_python-only - pullPolicy: Always - cmd: null - startTimeout: 600 - memory: - guarantee: 24G - extraResource: - limits: - nvidia.com/gpu: "1" - extraEnv: - HUGGING_FACE_HUB_TOKEN: - valueFrom: - secretKeyRef: - name: hf-token - key: token - storage: - capacity: 100Gi - extraVolumes: - - name: shm-volume - emptyDir: - medium: Memory - - name: notebook - configMap: - name: notebook - extraVolumeMounts: - - name: shm-volume - mountPath: /dev/shm - - name: notebook - mountPath: /home/jovyan/dogbooth - extraTolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule -scheduling: - userScheduler: - enabled: false diff --git a/ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml b/ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml deleted file mode 100644 index 47e090743..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/kube-prometheus.yaml +++ /dev/null @@ -1,48 +0,0 @@ -prometheus: - prometheusSpec: - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: ${storage_class_type} - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true -prometheus: - prometheusSpec: - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: 
${storage_class_type} - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true diff --git a/ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml b/ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml deleted file mode 100644 index 178eb68cf..000000000 --- a/ai-ml/jark-stack/terraform/helm-values/kubecost-values.yaml +++ /dev/null @@ -1,69 +0,0 @@ - -# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090 - -global: - # pricingCsv: - # enabled: false - # location: - # provider: "AWS" - # region: "us-east-1" - # URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI - # csvAccessCredentials: pricing-schema-access-secret - - prometheus: - enabled: true # Kubecost depends on Prometheus data, it is not optional. When enabled: false, Prometheus will not be installed and you must configure your own Prometheus to scrape kubecost as well as provide the fqdn below. -- Warning: Before changing this setting, please read to understand the risks https://docs.kubecost.com/install-and-configure/install/custom-prom - fqdn: http://cost-analyzer-prometheus-server.default.svc # example address of a prometheus to connect to. Include protocol (http:// or https://) Ignored if enabled: true - - grafana: - enabled: true # If false, Grafana will not be installed - domainName: cost-analyzer-grafana.default.svc # example grafana domain Ignored if enabled: true - scheme: "http" # http or https, for the domain name above. 
- proxy: true # If true, the kubecost frontend will route to your grafana through its service endpoint - -kubecostFrontend: - image: public.ecr.aws/kubecost/frontend - resources: - requests: - cpu: "200m" - memory: "512Mi" - -kubecostMetrics: - emitPodAnnotations: true - emitNamespaceAnnotations: true - -kubecostModel: - image: public.ecr.aws/kubecost/cost-model - resources: - requests: - cpu: "500m" - memory: "512Mi" - -forecasting: - fullImageName: public.ecr.aws/kubecost/kubecost-modeling:v0.1.6 - -networkCosts: - image: - repository: public.ecr.aws/kubecost/kubecost-network-costs - -clusterController: - image: - repository: public.ecr.aws/kubecost/cluster-controller - -prometheus: - server: - image: - repository: public.ecr.aws/kubecost/prometheus - - configmapReload: - prometheus: - image: - repository: public.ecr.aws/kubecost/prometheus-config-reloader - -reporting: - productAnalytics: false - -# Define persistence volume for cost-analyzer -persistentVolume: - size: 32Gi - dbSize: 32.0Gi - enabled: true # Note that setting this to false means configurations will be wiped out on pod restart. 
diff --git a/ai-ml/jark-stack/terraform/karpenter.tf b/ai-ml/jark-stack/terraform/karpenter.tf deleted file mode 100644 index e69de29bb..000000000 diff --git a/ai-ml/jark-stack/terraform/main.tf b/ai-ml/jark-stack/terraform/main.tf deleted file mode 100644 index f93511951..000000000 --- a/ai-ml/jark-stack/terraform/main.tf +++ /dev/null @@ -1,51 +0,0 @@ -provider "aws" { - region = local.region -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} -provider "kubectl" { - apply_retry_count = 30 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - load_config_file = false -} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_availability_zones" "available" {} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -locals { - name = var.name - region = var.region - azs = slice(data.aws_availability_zones.available.names, 0, 2) - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git a/ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml b/ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml deleted file mode 100644 index 8ade99739..000000000 --- a/ai-ml/jark-stack/terraform/monitoring/podMonitor.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor 
-metadata: - name: ray-workers-monitor - namespace: kube-prometheus-stack - labels: - # `release: $HELM_RELEASE`: Prometheus can only detect PodMonitor with this label. - release: kube-prometheus-stack -spec: - jobLabel: ray-workers - # Only select Kubernetes Pods in the "default" namespace. - namespaceSelector: - matchNames: - - rayserve-vllm - # Only select Kubernetes Pods with "matchLabels". - selector: - matchLabels: - ray.io/node-type: worker - # A list of endpoints allowed as part of this PodMonitor. - podMetricsEndpoints: - - port: metrics diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json deleted file mode 100644 index 26d11b3f1..000000000 --- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/data_grafana_dashboard.json +++ /dev/null @@ -1,4535 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "iteration": 1667344411089, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount spilled by dataset operators. 
DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "hiddenSeries": false, - "id": 1, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_spilled_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Spilled: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Spilled", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { 
- "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount allocated by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_allocated_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Allocated: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Allocated", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": 
"individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount freed by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "hiddenSeries": false, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_freed_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": 
"Bytes Freed: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Freed", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Amount of memory store used by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - 
"stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_current_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Current Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Object Store Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical CPUs allocated to dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 2 - }, - "hiddenSeries": false, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - 
"color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_cpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "CPU Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "CPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical GPUs allocated to dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 2 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - 
"linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_gpu_usage_cores{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "GPU Usage: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Total bytes outputted by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 3 - }, - "hiddenSeries": false, - "id": 7, 
- "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_output_bytes{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Outputted: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Outputted", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - 
"datasource": "${datasource}", - "description": "Total rows outputted by dataset operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 3 - }, - "hiddenSeries": false, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_output_rows{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Rows Outputted: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows Outputted", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "rows", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - 
"$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of input blocks received by operator.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "hiddenSeries": false, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Received: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Blocks Received by Operator", - "tooltip": { - "shared": true, - "sort": 
0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks received by operator.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "hiddenSeries": false, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_bytes_inputs_received{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, 
operator)", - "interval": "", - "legendFormat": "Bytes Received: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Blocks Received by Operator", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of input blocks that operator's tasks have finished processing.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 5 - }, - "hiddenSeries": false, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": 
"object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Processed: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Blocks Processed by Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks that operator's tasks have finished processing.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 5 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - 
"pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_bytes_task_inputs_processed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Processed: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Bytes Processed by Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks passed to submitted tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 6 - }, - "hiddenSeries": false, - "id": 21, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - 
"hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_bytes_inputs_of_submitted_tasks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Submitted: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Input Bytes Submitted to Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of output 
blocks generated by tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 6 - }, - "hiddenSeries": false, - "id": 22, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Generated: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Blocks Generated by Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - 
"format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks generated by tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 - }, - "hiddenSeries": false, - "id": 23, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_bytes_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Generated: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Bytes Generated by Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": 
"individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of rows in generated output blocks from finished tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 7 - }, - "hiddenSeries": false, - "id": 24, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_rows_task_outputs_generated{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, 
operator)", - "interval": "", - "legendFormat": "Rows Generated: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows Generated by Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "rows", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of output blocks that are already taken by downstream operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "hiddenSeries": false, - "id": 25, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": 
"object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Blocks Taken: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Output Blocks Taken by Downstream Operators", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks that are already taken by downstream operators.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "hiddenSeries": false, - "id": 26, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - 
"pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_bytes_outputs_taken{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Taken: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Output Bytes Taken by Downstream Operators", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of submitted tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "hiddenSeries": false, - "id": 29, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, 
- "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_submitted{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Submitted Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Submitted Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of running tasks.", - "fieldConfig": { - "defaults": {}, - 
"overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "hiddenSeries": false, - "id": 30, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_running{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Running Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Running Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - 
} - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of tasks that already have output.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 10 - }, - "hiddenSeries": false, - "id": 31, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_have_outputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Tasks with output blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Tasks with output blocks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - 
"show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of finished tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 10 - }, - "hiddenSeries": false, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_num_tasks_finished{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Finished Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - 
"thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Finished Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of failed tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 11 - }, - "hiddenSeries": false, - "id": 33, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - 
"exemplar": true, - "expr": "sum(ray_data_num_tasks_failed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Failed Tasks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Failed Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Time spent generating blocks in tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 11 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_block_generation_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Block Generation Time: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Block Generation Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Time spent in task submission backpressure.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 12 - }, - "hiddenSeries": false, - "id": 37, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - 
}, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_task_submission_backpressure_time{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Backpressure Time: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Task Submission Backpressure Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of blocks in operator's internal input queue", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 12 - }, - "hiddenSeries": false, - "id": 13, - "legend": { - "alignAsTable": 
true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_inqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Inqueue Size (Blocks)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - 
"dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks in the operator's internal input queue.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 13 - }, - "hiddenSeries": false, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_inqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Inqueue Size (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - 
"label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of blocks in operator's internal output queue", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 13 - }, - "hiddenSeries": false, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_outqueue_blocks{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Number of Blocks: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], 
- "timeShift": null, - "title": "Operator Internal Outqueue Size (Blocks)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "blocks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of output blocks in the operator's internal output queue.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 14 - }, - "hiddenSeries": false, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - 
"exemplar": true, - "expr": "sum(ray_data_obj_store_mem_internal_outqueue{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Operator Internal Outqueue Size (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of input blocks used by pending tasks.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 14 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - 
"$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_pending_task_inputs{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Size of Blocks used in Pending Tasks (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of freed memory in object store.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 15 - }, - "hiddenSeries": false, - "id": 35, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - 
"nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_freed{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Freed Memory in Object Store (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Byte size of spilled memory in object store.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 15 - }, - "hiddenSeries": false, - 
"id": 36, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_obj_store_mem_spilled{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset, operator)", - "interval": "", - "legendFormat": "Bytes Size: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Spilled Memory in Object Store (Bytes)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - 
"dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds spent in iterator initialization code", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_initialize_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}, {{operator}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration Initialization Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, 
- "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds user thread is blocked by iter_batches()", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "hiddenSeries": false, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_total_blocked_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", - "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration Blocked Time", - "tooltip": { - 
"shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Seconds spent in user code", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 17 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_data_iter_user_seconds{dataset=~\"$DatasetID\",SessionName=~\"$SessionName\",}) by (dataset)", 
- "interval": "", - "legendFormat": "Seconds: {{dataset}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Iteration User Time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [ - "rayVersion:2.24.0" - ], - "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries of a specific Prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".+", - "current": { - "selected": false - }, - "datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, SessionName)", - "description": "Filter queries to specific ray sessions.", - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "SessionName", - "options": [], - "query": { - "query": "label_values(ray_data_allocated_bytes{}, SessionName)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - 
"datasource": "${datasource}", - "definition": "label_values(ray_data_allocated_bytes{}, dataset)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "DatasetID", - "options": [], - "query": { - "query": "label_values(ray_data_allocated_bytes{}, dataset)", - "refId": "Prometheus-Dataset-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "rayMeta": [ - "excludesSystemRoutes", - "supportsGlobalFilterOverride" - ], - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Data Dashboard", - "uid": "rayDataDashboard", - "version": 1 -} diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json deleted file mode 100644 index 7814395f5..000000000 --- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/default_grafana_dashboard.json +++ /dev/null @@ -1,2836 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "iteration": 1667344411089, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. 
Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "hiddenSeries": false, - "id": 26, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", - "interval": "", - "legendFormat": "{{State}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(max_over_time(ray_tasks{IsRetry!=\"0\",State=~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}[14d])) by (State) or clamp_min(sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (State), 0)", - "interval": "", - "legendFormat": "{{State}} (retry)", - "queryType": "randomWalk", - 
"refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler Task State", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "hiddenSeries": false, - "id": 35, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": 
true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_tasks{IsRetry=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", - "interval": "", - "legendFormat": "{{Name}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_tasks{IsRetry!=\"0\",State!~\"FINISHED|FAILED\",SessionName=~\"$SessionName\",}) by (Name)", - "interval": "", - "legendFormat": "{{Name}} (retry)", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Active Tasks by Name", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "hiddenSeries": false, - "id": 33, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - 
"values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_actors{SessionName=~\"$SessionName\",}) by (State)", - "interval": "", - "legendFormat": "{{State}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler Actor State", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "actors", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of (live) actors with a particular name.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - 
"alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_actors{State!=\"DEAD\",SessionName=~\"$SessionName\",}) by (Name)", - "interval": "", - "legendFormat": "{{Name}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Active Actors by Name", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "actors", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical CPU usage of Ray. 
The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 2 - }, - "hiddenSeries": false, - "id": 27, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",State=\"USED\",SessionName=~\"$SessionName\",}) by (instance)", - "interval": "", - "legendFormat": "CPU Usage: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"CPU\",SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": 
"((sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"CPU\",SessionName=~\"$SessionName\",}) or vector(0)))", - "interval": "", - "legendFormat": "MAX + PENDING", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler CPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. 
Refer to metric_defs.cc for more information.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 2 - }, - "hiddenSeries": false, - "id": 29, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) by (Location)", - "interval": "", - "legendFormat": "{{Location}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Object Store Memory", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": 
"object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 3 - }, - "hiddenSeries": false, - "id": 28, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_resources{Name=\"GPU\",State=\"USED\",SessionName=~\"$SessionName\",}", - 
"interval": "", - "legendFormat": "GPU Usage: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_resources{Name=\"GPU\",SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "((sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) and (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) + (sum(autoscaler_pending_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)) > (sum(autoscaler_cluster_resources{resource=\"GPU\",SessionName=~\"$SessionName\",}) or vector(0)))", - "interval": "", - "legendFormat": "MAX + PENDING", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler GPUs (logical slots)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "GPUs", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - 
"fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 3 - }, - "hiddenSeries": false, - "id": 40, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_placement_groups{SessionName=~\"$SessionName\",}) by (State)", - "interval": "", - "legendFormat": "{{State}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduler Placement Groups", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "placement groups", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - 
"aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 4 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_cpu_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * ray_node_cpu_count{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", - "interval": "", - "legendFormat": "CPU Usage: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node CPU (hardware utilization)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - 
}, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 4 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"ray_node_gpus_utilization{instance=~\"$Instance\",SessionName=~\"$SessionName\",} / 100", - "interval": "", - "legendFormat": "GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gpus_available{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node GPU (hardware utilization)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "GPUs", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. 
", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 5 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_disk_usage{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Disk Used: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Disk", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - 
"format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Disk IO per node.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 5 - }, - "hiddenSeries": false, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_disk_io_write_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Write: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "ray_node_disk_io_read_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - 
"legendFormat": "Read: {{instance}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Disk IO Speed", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. 
Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 6 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_mem_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Memory Used: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Memory (heap + object store)", - "tooltip": { - "shared": true, - 
"sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 6 - }, - "hiddenSeries": false, - "id": 44, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - 
"steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_memory_manager_worker_eviction_total{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "OOM Killed: {{Name}}, {{instance}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Out of Memory Failures by Name", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "failures", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. 
Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 7 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "(sum(ray_component_rss_mb{SessionName=~\"$SessionName\",} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{SessionName=~\"$SessionName\",}) by (Component))", - "interval": "", - "legendFormat": "{{Component}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_shared_bytes{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "shared_memory", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_total{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": 
"randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Memory by Component", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 7 - }, - "hiddenSeries": false, - "id": 37, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - 
"alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_component_cpu_percentage{SessionName=~\"$SessionName\",}) by (Component) / 100", - "interval": "", - "legendFormat": "{{Component}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_cpu_count{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node CPU by Component", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "cores", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The physical (hardware) GPU memory usage for each node. 
The dotted line means the total amount of GPU memory from the cluster.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 8 - }, - "hiddenSeries": false, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_gram_used{instance=~\"$Instance\",SessionName=~\"$SessionName\",} * 1024 * 1024", - "interval": "", - "legendFormat": "Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "(sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 1024 * 1024", - "interval": "", - "legendFormat": "MAX", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node GPU Memory (GRAM)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", 
- "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "bytes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Network speed per node", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 8 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_node_network_receive_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Recv: {{instance}}", - "queryType": "randomWalk", - "refId": "A" 
- }, - { - "exemplar": true, - "expr": "ray_node_network_send_speed{instance=~\"$Instance\",SessionName=~\"$SessionName\",}", - "interval": "", - "legendFormat": "Send: {{instance}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node Network", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. 
The node is unavailable now because it is being provisioned and initialized.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "hiddenSeries": false, - "id": 24, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(autoscaler_active_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", - "interval": "", - "legendFormat": "Active Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", - "interval": "", - "legendFormat": "Failed Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{SessionName=~\"$SessionName\",}) by (NodeType)", - "interval": "", - "legendFormat": "Pending Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": 
[], - "timeShift": null, - "title": "Node Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "nodes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "hiddenSeries": false, - "id": 41, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - 
"targets": [ - { - "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{SessionName=~\"$SessionName\",})", - "interval": "", - "legendFormat": "CPU (physical)", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{SessionName=~\"$SessionName\",}) / on() (sum(autoscaler_cluster_resources{resource='GPU',SessionName=~\"$SessionName\",}) or vector(0))", - "interval": "", - "legendFormat": "GPU (physical)", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_mem_total{SessionName=~\"$SessionName\",})) * 100", - "interval": "", - "legendFormat": "Memory (RAM)", - "queryType": "randomWalk", - "refId": "C" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gram_used{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_gram_available{SessionName=~\"$SessionName\",}) + sum(ray_node_gram_used{SessionName=~\"$SessionName\",})) * 100", - "interval": "", - "legendFormat": "GRAM", - "queryType": "randomWalk", - "refId": "D" - }, - { - "exemplar": true, - "expr": "sum(ray_object_store_memory{SessionName=~\"$SessionName\",}) / on() sum(ray_resources{Name=\"object_store_memory\",SessionName=~\"$SessionName\",}) * 100", - "interval": "", - "legendFormat": "Object Store Memory", - "queryType": "randomWalk", - "refId": "E" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_usage{SessionName=~\"$SessionName\",}) / on() (sum(ray_node_disk_free{SessionName=~\"$SessionName\",}) + sum(ray_node_disk_usage{SessionName=~\"$SessionName\",})) * 100", - "interval": "", - "legendFormat": "Disk", - "queryType": "randomWalk", - "refId": "F" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Cluster Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": 
null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "%", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [ - "rayVersion:2.24.0" - ], - "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries of a specific Prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".+", - "current": { - "selected": false - }, - "datasource": "${datasource}", - "definition": "label_values(ray_node_network_receive_speed{}, SessionName)", - "description": "Filter queries to specific ray sessions.", - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "SessionName", - "options": [], - "query": { - "query": "label_values(ray_node_network_receive_speed{}, SessionName)", - "refId": "StandardVariableQuery" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 2, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".+", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, instance)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "Instance", - "options": [], - "query": { - "query": "label_values(ray_node_network_receive_speed{SessionName=\"$SessionName\",}, 
instance)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Default Dashboard", - "uid": "rayDefaultDashboard", - "version": 4, - "rayMeta": [ - "supportsGlobalFilterOverride" - ] -} diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json deleted file mode 100644 index 8648e308a..000000000 --- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_deployment_grafana_dashboard.json +++ /dev/null @@ -1,2115 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "iteration": 1667344411089, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of replicas per deployment. 
Ignores \"Route\" variable.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 0, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 1, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Replicas per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "replicas", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": 
"object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "QPS for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 0, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_request_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "QPS per replica", - "tooltip": { - "shared": 
true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "qps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Error QPS for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 0, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(rate(ray_serve_deployment_error_counter_total{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica)", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Error QPS per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "qps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P50 latency per replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 1, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", 
- "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P50 latency per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P90 latency per replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 1, - 
"w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P90 latency per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { 
- "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P99 latency per replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 1, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",route!~\"/-/.*\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) 
by (application, deployment, replica, le))", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{route=~\"$Route\",application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P99 latency per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of requests queued per deployment. 
Ignores \"Replica\" and \"Route\" variable.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 2, - "w": 12, - "h": 8 - }, - "hiddenSeries": false, - "id": 7, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_deployment_queued_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment)", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Queue size per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "requests", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - 
"$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Current running requests for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 12, - "y": 2, - "w": 12, - "h": 8 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_replica_processing_queries{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Running requests per replica", - "tooltip": { - "shared": true, 
- "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "requests", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of multiplexed models for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 3, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(ray_serve_num_multiplexed_models{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Multiplexed models per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "models", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of times of multiplexed models loaded for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 3, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": 
"object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Multiplexed model loads per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "times", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of times of multiplexed models unloaded for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 3, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - 
"linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_multiplexed_models_unload_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}) by (application, deployment, replica)", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Multiplexed model unloads per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "times", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P99 latency of mutliplexed model load per replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 
4, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P99 latency of multiplexed model loads per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": 
null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P99 latency of mutliplexed model unload per replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 4, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])) by (application, deployment, replica, le))", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P99 latency of multiplexed model unloads per replica", - "tooltip": { - "shared": true, - "sort": 0, - 
"value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The ids of multiplexed models for each replica.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 4, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"ray_serve_registered_multiplexed_model_id{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}", - "interval": "", - "legendFormat": "{{replica}}:{{model_id}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Multiplexed model ids per replica", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "model", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The cache hit rate of multiplexed models for the deployment.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 5, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{application=~\"$Application\",deployment=~\"$Deployment\",replica=~\"$Replica\",}[5m])))", - "interval": "", - "legendFormat": "{{replica}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Multiplexed model cache hit rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "%", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [ - "rayVersion:2.24.0" - ], - "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries to specific prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": 
"${datasource}", - "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "Application", - "options": [], - "query": { - "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "Deployment", - "options": [], - "query": { - "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",}, deployment)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "Replica", - "options": [], - "query": { - "query": "label_values(ray_serve_deployment_replica_healthy{application=~\"$Application\",deployment=~\"$Deployment\",}, replica)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - 
"skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "Route", - "options": [], - "query": { - "query": "label_values(ray_serve_deployment_request_counter{deployment=~\"$Deployment\",}, route)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "rayMeta": [ - "excludesSystemRoutes", - "supportsGlobalFilterOverride" - ], - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Serve Deployment Dashboard", - "uid": "rayServeDeploymentDashboard", - "version": 1 -} diff --git a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json b/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json deleted file mode 100644 index 4d1ec6e8e..000000000 --- a/ai-ml/jark-stack/terraform/monitoring/ray-dashboards/serve_grafana_dashboard.json +++ /dev/null @@ -1,3098 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "iteration": 1667344411089, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Aggregated 
utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster. Ignores application variable.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 0, - "w": 12, - "h": 8 - }, - "hiddenSeries": false, - "id": 5, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "avg(ray_node_cpu_utilization{})", - "interval": "", - "legendFormat": "CPU (physical)", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gpus_utilization{}) / on() (sum(autoscaler_cluster_resources{resource='GPU',}) or vector(0))", - "interval": "", - "legendFormat": "GPU (physical)", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(ray_node_mem_used{}) / on() (sum(ray_node_mem_total{})) * 100", - "interval": "", - "legendFormat": "Memory (RAM)", - "queryType": "randomWalk", - "refId": "C" - }, - { - "exemplar": true, - "expr": "sum(ray_node_gram_used{}) / on() 
(sum(ray_node_gram_available{}) + sum(ray_node_gram_used{})) * 100", - "interval": "", - "legendFormat": "GRAM", - "queryType": "randomWalk", - "refId": "D" - }, - { - "exemplar": true, - "expr": "sum(ray_object_store_memory{}) / on() sum(ray_resources{Name=\"object_store_memory\",}) * 100", - "interval": "", - "legendFormat": "Object Store Memory", - "queryType": "randomWalk", - "refId": "E" - }, - { - "exemplar": true, - "expr": "sum(ray_node_disk_usage{}) / on() (sum(ray_node_disk_free{}) + sum(ray_node_disk_usage{})) * 100", - "interval": "", - "legendFormat": "Disk", - "queryType": "randomWalk", - "refId": "F" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Cluster Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "%", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "QPS for each selected application.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 12, - "y": 0, - "w": 12, - "h": 8 - }, - "hiddenSeries": false, - "id": 7, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - 
"alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_serve_num_http_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", - "interval": "", - "legendFormat": "{{application, route}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(rate(ray_serve_num_grpc_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", - "interval": "", - "legendFormat": "{{application, method}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "QPS per application", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "qps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - 
"datasource": "${datasource}", - "description": "Error QPS for each selected application.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 1, - "w": 12, - "h": 8 - }, - "hiddenSeries": false, - "id": 8, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route)", - "interval": "", - "legendFormat": "{{application, route}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method)", - "interval": "", - "legendFormat": "{{application, method}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Error QPS per 
application", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "qps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Error QPS for each selected application.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 12, - "y": 1, - "w": 12, - "h": 8 - }, - "hiddenSeries": false, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": 
"sum(rate(ray_serve_num_http_error_requests_total{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, error_code)", - "interval": "", - "legendFormat": "{{application, route, error_code}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(rate(ray_serve_num_grpc_error_requests_total{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, error_code)", - "interval": "", - "legendFormat": "{{application, method, error_code}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Error QPS per application per error code", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "qps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P50 latency for selected applications.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 2, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 12, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - 
"nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", - "interval": "", - "legendFormat": "{{application, route}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", - "interval": "", - "legendFormat": "{{application, method}}", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P50 latency per application", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - 
"$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P90 latency for selected applications.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 2, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", - "interval": "", - "legendFormat": "{{application, route}}", - "queryType": "randomWalk", - 
"refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", - "interval": "", - "legendFormat": "{{application, method}}", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P90 latency per application", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P99 latency for selected applications.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 2, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 16, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - 
"lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",route=~\"$HTTP_Route\",route!~\"/-/.*\",}[5m])) by (application, route, le))", - "interval": "", - "legendFormat": "{{application, route}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{application=~\"$Application\",application!~\"\",method=~\"$gRPC_Method\",}[5m])) by (application, method, le))", - "interval": "", - "legendFormat": "{{application, method}}", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate({__name__=~ \"ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket\",application=~\"$Application\",application!~\"\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P99 latency per application", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": 
"time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of replicas per deployment. Ignores \"Application\" variable.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 3, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_deployment_replica_healthy{}) by (application, deployment)", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - } - 
], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Replicas per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "replicas", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "QPS for each deployment.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 3, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ 
- { - "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_request_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "QPS per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "qps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Error QPS for each deployment.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 3, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 14, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": 
"/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(ray_serve_deployment_error_counter_total{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment)", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Error QPS per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "qps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P50 latency per deployment.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 4, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 9, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": 
false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P50 latency per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P90 
latency per deployment.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 4, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, deployment, le))", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P90 latency per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - 
}, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "P99 latency per deployment.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 4, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 11, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (application, 
deployment, le))", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{application=~\"$Application\",application!~\"\",}[5m])) by (le))", - "interval": "", - "legendFormat": "Total", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "P99 latency per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of requests queued per deployment. 
Ignores \"Application\" variable.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 0, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 5, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 3, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_serve_deployment_queued_queries{}) by (application, deployment)", - "interval": "", - "legendFormat": "{{application, deployment}}", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Queue size per deployment", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "requests", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": 
null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Number of nodes in this cluster. Ignores \"Application\" variable.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 5, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(autoscaler_active_nodes{}) by (NodeType)", - "interval": "", - "legendFormat": "Active Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(autoscaler_recently_failed_nodes{}) by (NodeType)", - "interval": "", - "legendFormat": "Failed Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "B" - }, - { - "exemplar": true, - "expr": "sum(autoscaler_pending_nodes{}) by (NodeType)", - "interval": "", - "legendFormat": 
"Pending Nodes: {{NodeType}}", - "queryType": "randomWalk", - "refId": "C" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "nodes", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "Network speed per node. Ignores \"Application\" variable.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 5, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 6, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - 
"stack": false - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(ray_node_network_receive_speed{}) by (instance)", - "interval": "", - "legendFormat": "Recv: {{instance}}", - "queryType": "randomWalk", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum(ray_node_network_send_speed{}) by (instance)", - "interval": "", - "legendFormat": "Send: {{instance}}", - "queryType": "randomWalk", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Node network", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of ongoing requests in the HTTP Proxy.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 6, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 20, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - 
"renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_serve_num_ongoing_http_requests{}", - "interval": "", - "legendFormat": "Ongoing HTTP Requests", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Ongoing HTTP Requests", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "requests", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of ongoing requests in the gRPC Proxy.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 6, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 21, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": 
true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_serve_num_ongoing_grpc_requests{}", - "interval": "", - "legendFormat": "Ongoing gRPC Requests", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Ongoing gRPC Requests", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "requests", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of request scheduling tasks in the router.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 6, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 22, - "legend": { - "alignAsTable": true, - 
"avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_serve_num_scheduling_tasks{}", - "interval": "", - "legendFormat": "Scheduling Tasks", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduling Tasks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of request scheduling tasks in the router that are undergoing backoff.", - 
"fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 0, - "y": 7, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 23, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_serve_num_scheduling_tasks_in_backoff{}", - "interval": "", - "legendFormat": "Scheduling Tasks in Backoff", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Scheduling Tasks in Backoff", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "tasks", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": 
false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The duration of the last control loop.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 8, - "y": 7, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 24, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_serve_controller_control_loop_duration_s{}", - "interval": "", - "legendFormat": "Control Loop Duration", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Controller Control Loop Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "seconds", - "label": "", - "logBase": 
1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${datasource}", - "description": "The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 10, - "fillGradient": 0, - "gridPos": { - "x": 16, - "y": 7, - "w": 8, - "h": 8 - }, - "hiddenSeries": false, - "id": 25, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.17", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "$$hashKey": "object:2987", - "alias": "MAX", - "dashes": true, - "color": "#1F60C4", - "fill": 0, - "stack": false - }, - { - "$$hashKey": "object:78", - "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", - "hiddenSeries": true - }, - { - "$$hashKey": "object:2987", - "alias": "MAX + PENDING", - "dashes": true, - "color": "#777777", - "fill": 0, - "stack": false - } - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "ray_serve_controller_num_control_loops{}", - "interval": "", - "legendFormat": "Control Loops", - "queryType": "randomWalk", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Number of Control Loops", - "tooltip": { - "shared": 
true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:628", - "format": "loops", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:629", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - } - ], - "refresh": false, - "schemaVersion": 27, - "style": "dark", - "tags": [ - "rayVersion:2.24.0" - ], - "templating": { - "list": [ - { - "current": { - "selected": false - }, - "description": "Filter queries of a specific Prometheus type.", - "hide": 2, - "includeAll": false, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_deployment_replica_healthy{}, application)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": null, - "multi": true, - "name": "Application", - "options": [], - "query": { - "query": "label_values(ray_serve_deployment_replica_healthy{}, application)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_num_http_requests_total{}, route)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": "HTTP Route", - 
"multi": true, - "name": "HTTP_Route", - "options": [], - "query": { - "query": "label_values(ray_serve_num_http_requests_total{}, route)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${datasource}", - "definition": "label_values(ray_serve_num_grpc_requests{}, method)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": "gRPC Service Method", - "multi": true, - "name": "gRPC_Method", - "options": [], - "query": { - "query": "label_values(ray_serve_num_grpc_requests{}, method)", - "refId": "Prometheus-Instance-Variable-Query" - }, - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "rayMeta": [ - "excludesSystemRoutes", - "supportsGlobalFilterOverride" - ], - "time": { - "from": "now-30m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Serve Dashboard", - "uid": "rayServeDashboard", - "version": 1 -} diff --git a/ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml b/ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml deleted file mode 100644 index dbda70c40..000000000 --- a/ai-ml/jark-stack/terraform/monitoring/serviceMonitor.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: ray-head-monitor - namespace: kube-prometheus-stack - labels: - # `release: $HELM_RELEASE`: Prometheus can only detect ServiceMonitor with this label. - release: kube-prometheus-stack -spec: - jobLabel: ray-head - # Only select Kubernetes Services in the "default" namespace. 
- namespaceSelector: - matchNames: - - rayserve-vllm - # Only select Kubernetes Services with "matchLabels". - selector: - matchLabels: - ray.io/node-type: head - # A list of endpoints allowed as part of this ServiceMonitor. - endpoints: - - port: metrics - - port: as-metrics # autoscaler metrics - - port: dash-metrics # dashboard metrics - targetLabels: - - ray.io/cluster diff --git a/ai-ml/jark-stack/terraform/outputs.tf b/ai-ml/jark-stack/terraform/outputs.tf deleted file mode 100644 index 5771ae141..000000000 --- a/ai-ml/jark-stack/terraform/outputs.tf +++ /dev/null @@ -1,9 +0,0 @@ -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}" -} - -output "grafana_secret_name" { - description = "The name of the secret containing the Grafana admin password." - value = aws_secretsmanager_secret.grafana.name -} diff --git a/ai-ml/jark-stack/terraform/versions.tf b/ai-ml/jark-stack/terraform/versions.tf deleted file mode 100644 index e24e99c1f..000000000 --- a/ai-ml/jark-stack/terraform/versions.tf +++ /dev/null @@ -1,33 +0,0 @@ -terraform { - required_version = ">= 1.0.0" - - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 3.72" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.10" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.4.1" - } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.14" - } - random = { - source = "hashicorp/random" - version = ">= 3.6.0" # Replace with the appropriate version of the random provider - } - } - - # ## Used for end-to-end testing on project; update to suit your needs - # backend "s3" { - # bucket = "doeks-github-actions-e2e-test-state" - # region = "us-west-2" - # key = "e2e/jark/terraform.tfstate" - # } -} diff --git a/ai-ml/jark-stack/terraform/vpc.tf 
b/ai-ml/jark-stack/terraform/vpc.tf deleted file mode 100644 index 59c3da89c..000000000 --- a/ai-ml/jark-stack/terraform/vpc.tf +++ /dev/null @@ -1,53 +0,0 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - -#--------------------------------------------------------------- -# VPC -#--------------------------------------------------------------- -# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. 
-# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements - -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" - - name = local.name - cidr = var.vpc_cidr - azs = local.azs - - # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods - secondary_cidr_blocks = var.secondary_cidr_blocks - - # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods - # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc. - private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) - - # ------------------------------ - # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments - # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = local.public_subnets - enable_nat_gateway = true - single_nat_gateway = true - #------------------------------- - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - # Tags subnets for Karpenter auto-discovery - "karpenter.sh/discovery" = local.name - } - - tags = local.tags -} diff --git a/ai-ml/mlflow/addons.tf b/ai-ml/mlflow/addons.tf deleted file mode 100644 index 5e3a7beb1..000000000 --- a/ai-ml/mlflow/addons.tf +++ /dev/null @@ -1,431 +0,0 @@ -#--------------------------------------------------------------- -# IRSA for EBS CSI Driver -#--------------------------------------------------------------- -module "ebs_csi_driver_irsa" { - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.20" - role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") - attach_ebs_csi_policy = true - oidc_providers = { - main = { - 
provider_arn = module.eks.oidc_provider_arn - namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - tags = local.tags -} -#--------------------------------------------------------------- -# EKS Blueprints Kubernetes Addons -#--------------------------------------------------------------- -module "eks_blueprints_addons" { - # Short commit hash from 8th May using git rev-parse --short HEAD - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.3" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - #--------------------------------------- - # Amazon EKS Managed Add-ons - #--------------------------------------- - eks_addons = { - aws-ebs-csi-driver = { - service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn - } - coredns = { - preserve = true - } - vpc-cni = { - preserve = true - } - kube-proxy = { - preserve = true - } - } - - #--------------------------------------------------------------- - # CoreDNS Autoscaler helps to scale for large EKS Clusters - # Further tuning for CoreDNS is to leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/ - #--------------------------------------------------------------- - enable_cluster_proportional_autoscaler = true - cluster_proportional_autoscaler = { - values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", { - target = "deployment/coredns" - })] - description = "Cluster Proportional Autoscaler for CoreDNS Service" - } - - #--------------------------------------- - # Metrics Server - #--------------------------------------- - enable_metrics_server = true - metrics_server = { - values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] - } - - #--------------------------------------- - # Cluster Autoscaler - #--------------------------------------- - 
enable_cluster_autoscaler = true - cluster_autoscaler = { - timeout = "300" - values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", { - aws_region = var.region, - eks_cluster_id = module.eks.cluster_name - })] - } - - #--------------------------------------- - # AWS for FluentBit - DaemonSet - #--------------------------------------- - enable_aws_for_fluentbit = true - aws_for_fluentbit_cw_log_group = { - use_name_prefix = false - name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group - retention_in_days = 30 - } - aws_for_fluentbit = { - s3_bucket_arns = [ - module.fluentbit_s3_bucket.s3_bucket_arn, - "${module.fluentbit_s3_bucket.s3_bucket_arn}/*" - ] - values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", { - region = local.region, - cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs" - s3_bucket_name = module.fluentbit_s3_bucket.s3_bucket_id - cluster_name = module.eks.cluster_name - })] - } - - #--------------------------------------- - # Karpenter Autoscaler for EKS Cluster - #--------------------------------------- - enable_karpenter = true - karpenter_enable_spot_termination = true - karpenter_node = { - iam_role_additional_policies = { - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - karpenter = { - chart_version = "v0.34.0" - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - #--------------------------------------- - # AWS Load Balancer Controller - #--------------------------------------- - enable_aws_load_balancer_controller = true - aws_load_balancer_controller = { - set = [{ - name = "enableServiceMutatorWebhook" - value = "false" - }] - } - - #--------------------------------------- - # Ingress Nginx Add-on - #--------------------------------------- - enable_ingress_nginx = true - ingress_nginx = { - values 
= [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] - } - - #--------------------------------------- - # Prommetheus and Grafana stack - #--------------------------------------- - #--------------------------------------------------------------- - # Install Kafka Monitoring Stack with Prometheus and Grafana - # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` - # 2- Grafana Admin user: admin - # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` - #--------------------------------------------------------------- - enable_kube_prometheus_stack = true - kube_prometheus_stack = { - values = [ - var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", { - region = local.region - amp_sa = local.amp_ingest_service_account - amp_irsa = module.amp_ingest_irsa[0].iam_role_arn - amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write" - amp_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}" - storage_class_type = kubernetes_storage_class.ebs_csi_encrypted_gp3_storage_class.id - }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {}) - ] - chart_version = "48.1.1" - set_sensitive = [ - { - name = "grafana.adminPassword" - value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string - } - ], - } - - tags = local.tags - -} - - -#--------------------------------------------------------------- -# Data on EKS Kubernetes Addons -#--------------------------------------------------------------- -module "eks_data_addons" { - source = "aws-ia/eks-data-addons/aws" - version = "1.33.0" # ensure to update this to the latest/desired version - - oidc_provider_arn = 
module.eks.oidc_provider_arn - - #--------------------------------------------------------------- - # MLflow Tracking Add-on - #--------------------------------------------------------------- - - enable_mlflow_tracking = true - mlflow_tracking_helm_config = { - mlflow_namespace = try(kubernetes_namespace_v1.mlflow[0].metadata[0].name, local.mlflow_namespace) - - values = [templatefile("${path.module}/helm-values/mlflow-tracking-values.yaml", { - mlflow_sa = local.mlflow_service_account - mlflow_irsa = module.mlflow_irsa[0].iam_role_arn - # MLflow Postgres RDS Config - mlflow_db_username = local.mlflow_name - mlflow_db_password = try(sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string), "") - mlflow_db_name = try(module.db[0].db_instance_name, "") - mlflow_db_host = try(element(split(":", module.db[0].db_instance_endpoint), 0), "") - # S3 bucket config for artifacts - s3_bucket_name = try(module.mlflow_s3_bucket[0].s3_bucket_id, "") - })] - } - - #--------------------------------------------------------------- - # NVIDIA GPU Operator Add-on - #--------------------------------------------------------------- - enable_nvidia_gpu_operator = true - nvidia_gpu_operator_helm_config = { - values = [templatefile("${path.module}/helm-values/nvidia-values.yaml", {})] - } - - #--------------------------------------- - # Deploying Karpenter resources(Nodepool and NodeClass) with Helm Chart - #--------------------------------------- - enable_karpenter_resources = true - # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: - # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] - karpenter_resources_helm_config = { - gpu-g5 = { - values = [ - <<-EOT - name: gpu-g5 - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: 
${module.eks.cluster_name}-node - blockDevice: - deviceName: /dev/xvda - volumeSize: 500Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - nodePool: - labels: - - instanceType: gp5 - - provisionerType: Karpenter - taints: - - key: nvidia.com/gpu - operator: "Exists" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["g5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["on-demand"] - limits: - cpu: 1000 - amiFamily: Ubuntu - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 - EOT - ] - } - default = { - values = [ - <<-EOT - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - blockDevice: - deviceName: /dev/xvda - volumeSize: 200Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - nodePool: - labels: - - instanceType: mixed-x86 - - provisionerType: Karpenter - - workload: mlflow - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["c5", "m5", "r5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 - EOT - ] - } - } -} - -#--------------------------------------------------------------- -# 
Ingress Nginx external security groups -#--------------------------------------------------------------- -resource "aws_security_group" "ingress_nginx_external" { - name = "ingress-nginx-external" - description = "Allow public HTTP and HTTPS traffic" - vpc_id = module.vpc.vpc_id - - ingress { - from_port = 80 - to_port = 80 - protocol = "tcp" - cidr_blocks = ["0.0.0.0/0"] # modify to your requirements - } - - ingress { - from_port = 443 - to_port = 443 - protocol = "tcp" - cidr_blocks = ["0.0.0.0/0"] # modify to your requirements - } - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } - - tags = local.tags -} - -#--------------------------------------------------------------- -# Grafana Admin credentials resources -#--------------------------------------------------------------- -data "aws_secretsmanager_secret_version" "admin_password_version" { - secret_id = aws_secretsmanager_secret.grafana.id - depends_on = [aws_secretsmanager_secret_version.grafana] -} - -resource "random_password" "grafana" { - length = 16 - special = true - override_special = "@_" -} - -#tfsec:ignore:aws-ssm-secret-use-customer-key -resource "aws_secretsmanager_secret" "grafana" { - name = "${local.name}-grafana" - recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy -} - -resource "aws_secretsmanager_secret_version" "grafana" { - secret_id = aws_secretsmanager_secret.grafana.id - secret_string = random_password.grafana.result -} - -#--------------------------------------------------------------- -# S3 log bucket for FluentBit -#--------------------------------------------------------------- -#tfsec:ignore:* -module "fluentbit_s3_bucket" { - source = "terraform-aws-modules/s3-bucket/aws" - version = "~> 3.0" - - bucket_prefix = "${local.name}-fluentbit-logs-" - # For example only - please evaluate for your environment - force_destroy = true - server_side_encryption_configuration = { - rule = { - 
apply_server_side_encryption_by_default = { - sse_algorithm = "AES256" - } - } - } - - tags = local.tags -} - -#--------------------------------------------------------------- -# GP3 Encrypted Storage Class -#--------------------------------------------------------------- - -resource "kubernetes_annotations" "gp2_default" { - annotations = { - "storageclass.kubernetes.io/is-default-class" : "false" - } - api_version = "storage.k8s.io/v1" - kind = "StorageClass" - metadata { - name = "gp2" - } - force = true - - depends_on = [module.eks] -} - -resource "kubernetes_storage_class" "ebs_csi_encrypted_gp3_storage_class" { - metadata { - name = "gp3" - annotations = { - "storageclass.kubernetes.io/is-default-class" : "true" - } - } - - storage_provisioner = "ebs.csi.aws.com" - reclaim_policy = "Delete" - allow_volume_expansion = true - volume_binding_mode = "WaitForFirstConsumer" - parameters = { - fsType = "xfs" - encrypted = true - type = "gp3" - } - - depends_on = [kubernetes_annotations.gp2_default] -} diff --git a/ai-ml/mlflow/amp.tf b/ai-ml/mlflow/amp.tf deleted file mode 100644 index 14b47ba4c..000000000 --- a/ai-ml/mlflow/amp.tf +++ /dev/null @@ -1,136 +0,0 @@ -#------------------------------------------ -# Amazon Prometheus -#------------------------------------------ -locals { - amp_ingest_service_account = "amp-iamproxy-ingest-service-account" - amp_namespace = "kube-prometheus-stack" -} - -resource "aws_prometheus_workspace" "amp" { - count = var.enable_amazon_prometheus ? 1 : 0 - - alias = format("%s-%s", "amp-ws", local.name) - tags = local.tags -} -#IAM Policy for Amazon Prometheus & Grafana -resource "aws_iam_policy" "grafana" { - count = var.enable_amazon_prometheus ? 1 : 0 - - description = "IAM policy for Grafana Pod" - name_prefix = format("%s-%s-", local.name, "grafana") - path = "/" - policy = data.aws_iam_policy_document.grafana[0].json -} - -data "aws_iam_policy_document" "grafana" { - count = var.enable_amazon_prometheus ? 
1 : 0 - - statement { - sid = "AllowReadingMetricsFromCloudWatch" - effect = "Allow" - resources = ["*"] - - actions = [ - "cloudwatch:DescribeAlarmsForMetric", - "cloudwatch:ListMetrics", - "cloudwatch:GetMetricData", - "cloudwatch:GetMetricStatistics" - ] - } - - statement { - sid = "AllowGetInsightsCloudWatch" - effect = "Allow" - resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"] - - actions = [ - "cloudwatch:GetInsightRuleReport", - ] - } - - statement { - sid = "AllowReadingAlarmHistoryFromCloudWatch" - effect = "Allow" - resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"] - - actions = [ - "cloudwatch:DescribeAlarmHistory", - "cloudwatch:DescribeAlarms", - ] - } - - statement { - sid = "AllowReadingLogsFromCloudWatch" - effect = "Allow" - resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"] - - actions = [ - "logs:DescribeLogGroups", - "logs:GetLogGroupFields", - "logs:StartQuery", - "logs:StopQuery", - "logs:GetQueryResults", - "logs:GetLogEvents", - ] - } - - statement { - sid = "AllowReadingTagsInstancesRegionsFromEC2" - effect = "Allow" - resources = ["*"] - - actions = [ - "ec2:DescribeTags", - "ec2:DescribeInstances", - "ec2:DescribeRegions", - ] - } - - statement { - sid = "AllowReadingResourcesForTags" - effect = "Allow" - resources = ["*"] - actions = ["tag:GetResources"] - } - - statement { - sid = "AllowListApsWorkspaces" - effect = "Allow" - resources = [ - "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*", - "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*", - "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*", - ] - actions = [ - "aps:ListWorkspaces", - "aps:DescribeWorkspace", - "aps:GetMetricMetadata", - "aps:GetSeries", - "aps:QueryMetrics", - "aps:RemoteWrite", - "aps:GetLabels" - ] - } -} - -module "amp_ingest_irsa" { - count 
= var.enable_amazon_prometheus ? 1 : 0 - - source = "aws-ia/eks-blueprints-addon/aws" - version = "~> 1.0" - create_release = false - create_role = true - create_policy = false - role_name = format("%s-%s", local.name, "amp-ingest") - role_policies = { amp_policy = aws_iam_policy.grafana[0].arn } - - oidc_providers = { - this = { - provider_arn = module.eks.oidc_provider_arn - namespace = local.amp_namespace - service_account = local.amp_ingest_service_account - } - } - - tags = local.tags -} diff --git a/ai-ml/mlflow/eks.tf b/ai-ml/mlflow/eks.tf deleted file mode 100644 index 15fa077d1..000000000 --- a/ai-ml/mlflow/eks.tf +++ /dev/null @@ -1,118 +0,0 @@ -#--------------------------------------------------------------- -# EKS Cluster -#--------------------------------------------------------------- -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.15" - - cluster_name = local.name - cluster_version = var.eks_cluster_version - - cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. 
- - vpc_id = module.vpc.vpc_id - - subnet_ids = module.vpc.private_subnets - - manage_aws_auth_configmap = true - aws_auth_roles = [ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - { - rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - } - ] - - #--------------------------------------- - # Note: This can further restricted to specific required for each Add-on and your application - #--------------------------------------- - # Extend cluster security group rules - cluster_security_group_additional_rules = { - ingress_nodes_ephemeral_ports_tcp = { - description = "Nodes on ephemeral ports" - protocol = "tcp" - from_port = 1025 - to_port = 65535 - type = "ingress" - source_node_security_group = true - } - } - - # Extend node-to-node security group rules - node_security_group_additional_rules = { - ingress_self_all = { - description = "Node to node all ports/protocols" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - self = true - } - # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. - # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc. 
- # Change this according to your security requirements if needed - ingress_cluster_to_node_all_traffic = { - description = "Cluster API to Nodegroup all traffic" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - source_cluster_security_group = true - } - } - - eks_managed_node_group_defaults = { - iam_role_additional_policies = { - # Not required, but used in the example to access the nodes to inspect mounted volumes - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - - eks_managed_node_groups = { - # We recommend to have a MNG to place your critical workloads and add-ons - # Then rely on Karpenter to scale your workloads - # You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners - core_node_group = { - name = "core-node-group" - description = "EKS Core node group for hosting critical add-ons" - # Filtering only Secondary CIDR private subnets starting with "100.". - # Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null] - ) - - min_size = 3 - max_size = 9 - desired_size = 3 - - instance_types = ["m5.xlarge"] - - ebs_optimized = true - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - - labels = { - Environment = "preprod" - Zone = "test" - WorkerType = "ON_DEMAND" - NodeGroupType = "core" - } - - tags = merge(local.tags, { - Name = "core-node-grp", - "karpenter.sh/discovery" = local.name - }) - } - } -} diff --git a/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml b/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml deleted file mode 100644 index 82a654554..000000000 --- a/ai-ml/mlflow/helm-values/aws-for-fluentbit-values.yaml +++ /dev/null @@ -1,102 +0,0 @@ -global: - -#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server -# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata -hostNetwork: true -dnsPolicy: ClusterFirstWithHostNet - -service: - parsersFiles: - - /fluent-bit/parsers/parsers.conf - extraParsers: | - [PARSER] - Name kubernetes - Format regex - Regex ^(?[^_]+)\.(?.+)\.(?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?[a-z0-9]{64})-$ - -input: - name: "tail" - enabled: true - tag: "systempods....-" - path: "/var/log/containers/*.log" - db: "/var/log/flb_kube.db" - memBufLimit: 5MB - skipLongLines: "On" - refreshInterval: 10 - extraInputs: | - multiline.parser docker, cri - Tag_Regex (?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)-(?[a-z0-9]{64})\.log$ - - -# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters -filter: - name: "kubernetes" - match: "systempods.*" - kubeURL: "https://kubernetes.default.svc.cluster.local:443" - mergeLog: "On" - mergeLogKey: "log_processed" - keepLog: "On" - k8sLoggingParser: "On" - 
k8sLoggingExclude: "Off" - bufferSize: "0" - extraFilters: | - Kube_Tag_Prefix systempods. - Regex_Parser kubernetes - Labels On - Annotations Off - Use_Kubelet true - Kubelet_Port 10250 - Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token - -# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance. -# cloudWatch: -# enabled: false - -# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch -cloudWatchLogs: - enabled: true - match: "systempods.*" - region: ${region} - logGroupName: ${cloudwatch_log_group} - autoCreateGroup: false - extraOutputs: | - log_key log - -#----------------------------------------------------------# -# OUTPUT logs to S3 -#----------------------------------------------------------# - -# This is an example for writing logs to S3 bucket. -# This example writes system pod logs and spark logs into dedicated prefix. -# This second output is using the rewrite_tag filter commented above - -additionalOutputs: | - [OUTPUT] - Name s3 - Match systempods.* - region ${region} - bucket ${s3_bucket_name} - total_file_size 100M - s3_key_format /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log - s3_key_format_tag_delimiters .. - store_dir /home/ec2-user/buffer - upload_timeout 10m - log_key log - - -# Resource config for large clusters -resources: - limits: - cpu: 1000m - memory: 1500Mi - requests: - cpu: 500m - memory: 500Mi - -## Assign a PriorityClassName to pods if set -priorityClassName: system-node-critical - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. 
-tolerations: - - operator: Exists diff --git a/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml deleted file mode 100644 index 5a42794f2..000000000 --- a/ai-ml/mlflow/helm-values/cluster-autoscaler-values.yaml +++ /dev/null @@ -1,25 +0,0 @@ -autoDiscovery: - clusterName: ${eks_cluster_id} - -awsRegion: ${aws_region} - -cloudProvider: aws - -extraArgs: - aws-use-static-instance-list: true - -# Best practice to update the resource requests and limits for each add-on -resources: - limits: - cpu: 1000m - memory: 1G - requests: - cpu: 200m - memory: 512Mi - -# Best practice to updateStrategy for each add-on -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 diff --git a/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml b/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml deleted file mode 100644 index 64cb540bf..000000000 --- a/ai-ml/mlflow/helm-values/coredns-autoscaler-values.yaml +++ /dev/null @@ -1,40 +0,0 @@ -nameOverride: kube-dns-autoscaler - -# Formula for controlling the replicas. Adjust according to your needs -# replicas = max( ceil( cores * 1/coresPerReplica ) , ceil( nodes * 1/nodesPerReplica ) ) -# replicas = min(replicas, max) -# replicas = max(replicas, min) -config: - linear: - coresPerReplica: 256 - nodesPerReplica: 16 - min: 1 - max: 100 - preventSinglePointFailure: true - includeUnschedulableNodes: true - -# Target to scale. In format: deployment/*, replicationcontroller/* or replicaset/* (not case sensitive). 
-options: - target: ${target} - -serviceAccount: - create: true - name: kube-dns-autoscaler - -podSecurityContext: - seccompProfile: - type: RuntimeDefault - supplementalGroups: [ 65534 ] - fsGroup: 65534 - -resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - -tolerations: - - key: "CriticalAddonsOnly" - operator: "Exists" diff --git a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml b/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml deleted file mode 100644 index c8b1a5d74..000000000 --- a/ai-ml/mlflow/helm-values/ingress-nginx-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -controller: - service: - externalTrafficPolicy: "Local" - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Private Load Balancer can only be accessed within the VPC - targetPorts: - http: http - https: http diff --git a/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml deleted file mode 100644 index cc7687163..000000000 --- a/ai-ml/mlflow/helm-values/kube-prometheus-amp-enable.yaml +++ /dev/null @@ -1,65 +0,0 @@ -prometheus: - serviceAccount: - create: true - name: ${amp_sa} - annotations: - eks.amazonaws.com/role-arn: ${amp_irsa} - prometheusSpec: - remoteWrite: - - url: ${amp_remotewrite_url} - sigv4: - region: ${region} - queueConfig: - maxSamplesPerSend: 1000 - maxShards: 200 - capacity: 2500 - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: ${storage_class_type} - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - # Scrape metrics for Yunikorn add-on - additionalScrapeConfigs: - - job_name: yunikorn - honor_labels: 
true - scrape_interval: 1m - scrape_timeout: 10s - metrics_path: /ws/v1//metrics - scheme: http - dns_sd_configs: - - names: - - yunikorn-service.yunikorn.svc - type: 'A' - port: 9080 -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true -# Adding AMP datasource to Grafana config - serviceAccount: - create: false - name: ${amp_sa} - grafana.ini: - auth: - sigv4_auth_enabled: true - additionalDataSources: - - name: AMP - editable: true - jsonData: - sigV4Auth: true - sigV4Region: ${region} - type: prometheus - isDefault: false - url: ${amp_url} diff --git a/ai-ml/mlflow/helm-values/kube-prometheus.yaml b/ai-ml/mlflow/helm-values/kube-prometheus.yaml deleted file mode 100644 index dedff553b..000000000 --- a/ai-ml/mlflow/helm-values/kube-prometheus.yaml +++ /dev/null @@ -1,36 +0,0 @@ -prometheus: - prometheusSpec: - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: ${storage_class_type} - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - # Scrape metrics for Yunikorn add-on - additionalScrapeConfigs: - - job_name: yunikorn - honor_labels: true - scrape_interval: 1m - scrape_timeout: 10s - metrics_path: /ws/v1//metrics - scheme: http - dns_sd_configs: - - names: - - yunikorn-service.yunikorn.svc - type: 'A' - port: 9080 -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true diff --git a/ai-ml/mlflow/helm-values/metrics-server-values.yaml b/ai-ml/mlflow/helm-values/metrics-server-values.yaml deleted file mode 100644 index bc806ced6..000000000 --- a/ai-ml/mlflow/helm-values/metrics-server-values.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# HA config for metrics-server -image: - repository: registry.k8s.io/metrics-server/metrics-server - pullPolicy: IfNotPresent - -serviceAccount: - create: true - name: metrics-server - -rbac: - create: true - pspEnabled: 
false - -apiService: - create: true - -podLabels: - k8s-app: metrics-server - -# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true -replicas: 2 - -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 - -podDisruptionBudget: - enabled: true - minAvailable: 1 - -defaultArgs: - - --cert-dir=/tmp - - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname - - --kubelet-use-node-status-port - - --metric-resolution=15s - -resources: - requests: - cpu: 200m - memory: 512Mi - -affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - k8s-app: metrics-server - namespaces: - - kube-system - topologyKey: kubernetes.io/hostname diff --git a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml b/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml deleted file mode 100644 index 1f604f610..000000000 --- a/ai-ml/mlflow/helm-values/mlflow-tracking-values.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Default values for mlflow-tracking-server. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -image: - repository: public.ecr.aws/data-on-eks/mlflow - pullPolicy: Always - tag: 2.7.1 - -imagePullSecrets: [] - -nameOverride: mlflow-tracking-server - -fullnameOverride: mlflow-tracking-server - -podAnnotations: {} - -replicaCount: 1 - -service: - type: ClusterIP - port: 5000 - -serviceAccount: - # Specifies whether a service account should be created - create: false - # Annotations to add to the service account - annotations: - eks.amazonaws.com/role-arn: ${mlflow_irsa} - labels: {} - # The name of the service account to use. 
- # If not set and create is true, a name is generated using the fullname template - name: ${mlflow_sa} - -ingress: - enabled: true - className: nginx - annotations: - kubernetes.io/ingress.class: nginx - nginx.ingress.kubernetes.io/use-regex: "true" - hosts: - - host: - paths: - - path: / - pathType: Prefix - tls: [] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - -mlflow: - artifacts: - bucketName: ${s3_bucket_name} - database: - name: ${mlflow_db_name} - username: ${mlflow_db_username} - password: ${mlflow_db_password} - host: ${mlflow_db_host} - port: 5432 - -podSecurityContext: {} - # fsGroup: 2000 - -securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - -resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - -nodeSelector: {} - -tolerations: [] - -affinity: {} diff --git a/ai-ml/mlflow/helm-values/nvidia-values.yaml b/ai-ml/mlflow/helm-values/nvidia-values.yaml deleted file mode 100644 index 60078daa6..000000000 --- a/ai-ml/mlflow/helm-values/nvidia-values.yaml +++ /dev/null @@ -1,97 +0,0 @@ -# Default values for gpu-operator. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. 
- -daemonsets: - labels: {} - annotations: {} - priorityClassName: system-node-critical - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes - -validator: - repository: nvcr.io/nvidia/cloud-native - image: gpu-operator-validator - -operator: - repository: nvcr.io/nvidia - priorityClassName: system-node-critical - defaultRuntime: containerd - image: gpu-operator - cleanupCRD: false # This option doesn't do anything even if you change this to true. NVIDIA recommends to use the manual approach of upgrading the CRDs - upgradeCRD: false - resources: - limits: - cpu: 500m - memory: 350Mi - requests: - cpu: 200m - memory: 100Mi - -mig: - strategy: single - -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/install-precompiled-signed-drivers.html -# Currently NVIDIA Operator takes more than 5 mins to make the node GPU ready with all the required drivers. -# With pre-compiled NVIDIA Drivers this process can be faster hence we are using the config values as driver.version: "515-signed" -driver: - enabled: true - repository: nvcr.io/nvidia - image: driver - # Commented this as latest Ubuntu AMIs are failing with this option enabled - # version: "515-signed" # supported DRIVER_BRANCH value currently are 470, 510 and 515 which will install latest drivers available on that branch for current running kernel version. 
- manager: - image: k8s-driver-manager - repository: nvcr.io/nvidia/cloud-native - -toolkit: - enabled: true - -devicePlugin: - enabled: true - -dcgm: - enabled: false - -dcgmExporter: - enabled: true - -gfd: - enabled: true - -migManager: - enabled: true - -nodeStatusExporter: - enabled: false - -gds: - enabled: false - -vgpuManager: - enabled: false - -vgpuDeviceManager: - enabled: true - -vfioManager: - enabled: true - -sandboxDevicePlugin: - enabled: true - -node-feature-discovery: - enableNodeFeatureApi: true - worker: - tolerations: - - key: "node-role.kubernetes.io/master" - operator: "Equal" - value: "" - effect: "NoSchedule" - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes diff --git a/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml b/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml deleted file mode 100644 index 73e3802df..000000000 --- a/ai-ml/mlflow/karpenter-provisioners/00-karpenter-provisioner-cpu.yaml +++ /dev/null @@ -1,57 +0,0 @@ ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: default -spec: - # Which AWS Node Template to pick - providerRef: - name: default - - # ttlSecondsAfterEmpty: 30 - - # Requirements that constrain the parameters of provisioned nodes. - # These requirements are combined with pod.spec.affinity.nodeAffinity rules. 
- # Operators { In, not in } are supported to enable including or excluding values - requirements: - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["c", "m", "r"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "32"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" # If not included, the webhook for the AWS cloud provider will default to on-demand - operator: In - values: ["on-demand", "spot"] - limits: - resources: - cpu: 20 # CPU Cores across all instances - memory: 2000Gi - - # Enables consolidation which attempts to reduce cluster cost by both removing un-needed nodes and down-sizing those - # that can't be removed. Mutually exclusive with the ttlSecondsAfterEmpty parameter. - consolidation: - enabled: true ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: default -spec: - subnetSelector: - Name: ${cluster_name}-private* # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: ${cluster_name}-node* # name of the SecurityGroup to be used with Nodes - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - tags: - managed-by: "karpenter" - intent: "apps" - Name: "karpenter-node-default" diff --git a/ai-ml/mlflow/main.tf b/ai-ml/mlflow/main.tf deleted file mode 100644 index a5e4360ea..000000000 --- a/ai-ml/mlflow/main.tf +++ /dev/null @@ -1,65 +0,0 @@ -provider "aws" { - region = local.region -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider 
"helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} - -provider "kubectl" { - apply_retry_count = 10 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - token = data.aws_eks_cluster_auth.this.token -} - -data "aws_availability_zones" "available" {} -data "aws_caller_identity" "current" {} -data "aws_partition" "current" {} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -#--------------------------------------------------------------- -# Local variables -#--------------------------------------------------------------- -locals { - name = var.name - region = var.region - vpc_cidr = var.vpc_cidr - azs = slice(data.aws_availability_zones.available.names, 0, 2) - account_id = data.aws_caller_identity.current.account_id - partition = data.aws_partition.current.partition - - mlflow_name = "mlflow" - mlflow_namespace = "mlflow" - mlflow_service_account = "mlflow" - - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} diff --git a/ai-ml/mlflow/mlflow-core.tf b/ai-ml/mlflow/mlflow-core.tf deleted file mode 100644 index c3ca28e76..000000000 --- a/ai-ml/mlflow/mlflow-core.tf +++ /dev/null @@ -1,245 +0,0 @@ -#--------------------------------------------------------------- -# RDS Postgres Database for MLflow Backend -#--------------------------------------------------------------- -module "db" { - count = var.enable_mlflow_tracking ? 
1 : 0 - source = "terraform-aws-modules/rds/aws" - version = "~> 5.0" - - identifier = local.mlflow_name - - engine = "postgres" - engine_version = "14.3" - family = "postgres14" - major_engine_version = "14" - instance_class = "db.m6i.xlarge" - - storage_type = "io1" - allocated_storage = 100 - iops = 3000 - - db_name = local.mlflow_name - username = local.mlflow_name - create_random_password = false - password = sensitive(aws_secretsmanager_secret_version.postgres[0].secret_string) - port = 5432 - - multi_az = true - db_subnet_group_name = module.vpc.database_subnet_group - vpc_security_group_ids = [module.security_group[0].security_group_id] - - maintenance_window = "Mon:00:00-Mon:03:00" - backup_window = "03:00-06:00" - enabled_cloudwatch_logs_exports = ["postgresql", "upgrade"] - create_cloudwatch_log_group = true - - backup_retention_period = 5 - skip_final_snapshot = true - deletion_protection = false - - performance_insights_enabled = true - performance_insights_retention_period = 7 - create_monitoring_role = true - monitoring_interval = 60 - monitoring_role_name = "mlflow-backend" - monitoring_role_use_name_prefix = true - monitoring_role_description = "MLflow Postgres Backend for monitoring role" - - parameters = [ - { - name = "autovacuum" - value = 1 - }, - { - name = "client_encoding" - value = "utf8" - } - ] - - tags = local.tags -} - -#--------------------------------------------------------------- -# MLflow Postgres Backend DB Master password -#--------------------------------------------------------------- -resource "random_password" "postgres" { - count = var.enable_mlflow_tracking ? 1 : 0 - length = 16 - special = false -} -#tfsec:ignore:aws-ssm-secret-use-customer-key -resource "aws_secretsmanager_secret" "postgres" { - count = var.enable_mlflow_tracking ? 
1 : 0 - name = local.mlflow_name - recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy -} - -resource "aws_secretsmanager_secret_version" "postgres" { - count = var.enable_mlflow_tracking ? 1 : 0 - secret_id = aws_secretsmanager_secret.postgres[0].id - secret_string = random_password.postgres[0].result -} - -#--------------------------------------------------------------- -# PostgreSQL RDS security group -#--------------------------------------------------------------- -module "security_group" { - count = var.enable_mlflow_tracking ? 1 : 0 - source = "terraform-aws-modules/security-group/aws" - version = "~> 5.0" - - name = local.name - description = "Complete PostgreSQL example security group" - vpc_id = module.vpc.vpc_id - - # ingress - ingress_with_cidr_blocks = [ - { - from_port = 5432 - to_port = 5432 - protocol = "tcp" - description = "PostgreSQL access from within VPC" - cidr_blocks = "${module.vpc.vpc_cidr_block},${module.vpc.vpc_secondary_cidr_blocks[0]}" - }, - ] - - tags = local.tags -} - - -#--------------------------------------------------------------- -# S3 bucket for MLflow artifacts -#--------------------------------------------------------------- - -#tfsec:ignore:* -module "mlflow_s3_bucket" { - count = var.enable_mlflow_tracking ? 1 : 0 - source = "terraform-aws-modules/s3-bucket/aws" - version = "~> 3.0" - - bucket_prefix = "${local.name}-artifacts-" - - # For example only - please evaluate for your environment - force_destroy = true - - server_side_encryption_configuration = { - rule = { - apply_server_side_encryption_by_default = { - sse_algorithm = "AES256" - } - } - } - - tags = local.tags -} - -#--------------------------------------------------------------- -# MLflow Namespace -#--------------------------------------------------------------- -resource "kubernetes_namespace_v1" "mlflow" { - count = var.enable_mlflow_tracking ? 
1 : 0 - metadata { - name = local.mlflow_namespace - } - timeouts { - delete = "15m" - } -} - -resource "kubernetes_service_account_v1" "mlflow" { - count = var.enable_mlflow_tracking ? 1 : 0 - metadata { - name = local.mlflow_service_account - namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name - annotations = { "eks.amazonaws.com/role-arn" : module.mlflow_irsa[0].iam_role_arn } - } - - automount_service_account_token = true -} - -resource "kubernetes_secret_v1" "mlflow" { - count = var.enable_mlflow_tracking ? 1 : 0 - metadata { - name = "${local.mlflow_service_account}-secret" - namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name - annotations = { - "kubernetes.io/service-account.name" = kubernetes_service_account_v1.mlflow[0].metadata[0].name - "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.mlflow[0].metadata[0].name - } - } - - type = "kubernetes.io/service-account-token" -} - -# Create IAM Role for Service Account (IRSA) Only if MLflow is enabled -module "mlflow_irsa" { - count = var.enable_mlflow_tracking ? 
1 : 0 - - source = "aws-ia/eks-blueprints-addon/aws" - version = "~> 1.0" #ensure to update this to the latest/desired version - - # Disable helm release - create_release = false - - # IAM role for service account (IRSA) - create_role = true - create_policy = false # Policy is created in the next resource - - role_name = local.mlflow_service_account - role_policies = { mlflow_policy = aws_iam_policy.mlflow[0].arn } - - oidc_providers = { - this = { - provider_arn = module.eks.oidc_provider_arn - namespace = kubernetes_namespace_v1.mlflow[0].metadata[0].name - service_account = local.mlflow_service_account - } - } - - tags = local.tags -} - -#-------------------------------------------------------------------------- -# IAM policy for MLflow for accessing S3 artifacts and RDS Postgres backend -#-------------------------------------------------------------------------- -resource "aws_iam_policy" "mlflow" { - count = var.enable_mlflow_tracking ? 1 : 0 - - description = "IAM policy for MLflow" - name_prefix = format("%s-%s-", local.name, "mlflow") - path = "/" - policy = data.aws_iam_policy_document.mlflow[0].json -} - -data "aws_iam_policy_document" "mlflow" { - count = var.enable_mlflow_tracking ? 
1 : 0 - statement { - sid = "" - effect = "Allow" - resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}"] - - actions = [ - "s3:ListBucket" - ] - } - statement { - sid = "" - effect = "Allow" - resources = ["arn:${local.partition}:s3:::${module.mlflow_s3_bucket[0].s3_bucket_id}/*"] - - actions = [ - "s3:GetObject", - "s3:PutObject", - "s3:DeleteObject" - ] - } - statement { - sid = "" - effect = "Allow" - resources = ["arn:${local.partition}:rds-db:${local.region}:${local.account_id}:dbuser:${module.db[0].db_instance_name}/${local.mlflow_name}"] - - actions = [ - "rds-db:connect", - ] - } -} diff --git a/ai-ml/mlflow/outputs.tf b/ai-ml/mlflow/outputs.tf deleted file mode 100644 index b5db71900..000000000 --- a/ai-ml/mlflow/outputs.tf +++ /dev/null @@ -1,24 +0,0 @@ -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --alias ${module.eks.cluster_name} --name ${module.eks.cluster_name}" -} - -output "eks_api_server_url" { - description = "Your eks API server endpoint" - value = module.eks.cluster_endpoint -} - -output "grafana_secret_name" { - description = "Grafana password secret name" - value = aws_secretsmanager_secret.grafana.name -} - -output "mlflow_s3_artifacts" { - description = "S3 bucket for MLflow artifacts" - value = module.mlflow_s3_bucket[0].s3_bucket_id -} - -output "mlflow_db_backend" { - description = "Amazon RDS Postgres database for MLflow backend" - value = module.db[0].db_instance_endpoint -} diff --git a/ai-ml/mlflow/versions.tf b/ai-ml/mlflow/versions.tf deleted file mode 100644 index 156fc1e49..000000000 --- a/ai-ml/mlflow/versions.tf +++ /dev/null @@ -1,33 +0,0 @@ -terraform { - required_version = ">= 1.0.0" - - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 3.72" - } - kubernetes = { - source = 
"hashicorp/kubernetes" - version = ">= 2.10" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.4.1" - } - random = { - source = "hashicorp/random" - version = "3.3.2" - } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.14" - } - } - - # ## Used for end-to-end testing on project; update to suit your needs - # backend "s3" { - # bucket = "doeks-github-actions-e2e-test-state" - # region = "us-west-2" - # key = "e2e/mlflow/terraform.tfstate" - # } -} diff --git a/ai-ml/mlflow/vpc.tf b/ai-ml/mlflow/vpc.tf deleted file mode 100644 index 0aa8b7aab..000000000 --- a/ai-ml/mlflow/vpc.tf +++ /dev/null @@ -1,59 +0,0 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - - database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - -#--------------------------------------------------------------- -# VPC -#--------------------------------------------------------------- - -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" - - name = local.name - cidr = 
local.vpc_cidr - azs = local.azs - - # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods - secondary_cidr_blocks = var.secondary_cidr_blocks - - # Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB - private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) - - # ------------------------------ - # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments - # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = local.public_subnets - - # ------------------------------ - # Private Subnets for MLflow backend store - database_subnets = local.database_private_subnets - create_database_subnet_group = true - create_database_subnet_route_table = true - - enable_nat_gateway = true - single_nat_gateway = true - enable_dns_hostnames = true - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - # Tags subnets for Karpenter auto-discovery - "karpenter.sh/discovery" = local.name - } - - tags = local.tags -} diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf deleted file mode 100644 index 8d15a83bb..000000000 --- a/ai-ml/trainium-inferentia/addons.tf +++ /dev/null @@ -1,536 +0,0 @@ -#--------------------------------------------------------------- -# GP3 Encrypted Storage Class -#--------------------------------------------------------------- -resource "kubernetes_annotations" "disable_gp2" { - annotations = { - "storageclass.kubernetes.io/is-default-class" : "false" - } - api_version = "storage.k8s.io/v1" - kind = "StorageClass" - metadata { - name = "gp2" - } - force = true - - depends_on = [module.eks.eks_cluster_id] -} - -resource "kubernetes_storage_class_v1" "default_gp3" { - metadata { - name = "gp3" - annotations = { - "storageclass.kubernetes.io/is-default-class" : "true" - } - } - - storage_provisioner = 
"ebs.csi.aws.com" - reclaim_policy = "Delete" - allow_volume_expansion = true - volume_binding_mode = "WaitForFirstConsumer" - parameters = { - fsType = "xfs" - encrypted = true - type = "gp3" - } - - depends_on = [kubernetes_annotations.disable_gp2] -} - -#--------------------------------------------------------------- -# EKS Pod identiity association -#--------------------------------------------------------------- - -module "aws_ebs_csi_pod_identity" { - source = "terraform-aws-modules/eks-pod-identity/aws" - version = "~> 1.4.0" - - name = "aws-ebs-csi" - attach_aws_ebs_csi_policy = true - - # Pod Identity Associations - associations = { - ebs-csi-controller = { - namespace = "kube-system" - service_account = "ebs-csi-controller-sa" - cluster_name = module.eks.cluster_name - } - } - - tags = local.tags -} - -#--------------------------------------------------------------- -# EKS Blueprints Addons -#--------------------------------------------------------------- -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.16" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - #--------------------------------------- - # Amazon EKS Managed Add-ons - #--------------------------------------- - eks_addons = { - aws-ebs-csi-driver = {} - coredns = {} - eks-pod-identity-agent = {} - kube-proxy = {} - vpc-cni = {} - amazon-cloudwatch-observability = { - preserve = true - service_account_role_arn = aws_iam_role.cloudwatch_observability_role.arn - } - } - - #--------------------------------------- - # Kubernetes Add-ons - #--------------------------------------- - - #--------------------------------------- - # Metrics Server - #--------------------------------------- - enable_metrics_server = true - metrics_server = { - values = 
[templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] - } - - #--------------------------------------- - # Cluster Autoscaler - #--------------------------------------- - enable_cluster_autoscaler = true - cluster_autoscaler = { - values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", {})] - } - - #--------------------------------------- - # Enable FSx for Lustre CSI Driver - #--------------------------------------- - enable_aws_fsx_csi_driver = var.enable_fsx_for_lustre - aws_fsx_csi_driver = { - # INFO: fsx node daemonset won't be placed on Karpenter nodes with taints without the following toleration - values = [ - <<-EOT - node: - tolerations: - - operator: Exists - EOT - ] - } - - #--------------------------------------- - # AWS for FluentBit - DaemonSet - #--------------------------------------- - enable_aws_for_fluentbit = true - aws_for_fluentbit_cw_log_group = { - use_name_prefix = false - name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group - retention_in_days = 30 - } - aws_for_fluentbit = { - s3_bucket_arns = [ - module.s3_bucket.s3_bucket_arn, - "${module.s3_bucket.s3_bucket_arn}/*" - ] - values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", { - region = local.region, - cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs" - s3_bucket_name = module.s3_bucket.s3_bucket_id - cluster_name = module.eks.cluster_name - })] - } - - #--------------------------------------- - # Prommetheus and Grafana stack - #--------------------------------------- - #--------------------------------------------------------------- - # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` - # 2- Grafana Admin user: admin - # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id kafka-on-eks-grafana --region $AWS_REGION --query "SecretString" --output text` - 
#--------------------------------------------------------------- - enable_kube_prometheus_stack = true - kube_prometheus_stack = { - values = [templatefile("${path.module}/helm-values/kube-prometheus.yaml", { - storage_class_type = kubernetes_storage_class_v1.default_gp3.id - }) - ] - chart_version = "48.1.1" - set_sensitive = [ - { - name = "grafana.adminPassword" - value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string - } - ], - } - - #--------------------------------------- - # AWS Load Balancer Controller Add-on - #--------------------------------------- - enable_aws_load_balancer_controller = true - # turn off the mutating webhook for services because we are using - # service.beta.kubernetes.io/aws-load-balancer-type: external - aws_load_balancer_controller = { - set = [{ - name = "enableServiceMutatorWebhook" - value = "false" - }] - } - - #--------------------------------------- - # Ingress Nginx Add-on - #--------------------------------------- - enable_ingress_nginx = true - ingress_nginx = { - values = [templatefile("${path.module}/helm-values/ingress-nginx-values.yaml", {})] - } - - tags = local.tags -} - -#--------------------------------------------------------------- -# Data on EKS Kubernetes Addons -#--------------------------------------------------------------- -module "eks_data_addons" { - source = "aws-ia/eks-data-addons/aws" - version = "1.35.0" # ensure to update this to the latest/desired version - - oidc_provider_arn = module.eks.oidc_provider_arn - - enable_aws_neuron_device_plugin = true - - aws_neuron_device_plugin_helm_config = { - # Enable default scheduler - values = [ - <<-EOT - devicePlugin: - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: hub.jupyter.org/dedicated - operator: Exists - effect: NoSchedule - scheduler: - enabled: true - npd: - enabled: false - EOT - ] - } - - enable_aws_efa_k8s_device_plugin = true 
- - aws_efa_k8s_device_plugin_helm_config = { - version = "v0.5.3" - } - - #--------------------------------------- - # Volcano Scheduler for TorchX used in BERT-Large distributed training example - # Volcano is also a default scheduler for KubeRay Operator - #--------------------------------------- - enable_volcano = var.enable_volcano - - #--------------------------------------- - # Kuberay Operator - #--------------------------------------- - enable_kuberay_operator = var.enable_kuberay_operator - kuberay_operator_helm_config = { - version = "1.1.1" - # Enabling Volcano as Batch scheduler for KubeRay Operator - values = [ - <<-EOT - batchScheduler: - enabled: ${var.enable_volcano} - EOT - ] - } - - #--------------------------------------- - # JupyterHub Addon - #--------------------------------------- - enable_jupyterhub = var.enable_jupyterhub - jupyterhub_helm_config = { - values = [ - templatefile("${path.module}/helm-values/jupyterhub-values.yaml", { - jupyter_single_user_sa_name = "${module.eks.cluster_name}-jupyterhub-single-user" - }) - ] - } - - #--------------------------------------- - # Deploying Karpenter resources(Nodepool and NodeClass) with Helm Chart - #--------------------------------------- - enable_karpenter_resources = true - # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: - # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] - karpenter_resources_helm_config = { - trainium-trn1 = { - values = [ - <<-EOT - name: trainium-trn1 - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${module.karpenter.node_iam_role_name} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - id: ${module.eks.node_security_group_id} - tags: - Name: ${module.eks.cluster_name}-node - blockDevice: - deviceName: /dev/xvda - volumeSize: 500Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - amiSelectorTerms: - - alias: al2023@v20241024 - nodePool: - 
labels: - - instanceType: trainium-trn1 - - provisionerType: Karpenter - - hub.jupyter.org/node-purpose: user - - karpenterVersion: ${resource.helm_release.karpenter.version} - taints: - - key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["trn1"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 300s - expireAfter: 720h - weight: 100 - EOT - ] - } - inferentia-inf2 = { - values = [ - <<-EOT - name: inferentia-inf2 - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${module.karpenter.node_iam_role_name} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - id: ${module.eks.node_security_group_id} - tags: - Name: ${module.eks.cluster_name}-node - blockDevice: - deviceName: /dev/xvda - volumeSize: 500Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - amiSelectorTerms: - - alias: al2023@v20241024 - nodePool: - labels: - - instanceType: inferentia-inf2 - - provisionerType: Karpenter - - hub.jupyter.org/node-purpose: user - - karpenterVersion: ${resource.helm_release.karpenter.version} - taints: - - key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["inf2"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: 
"karpenter.sh/capacity-type" - operator: In - values: [ "on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 300s - expireAfter: 720h - weight: 100 - EOT - ] - } - default = { - values = [ - <<-EOT - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${module.karpenter.node_iam_role_name} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - id: ${module.eks.node_security_group_id} - tags: - Name: ${module.eks.cluster_name}-node - blockDevice: - deviceName: /dev/xvda - volumeSize: 200Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - amiSelectorTerms: - - alias: al2023@v20241024 - nodePool: - labels: - - instanceType: mixed-x86 - - provisionerType: Karpenter - - workload: rayhead - - karpenterVersion: ${resource.helm_release.karpenter.version} - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["c5", "m5", "r5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 300s - expireAfter: 720h - weight: 100 - EOT - ] - } - } -} - -#--------------------------------------------------------------- -# IAM Role for Amazon CloudWatch Observability -#--------------------------------------------------------------- -resource "aws_iam_role" "cloudwatch_observability_role" { - name_prefix = format("%s-%s", local.name, "cloudwatch-agent") - description = "The IAM role for amazon-cloudwatch-observability addon" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Action = "sts:AssumeRoleWithWebIdentity" - Effect = "Allow" - Principal = { - Federated = 
module.eks.oidc_provider_arn - } - Condition = { - StringEquals = { - "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:sub" : "system:serviceaccount:amazon-cloudwatch:cloudwatch-agent", - "${replace(module.eks.cluster_oidc_issuer_url, "https://", "")}:aud" : "sts.amazonaws.com" - } - } - } - ] - }) -} - -resource "aws_iam_role_policy_attachment" "cloudwatch_observability_policy_attachment" { - policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" - role = aws_iam_role.cloudwatch_observability_role.name -} - -#--------------------------------------------------------------- -# ETCD for TorchX -#--------------------------------------------------------------- -data "http" "torchx_etcd_yaml" { - url = "https://raw.githubusercontent.com/pytorch/torchx/main/resources/etcd.yaml" -} - -data "kubectl_file_documents" "torchx_etcd_yaml" { - content = data.http.torchx_etcd_yaml.response_body -} - -resource "kubectl_manifest" "torchx_etcd" { - for_each = var.enable_torchx_etcd ? 
data.kubectl_file_documents.torchx_etcd_yaml.manifests : {} - yaml_body = each.value - depends_on = [module.eks.eks_cluster_id] -} - -#--------------------------------------------------------------- -# Grafana Admin credentials resources -# Login to AWS secrets manager with the same role as Terraform to extract the Grafana admin password with the secret name as "grafana" -#--------------------------------------------------------------- -data "aws_secretsmanager_secret_version" "admin_password_version" { - secret_id = aws_secretsmanager_secret.grafana.id - depends_on = [aws_secretsmanager_secret_version.grafana] -} - -resource "random_password" "grafana" { - length = 16 - special = true - override_special = "@_" -} - -#tfsec:ignore:aws-ssm-secret-use-customer-key -resource "aws_secretsmanager_secret" "grafana" { - name_prefix = "${local.name}-oss-grafana" - recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy -} - -resource "aws_secretsmanager_secret_version" "grafana" { - secret_id = aws_secretsmanager_secret.grafana.id - secret_string = random_password.grafana.result -} - -#tfsec:ignore:* -module "s3_bucket" { - source = "terraform-aws-modules/s3-bucket/aws" - version = "~> 3.0" - - bucket_prefix = "${local.name}-logs-" - # For example only - please evaluate for your environment - force_destroy = true - - tags = local.tags -} - -#--------------------------------------------------------------- -# MPI Operator for distributed training on Trainium -#--------------------------------------------------------------- -data "http" "mpi_operator_yaml" { - url = "https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.4.0/deploy/v2beta1/mpi-operator.yaml" -} - -data "kubectl_file_documents" "mpi_operator_yaml" { - content = data.http.mpi_operator_yaml.response_body -} - -resource "kubectl_manifest" "mpi_operator" { - for_each = var.enable_mpi_operator ? 
data.kubectl_file_documents.mpi_operator_yaml.manifests : {} - yaml_body = each.value - depends_on = [module.eks.eks_cluster_id] -} diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf deleted file mode 100644 index 642fd472b..000000000 --- a/ai-ml/trainium-inferentia/eks.tf +++ /dev/null @@ -1,410 +0,0 @@ -#--------------------------------------------------------------- -# EKS Cluster -#--------------------------------------------------------------- -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 20.17" - - cluster_name = local.name - cluster_version = var.eks_cluster_version - - cluster_endpoint_public_access = true - - enable_efa_support = true - - # Gives Terraform identity admin access to cluster which will - # allow deploying resources (Karpenter) into the cluster - enable_cluster_creator_admin_permissions = true - - access_entries = var.access_entries - - vpc_id = module.vpc.vpc_id - # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) - - # Combine root account, current user/role and additinoal roles to be able to access the cluster KMS key - required for terraform updates - kms_key_administrators = distinct(concat([ - "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"], - var.kms_key_admin_roles, - [data.aws_iam_session_context.current.issuer_arn] - )) - - #--------------------------------------- - # Note: This can further restricted to specific required for each Add-on and your application - #--------------------------------------- - # Extend cluster security group rules - cluster_security_group_additional_rules = { - ingress_nodes_ephemeral_ports_tcp = { - description = "Nodes on ephemeral ports" - protocol = "tcp" - from_port = 0 - to_port = 65535 - type = "ingress" - source_node_security_group = true - } - } - - # security group rule from all ipv4 to nodes for port 22 - node_security_group_additional_rules = { - # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. - # This can be restricted further to specific port based on the requirement for each Add-on e.g., coreDNS 53, metrics-server 4443, spark-operator 8080, karpenter 8443 etc. - # Update this according to your security requirements if needed - ingress_cluster_to_node_all_traffic = { - description = "Cluster API to Nodegroup all traffic" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - source_cluster_security_group = true - } - } - - eks_managed_node_group_defaults = { - iam_role_additional_policies = { - # Not required, but used in the example to access the nodes to inspect mounted volumes - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - - ebs_optimized = true - # This block device is used only for root volume. Adjust volume according to your size. 
- # NOTE: Don't use this volume for ML workloads - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - } - - eks_managed_node_groups = { - # It's recommended to have a Managed Node group for hosting critical add-ons - # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups - # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes. - core_node_group = { - name = "core-node-group" - description = "EKS Core node group for hosting system add-ons" - # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - substr(cidr_block, 0, 4) == "100." ? subnet_id : null] - ) - - # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2/recommended/image_id --region us-west-2 - ami_type = "AL2_x86_64" # Use this for Graviton AL2_ARM_64 - min_size = 3 - max_size = 8 - desired_size = 3 - - instance_types = ["m5.2xlarge"] - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "core" - workload = "rayhead" - } - - tags = merge(local.tags, { - Name = "core-node-grp" - }) - } - - #-------------------------------------------------- - # Trainium node group for Trn1.32xlarge - #-------------------------------------------------- - # Trainium node group creation can take upto 6 mins - trn1-32xl-ng1 = { - name = "trn1-32xl-ng1" - description = "Tran1 32xlarge node group for hosting ML workloads" - # All trn1 instances should be launched into the same subnet in the preferred trn1 AZ - # The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf. 
- # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: - # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] - subnet_ids = [module.vpc.private_subnets[2]] - # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2 - # ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type - ami_type = "AL2_x86_64_GPU" # Contains Neuron driver - instance_types = ["trn1.32xlarge"] - - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - - # Install Neuron monitoring tools - yum install aws-neuronx-tools-2.* -y - export PATH=/opt/aws/neuron/bin:$PATH - - # Install latest version of aws cli - mkdir /awscli \ - && wget https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O /awscli/awscliv2.zip \ - && unzip /awscli/awscliv2.zip -d /awscli/ \ - && /awscli/aws/install --bin-dir /usr/local/bin --install-dir /usr/local/aws-cli --update \ - && rm -rf /awscli - EOT - - min_size = var.trn1_32xl_min_size - max_size = 4 - desired_size = var.trn1_32xl_desired_size - - # This will: - # 1. Create a placement group to place the instances close to one another - # 2. Ignore subnets that reside in AZs that do not support the instance type - # 3. 
Expose all of the available EFA interfaces on the launch template - enable_efa_support = true - - labels = { - "vpc.amazonaws.com/efa.present" = "true" - instance-type = "trn1-32xl" - provisioner = "cluster-autoscaler" - } - - taints = [ - { - key = "aws.amazon.com/neuron", - value = true, - effect = "NO_SCHEDULE" - } - ] - - tags = merge(local.tags, { - Name = "trn1-32xl-ng1", - }) - } - - #-------------------------------------------------- - # Trainium node group for Trn1n.32xlarge - #-------------------------------------------------- - trn1n-32xl-ng = { - name = "trn1n-32xl-ng" - description = "trn1n 32xlarge node group for hosting ML workloads" - # All trn1 instances should be launched into the same subnet in the preferred trn1 AZ - # The preferred AZ is the first AZ listed in the AZ id <-> region mapping in main.tf. - # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: - # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] - subnet_ids = [module.vpc.private_subnets[2]] - # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2 - # ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type - ami_type = "AL2_x86_64_GPU" # Contains Neuron driver - instance_types = ["trn1n.32xlarge"] - - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - - # Install Neuron monitoring tools - yum install aws-neuronx-tools-2.* -y - export PATH=/opt/aws/neuron/bin:$PATH - EOT - - min_size = var.trn1n_32xl_min_size - max_size = 2 - desired_size = var.trn1n_32xl_desired_size - - # This will: - # 1. Create a placement group to place the instances close to one another - # 2. Ignore subnets that reside in AZs that do not support the instance type - # 3. 
Expose all of the available EFA interfaces on the launch template - enable_efa_support = true - - labels = { - instance-type = "trn1n-32xl" - provisioner = "cluster-autoscaler" - "vpc.amazonaws.com/efa.present" = "true" - } - - taints = [ - { - key = "aws.amazon.com/neuron", - value = true, - effect = "NO_SCHEDULE" - } - ] - - tags = merge(local.tags, { - Name = "trn1n-32xl-ng1", - }) - } - - #-------------------------------------------------- - # Inferentia2 Spot node group - #-------------------------------------------------- - inf2-24xl-ng = { - name = "inf2-24xl-ng" - description = "inf2 24xl node group for ML inference workloads" - # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: - # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] - subnet_ids = [module.vpc.private_subnets[2]] - - # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2 - # ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type - ami_type = "AL2_x86_64_GPU" - capacity_type = "ON_DEMAND" # Use SPOT for Spot instances - instance_types = ["inf2.24xlarge"] - - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - - # Install Neuron monitoring tools - yum install aws-neuronx-tools-2.* -y - export PATH=/opt/aws/neuron/bin:$PATH - EOT - - min_size = var.inf2_24xl_min_size - max_size = 2 - desired_size = var.inf2_24xl_desired_size - - labels = { - instanceType = "inf2-24xl" - provisionerType = "cluster-autoscaler" - } - - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 500 - volume_type = "gp3" - } - } - } - - taints = [ - { - key = "aws.amazon.com/neuron", - value = "true", - effect = "NO_SCHEDULE" - } - ] - - tags = 
merge(local.tags, { - Name = "inf2-24xl-ng", - "karpenter.sh/discovery" = local.name - }) - } - - inf2-48xl-ng = { - name = "inf2-48xl-ng" - description = "inf2 48x large node group for ML inference workloads" - # We use index 2 to select the subnet in AZ1 with the 100.x CIDR: - # module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] - subnet_ids = [module.vpc.private_subnets[2]] - - # aws ssm get-parameters --names /aws/service/eks/optimized-ami/1.27/amazon-linux-2-gpu/recommended/image_id --region us-west-2 - # ami_id = "ami-0e0deb7ae582f6fe9" # Use this to pass custom AMI ID and ignore ami_type - ami_type = "AL2_x86_64_GPU" - capacity_type = "ON_DEMAND" # Use SPOT for Spot instances - instance_types = ["inf2.48xlarge"] - - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - - # Install Neuron monitoring tools - yum install aws-neuronx-tools-2.* -y - export PATH=/opt/aws/neuron/bin:$PATH - EOT - - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 500 - volume_type = "gp3" - } - } - } - - min_size = var.inf2_48xl_min_size - max_size = 2 - desired_size = var.inf2_48xl_desired_size - - labels = { - instanceType = "inf2-48xl" - provisionerType = "cluster-autoscaler" - } - - taints = [ - { - key = "aws.amazon.com/neuron", - value = true, - effect = "NO_SCHEDULE" - } - ] - - tags = merge(local.tags, { - Name = "inf2-48xl-ng", - }) - } - } - - tags = merge(local.tags, { - # NOTE - if creating multiple security groups with this module, only tag the - # security group that Karpenter should utilize with the following tag - # (i.e. 
- at most, only one security group should have this tag in your account) - "karpenter.sh/discovery" = local.name - }) -} - - -################################################################################ -# Karpenter Controller & Node IAM roles, SQS Queue, Eventbridge Rules -################################################################################ - -module "karpenter" { - source = "terraform-aws-modules/eks/aws//modules/karpenter" - version = "~> 20.24" - - cluster_name = module.eks.cluster_name - enable_v1_permissions = true - - # Use Pod Identity - enable_pod_identity = true - create_pod_identity_association = true - - # Used to attach additional IAM policies to the Karpenter node IAM role - node_iam_role_additional_policies = { - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - - tags = local.tags -} - -################################################################################ -# Karpenter Helm chart -################################################################################ - -resource "helm_release" "karpenter" { - name = "karpenter" - namespace = "kube-system" - create_namespace = true - repository = "oci://public.ecr.aws/karpenter" - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - chart = "karpenter" - version = "1.0.6" - wait = true - - values = [ - <<-EOT - settings: - clusterName: ${module.eks.cluster_name} - clusterEndpoint: ${module.eks.cluster_endpoint} - interruptionQueue: ${module.karpenter.queue_name} - serviceAccount: - name: ${module.karpenter.service_account} - EOT - ] - - lifecycle { - ignore_changes = [ - repository_password - ] - } -} diff --git a/ai-ml/trainium-inferentia/elastic-cache-redis.tf b/ai-ml/trainium-inferentia/elastic-cache-redis.tf deleted file mode 100644 index df3c3c6a8..000000000 --- a/ai-ml/trainium-inferentia/elastic-cache-redis.tf +++ /dev/null @@ 
-1,57 +0,0 @@ -#------------------------------------------- -# For Rayhead High availability cluster -#------------------------------------------- -module "elasticache" { - create = var.enable_rayserve_ha_elastic_cache_redis - source = "terraform-aws-modules/elasticache/aws" - version = "1.2.0" - - cluster_id = local.name - create_cluster = true - create_replication_group = false - - engine_version = "7.1" - node_type = "cache.t4g.small" - - apply_immediately = true - - # Security Group - vpc_id = module.vpc.vpc_id - security_group_rules = { - ingress_vpc = { - # Default type is `ingress` - # Default port is based on the default engine port - description = "VPC traffic" - cidr_ipv4 = module.vpc.vpc_cidr_block - } - - ingress_from_eks_worker_node_tcp = { - description = "Ingress rule to allow TCP on port 6379 from EKS Ray Head Node" - protocol = "tcp" - from_port = 6379 - referenced_security_group_id = module.eks.node_security_group_id - to_port = 6379 - type = "ingress" - } - } - - # Subnet Group - subnet_group_name = local.name - subnet_group_description = "${title(local.name)} subnet group" - subnet_ids = module.vpc.private_subnets - - # Parameter Group - create_parameter_group = true - parameter_group_name = local.name - parameter_group_family = "redis7" - parameter_group_description = "${title(local.name)} parameter group" - parameters = [ - { - name = "latency-tracking" - value = "yes" - } - ] - - tags = local.tags - -} diff --git a/ai-ml/trainium-inferentia/fsx-for-lustre.tf b/ai-ml/trainium-inferentia/fsx-for-lustre.tf deleted file mode 100644 index 6c88aa2cb..000000000 --- a/ai-ml/trainium-inferentia/fsx-for-lustre.tf +++ /dev/null @@ -1,118 +0,0 @@ -#--------------------------------------------------------------- -# FSx for Lustre File system Static provisioning -# 1> Create Fsx for Lustre filesystem (Lustre FS storage capacity must be 1200, 2400, or a multiple of 3600) -# 2> Create Storage Class for Filesystem (Cluster scoped) -# 3> Persistent Volume with 
Hardcoded reference to Fsx for Lustre filesystem with filesystem_id and dns_name (Cluster scoped) -# 4> Persistent Volume claim for this persistent volume will always use the same file system (Namespace scoped) -#--------------------------------------------------------------- - -#--------------------------------------------------------------- -# Sec group for FSx for Lustre -#--------------------------------------------------------------- -resource "aws_security_group" "fsx" { - count = var.enable_fsx_for_lustre ? 1 : 0 - - name = "${local.name}-fsx" - description = "Allow inbound traffic from private subnets of the VPC to FSx filesystem" - vpc_id = module.vpc.vpc_id - - ingress { - description = "Allows Lustre traffic between Lustre clients" - cidr_blocks = module.vpc.private_subnets_cidr_blocks - from_port = 1021 - to_port = 1023 - protocol = "tcp" - } - ingress { - description = "Allows Lustre traffic between Lustre clients" - cidr_blocks = module.vpc.private_subnets_cidr_blocks - from_port = 988 - to_port = 988 - protocol = "tcp" - } - tags = local.tags -} - -#--------------------------------------------------------------- -# Storage Class - FSx for Lustre -#--------------------------------------------------------------- -resource "kubectl_manifest" "fsx_storageclass" { - count = var.enable_fsx_for_lustre ? 
1 : 0 - - yaml_body = <[^_]+)\.(?.+)\.(?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?[a-z0-9]{64})-$ - -input: - name: "tail" - enabled: true - tag: "systempods....-" - path: "/var/log/containers/*.log" - db: "/var/log/flb_kube.db" - memBufLimit: 5MB - skipLongLines: "On" - refreshInterval: 10 - extraInputs: | - multiline.parser docker, cri - Tag_Regex (?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)-(?[a-z0-9]{64})\.log$ - - -# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters -filter: - name: "kubernetes" - match: "systempods.*" - kubeURL: "https://kubernetes.default.svc.cluster.local:443" - mergeLog: "On" - mergeLogKey: "log_processed" - keepLog: "On" - k8sLoggingParser: "On" - k8sLoggingExclude: "Off" - bufferSize: "0" - extraFilters: | - Kube_Tag_Prefix systempods. - Regex_Parser kubernetes - Labels On - Annotations Off - Use_Kubelet true - Kubelet_Port 10250 - Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token - -# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance. -# cloudWatch: -# enabled: false - -# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch -cloudWatchLogs: - enabled: true - match: "systempods.*" - region: ${region} - logGroupName: ${cloudwatch_log_group} - autoCreateGroup: false - extraOutputs: | - log_key log - -#----------------------------------------------------------# -# OUTPUT logs to S3 -#----------------------------------------------------------# - -# This is an example for writing logs to S3 bucket. -# This example writes system pod logs and spark logs into dedicated prefix. 
-# This second output is using the rewrite_tag filter commented above - -additionalOutputs: | - [OUTPUT] - Name s3 - Match systempods.* - region ${region} - bucket ${s3_bucket_name} - total_file_size 100M - s3_key_format /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log - s3_key_format_tag_delimiters .. - store_dir /home/ec2-user/buffer - upload_timeout 10m - log_key log - - -# Resource config for large clusters -resources: - limits: - cpu: 1000m - memory: 1500Mi - requests: - cpu: 500m - memory: 500Mi - -## Assign a PriorityClassName to pods if set -priorityClassName: system-node-critical - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. -tolerations: - - operator: Exists diff --git a/ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml b/ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml deleted file mode 100644 index 1b1d9af71..000000000 --- a/ai-ml/trainium-inferentia/helm-values/cluster-autoscaler-values.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Best practice to update the resource requests and limits for each add-on -resources: - limits: - cpu: 1000m - memory: 1G - requests: - cpu: 200m - memory: 512Mi - -# Best practice to updateStrategy for each add-on -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 diff --git a/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml b/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml deleted file mode 100644 index 10ae9bfc2..000000000 --- a/ai-ml/trainium-inferentia/helm-values/ingress-nginx-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -controller: - service: - externalTrafficPolicy: "Local" - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # nlb-ip or external - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http - 
service.beta.kubernetes.io/aws-load-balancer-scheme: internal # Valid values are internal, internet-facing - targetPorts: - http: http - https: http diff --git a/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml b/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml deleted file mode 100644 index 8582145fd..000000000 --- a/ai-ml/trainium-inferentia/helm-values/jupyterhub-values.yaml +++ /dev/null @@ -1,139 +0,0 @@ -hub: - extraConfig: - jupyterhub_config.py: |- - c.KubeSpawner.start_timeout = 1200 - -proxy: - https: - enabled: false - type: offload - service: - type: ClusterIP -singleuser: - startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull - profileList: - - display_name: Trainium (trn1) - description: "Trainium | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch 1.13.1 + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - karpenter.sh/nodepool: trainium-trn1 # Label is added by the karpenter to the nodes. 
`trainium-trn1` is the nodepool name created by this blueprint - hub.jupyter.org/node-purpose: user - tolerations: - - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - # trn1.32xlarge | 16 Neurons (32 cores) | 512 GB Accelerator memory | 128 vCPus and 512 GiB - cpu_guarantee: 100 - mem_guarantee: 450G - cpu_limit: 120 - mem_limit: 500G - extra_resource_limits: - aws.amazon.com/neuron: "16" - cmd: "start-singleuser.sh" - - display_name: Inferentia (inf2) - description: "Inferentia | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch 1.13.1 + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - karpenter.sh/nodepool: inferentia-inf2 # Label is added by the karpenter to the nodes. 
`inferentia-inf2` is the nodepool name created by this blueprint - hub.jupyter.org/node-purpose: user - tolerations: - - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - cpu_guarantee: 90 # 96 vCPU for inf2.24x large - mem_guarantee: 300G # 384Gib for inf2.24x large - cpu_limit: 90 - mem_limit: 300G - extra_resource_limits: - aws.amazon.com/neuron: "6" # 12 NeuronCores , 384 GB Memory, vCPU 192, Mem 768 GB - cmd: null - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - extraVolumeMounts: - - name: jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: null - -# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html -scheduling: - userScheduler: - enabled: true - podPriority: - enabled: true - userPlaceholder: - enabled: false - replicas: 1 - userPods: - nodeAffinity: - matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner - -prePuller: - hook: - enabled: false - continuous: - # NOTE: if used with Karpenter, also add user-placeholders - enabled: false - -global: - 
safeToShowValues: false diff --git a/ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml b/ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml deleted file mode 100644 index 498fb2824..000000000 --- a/ai-ml/trainium-inferentia/helm-values/kube-prometheus.yaml +++ /dev/null @@ -1,23 +0,0 @@ -prometheus: - prometheusSpec: - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: ${storage_class_type} - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true diff --git a/ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml b/ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml deleted file mode 100644 index 026d97a6a..000000000 --- a/ai-ml/trainium-inferentia/helm-values/metrics-server-values.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# HA config for metrics-server -image: - repository: registry.k8s.io/metrics-server/metrics-server - pullPolicy: IfNotPresent - -serviceAccount: - create: true - name: metrics-server - -rbac: - create: true - pspEnabled: false - -apiService: - create: true - -podLabels: - k8s-app: metrics-server - -# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true -replicas: 2 - -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 - -podDisruptionBudget: - enabled: true - minAvailable: 1 - -defaultArgs: - - --cert-dir=/tmp - - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname - - --kubelet-use-node-status-port - - --metric-resolution=15s - -resources: - requests: - cpu: 200m - memory: 512Mi - -affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - k8s-app: metrics-server - namespaces: - - kube-system - topologyKey: kubernetes.io/hostname diff --git 
a/ai-ml/trainium-inferentia/jupyterhub.tf b/ai-ml/trainium-inferentia/jupyterhub.tf deleted file mode 100644 index 8d2754597..000000000 --- a/ai-ml/trainium-inferentia/jupyterhub.tf +++ /dev/null @@ -1,181 +0,0 @@ -#----------------------------------------------------------------------------------------- -# JupyterHub Single User IRSA, maybe that block could be incorporated in add-on registry -#----------------------------------------------------------------------------------------- -resource "kubernetes_namespace_v1" "jupyterhub" { - count = var.enable_jupyterhub ? 1 : 0 - - metadata { - name = "jupyterhub" - } -} - -module "jupyterhub_single_user_irsa" { - count = var.enable_jupyterhub ? 1 : 0 - - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - - role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa" - - role_policy_arns = { - policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based in what you need to give access to your notebook instances. - } - - oidc_providers = { - main = { - provider_arn = module.eks.oidc_provider_arn - namespace_service_accounts = ["${kubernetes_namespace_v1.jupyterhub[0].metadata[0].name}:jupyterhub-single-user"] - } - } -} - -resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" { - count = var.enable_jupyterhub ? 1 : 0 - - metadata { - name = "${module.eks.cluster_name}-jupyterhub-single-user" - namespace = kubernetes_namespace_v1.jupyterhub[0].metadata[0].name - annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa[0].iam_role_arn } - } - - automount_service_account_token = true -} - -resource "kubernetes_secret_v1" "jupyterhub_single_user" { - count = var.enable_jupyterhub ? 
1 : 0 - - metadata { - name = "${module.eks.cluster_name}-jupyterhub-single-user-secret" - namespace = kubernetes_namespace_v1.jupyterhub[0].metadata[0].name - annotations = { - "kubernetes.io/service-account.name" = kubernetes_service_account_v1.jupyterhub_single_user_sa[0].metadata[0].name - "kubernetes.io/service-account.namespace" = kubernetes_namespace_v1.jupyterhub[0].metadata[0].name - } - } - - type = "kubernetes.io/service-account-token" -} - -#--------------------------------------------------------------- -# EFS Filesystem for private volumes per user -# This will be replaced with Dynamic EFS provision using EFS CSI Driver -#--------------------------------------------------------------- -resource "aws_efs_file_system" "efs" { - count = var.enable_jupyterhub ? 1 : 0 - - creation_token = "efs-jupyter-single-user" - encrypted = true - - tags = local.tags -} - -resource "aws_efs_mount_target" "efs_mt" { - count = var.enable_jupyterhub ? 1 : 0 - - file_system_id = aws_efs_file_system.efs[0].id - subnet_id = module.vpc.private_subnets[2] - security_groups = [aws_security_group.efs[0].id] -} - -resource "aws_security_group" "efs" { - count = var.enable_jupyterhub ? 1 : 0 - - name = "${local.name}-efs" - description = "Allow inbound NFS traffic from private subnets of the VPC" - vpc_id = module.vpc.vpc_id - - ingress { - description = "Allow NFS 2049/tcp" - cidr_blocks = module.vpc.vpc_secondary_cidr_blocks - from_port = 2049 - to_port = 2049 - protocol = "tcp" - } - - tags = local.tags -} - -resource "kubectl_manifest" "pv" { - count = var.enable_jupyterhub ? 
1 : 0 - - yaml_body = < Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - -#--------------------------------------------------------------- -# VPC -#--------------------------------------------------------------- -# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. -# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements - -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" - - name = local.name - cidr = var.vpc_cidr - azs = local.azs - - # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods - secondary_cidr_blocks = var.secondary_cidr_blocks - - # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods - # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc. 
- private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) - - # ------------------------------ - # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments - # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = local.public_subnets - enable_nat_gateway = true - single_nat_gateway = true - #------------------------------- - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - # Tags subnets for Karpenter auto-discovery - "karpenter.sh/discovery" = local.name - } - - tags = local.tags -} From 06de4c60ca5f569bfc3f6df88f420c8e22a88a6e Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:00:20 -0800 Subject: [PATCH 03/16] add 12xlarge to g5 --- ai-ml/infrastructure/terraform/addons.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf index 93cc471cb..3b7a936ed 100644 --- a/ai-ml/infrastructure/terraform/addons.tf +++ b/ai-ml/infrastructure/terraform/addons.tf @@ -351,7 +351,7 @@ module "data_addons" { values: ["g5"] - key: "karpenter.k8s.aws/instance-size" operator: In - values: [ "2xlarge", "4xlarge", "8xlarge" ] + values: [ "2xlarge", "4xlarge", "8xlarge", "12xlarge" ] - key: "kubernetes.io/arch" operator: In values: ["amd64"] From 0119abd81dc5a8bcbc102b914a5791ebc2b50cfd Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:20:04 -0800 Subject: [PATCH 04/16] add in emr and amp --- ai-ml/infrastructure/terraform/addons.tf | 12 +- ai-ml/infrastructure/terraform/amp.tf | 137 ++++++++++++++++++++ ai-ml/infrastructure/terraform/eks.tf | 8 +- ai-ml/infrastructure/terraform/emr-eks.tf | 22 ++++ ai-ml/infrastructure/terraform/variables.tf | 13 +- 5 files changed, 185 insertions(+), 7 
deletions(-) create mode 100644 ai-ml/infrastructure/terraform/amp.tf create mode 100644 ai-ml/infrastructure/terraform/emr-eks.tf diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf index 3b7a936ed..9ba662a22 100644 --- a/ai-ml/infrastructure/terraform/addons.tf +++ b/ai-ml/infrastructure/terraform/addons.tf @@ -155,9 +155,13 @@ module "eks_blueprints_addons" { enable_kube_prometheus_stack = var.enable_kube_prometheus_stack kube_prometheus_stack = { values = [ - templatefile("${path.module}/helm-values/kube-prometheus.yaml", { - storage_class_type = kubernetes_storage_class.default_gp3.id - }) + var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", { + region = local.region + amp_sa = local.amp_ingest_service_account + amp_irsa = module.amp_ingest_irsa[0].iam_role_arn + amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write" + amp_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}" + }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {storage_class_type = kubernetes_storage_class.default_gp3.id}) ] chart_version = "48.1.1" set_sensitive = [ @@ -640,7 +644,7 @@ resource "kubectl_manifest" "dcgm" { yaml_body = file("${path.module}/monitoring/dcgm.yaml") } -resource "kubectl_manifest" "dcgm" { +resource "kubectl_manifest" "dcgm_service" { yaml_body = file("${path.module}/monitoring/dcgm-service.yaml") } diff --git a/ai-ml/infrastructure/terraform/amp.tf b/ai-ml/infrastructure/terraform/amp.tf new file mode 100644 index 000000000..96df2a495 --- /dev/null +++ b/ai-ml/infrastructure/terraform/amp.tf @@ -0,0 +1,137 @@ +#IAM Policy for Amazon Prometheus & Grafana +resource "aws_iam_policy" "grafana" { + count = var.enable_amazon_prometheus ? 
1 : 0 + + description = "IAM policy for Grafana Pod" + name_prefix = format("%s-%s-", local.name, "grafana") + path = "/" + policy = data.aws_iam_policy_document.grafana[0].json +} + +data "aws_iam_policy_document" "grafana" { + count = var.enable_amazon_prometheus ? 1 : 0 + + statement { + sid = "AllowReadingMetricsFromCloudWatch" + effect = "Allow" + resources = ["*"] + + actions = [ + "cloudwatch:DescribeAlarmsForMetric", + "cloudwatch:ListMetrics", + "cloudwatch:GetMetricData", + "cloudwatch:GetMetricStatistics" + ] + } + + statement { + sid = "AllowGetInsightsCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"] + + actions = [ + "cloudwatch:GetInsightRuleReport", + ] + } + + statement { + sid = "AllowReadingAlarmHistoryFromCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"] + + actions = [ + "cloudwatch:DescribeAlarmHistory", + "cloudwatch:DescribeAlarms", + ] + } + + statement { + sid = "AllowReadingLogsFromCloudWatch" + effect = "Allow" + resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"] + + actions = [ + "logs:DescribeLogGroups", + "logs:GetLogGroupFields", + "logs:StartQuery", + "logs:StopQuery", + "logs:GetQueryResults", + "logs:GetLogEvents", + ] + } + + statement { + sid = "AllowReadingTagsInstancesRegionsFromEC2" + effect = "Allow" + resources = ["*"] + + actions = [ + "ec2:DescribeTags", + "ec2:DescribeInstances", + "ec2:DescribeRegions", + ] + } + + statement { + sid = "AllowReadingResourcesForTags" + effect = "Allow" + resources = ["*"] + actions = ["tag:GetResources"] + } + + statement { + sid = "AllowListApsWorkspaces" + effect = "Allow" + resources = [ + "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*", + "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*", + 
"arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*", + ] + actions = [ + "aps:ListWorkspaces", + "aps:DescribeWorkspace", + "aps:GetMetricMetadata", + "aps:GetSeries", + "aps:QueryMetrics", + "aps:RemoteWrite", + "aps:GetLabels" + ] + } +} + +#------------------------------------------ +# Amazon Prometheus +#------------------------------------------ +locals { + amp_ingest_service_account = "amp-iamproxy-ingest-service-account" + amp_namespace = "kube-prometheus-stack" +} + +resource "aws_prometheus_workspace" "amp" { + count = var.enable_amazon_prometheus ? 1 : 0 + + alias = format("%s-%s", "amp-ws", local.name) + tags = local.tags +} + +module "amp_ingest_irsa" { + count = var.enable_amazon_prometheus ? 1 : 0 + + source = "aws-ia/eks-blueprints-addon/aws" + version = "~> 1.0" + create_release = false + create_role = true + create_policy = false + role_name = format("%s-%s", local.name, "amp-ingest") + role_policies = { amp_policy = aws_iam_policy.grafana[0].arn } + + oidc_providers = { + this = { + provider_arn = module.eks.oidc_provider_arn + namespace = local.amp_namespace + service_account = local.amp_ingest_service_account + } + } + + tags = local.tags +} diff --git a/ai-ml/infrastructure/terraform/eks.tf b/ai-ml/infrastructure/terraform/eks.tf index 3543232ec..169b19bac 100644 --- a/ai-ml/infrastructure/terraform/eks.tf +++ b/ai-ml/infrastructure/terraform/eks.tf @@ -29,7 +29,13 @@ module "eks" { "system:bootstrappers", "system:nodes", ] - } + }, + { + # Required for EMR on EKS virtual cluster + rolearn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/AWSServiceRoleForAmazonEMRContainers" + username = "emr-containers" + groups = [] + }, ] #--------------------------------------- # Note: This can further restricted to specific required for each Add-on and your application diff --git a/ai-ml/infrastructure/terraform/emr-eks.tf b/ai-ml/infrastructure/terraform/emr-eks.tf new file mode 100644 index 
000000000..ff9280e99 --- /dev/null +++ b/ai-ml/infrastructure/terraform/emr-eks.tf @@ -0,0 +1,22 @@ +module "emr_containers" { + source = "terraform-aws-modules/emr/aws//modules/virtual-cluster" + version = "~> 1.0" + + for_each = var.enable_amazon_emr ? toset(["ml-team-a", "ml-team-b"]) : toset([]) + + eks_cluster_id = module.eks.cluster_name + oidc_provider_arn = module.eks.oidc_provider_arn + + name = "${module.eks.cluster_name}-emr-${each.value}" + namespace = "emr-${each.value}" + + role_name = "${module.eks.cluster_name}-emr-${each.value}" + iam_role_use_name_prefix = false + iam_role_description = "EMR Execution Role for emr-${each.value}" + # NOTE: S3 full access added only for testing purpose. You should modify this policy to restrict access to S3 buckets + iam_role_additional_policies = ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] + + cloudwatch_log_group_name = "/emr-on-eks-logs/${module.eks.cluster_name}/emr-${each.value}/" + + tags = merge(local.tags, { Name = "emr-${each.value}" }) +} diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf index f0606cd0e..60618324a 100644 --- a/ai-ml/infrastructure/terraform/variables.tf +++ b/ai-ml/infrastructure/terraform/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "ai-stack" + default = "ml-stack" type = string } @@ -57,7 +57,16 @@ variable "deploy_fsx_volume" { type = bool default = false } - +variable "enable_amazon_prometheus" { + description = "Enable Amazon Prometheus" + type = bool + default = false +} +variable "enable_amazon_emr" { + description = "Enable Amazon EMR" + type = bool + default = false +} # Addon Variables variable "enable_kube_prometheus_stack" { description = "Enable Kube Prometheus addon" From da356f463e4c48db0429b05b061a201a93f56a8a Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:20:51 -0800 Subject: [PATCH 05/16] add 
missing kube-prometheus-amp-enable.yaml file --- .../kube-prometheus-amp-enable.yaml | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml diff --git a/ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml new file mode 100644 index 000000000..078f33318 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/kube-prometheus-amp-enable.yaml @@ -0,0 +1,65 @@ +prometheus: + serviceAccount: + create: true + name: ${amp_sa} + annotations: + eks.amazonaws.com/role-arn: ${amp_irsa} + prometheusSpec: + remoteWrite: + - url: ${amp_remotewrite_url} + sigv4: + region: ${region} + queueConfig: + maxSamplesPerSend: 1000 + maxShards: 200 + capacity: 2500 + retention: 5h + scrapeInterval: 30s + evaluationInterval: 30s + scrapeTimeout: 10s + storageSpec: + volumeClaimTemplate: + metadata: + name: data + spec: + storageClassName: gp2 + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + # Scrape Cost metrics for Kubecost add-on + # additionalScrapeConfigs: + # - job_name: kubecost + # honor_labels: true + # scrape_interval: 1m + # scrape_timeout: 10s + # metrics_path: /metrics + # scheme: http + # dns_sd_configs: + # - names: + # - kubecost-cost-analyzer.kubecost.svc + # type: 'A' + # port: 9003 +alertmanager: + enabled: false + +grafana: + enabled: true + defaultDashboardsEnabled: true +# Adding AMP datasource to Grafana config + serviceAccount: + create: false + name: ${amp_sa} + grafana.ini: + auth: + sigv4_auth_enabled: true + additionalDataSources: + - name: AMP + editable: true + jsonData: + sigV4Auth: true + sigV4Region: ${region} + type: prometheus + isDefault: false + url: ${amp_url} From a6c978315955eb277fa36a6e20fefb5e9b2e8041 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Tue, 28 Jan 2025 15:28:27 -0800 
Subject: [PATCH 06/16] add missing dcgm components --- ai-ml/emr-spark-rapids/addons.tf | 301 ------------------ ai-ml/emr-spark-rapids/amp.tf | 137 -------- ai-ml/emr-spark-rapids/cleanup.sh | 55 ---- ai-ml/emr-spark-rapids/eks.tf | 218 ------------- ai-ml/emr-spark-rapids/emr-eks.tf | 22 -- .../aws-cloudwatch-metrics-values.yaml | 11 - .../helm-values/aws-for-fluentbit-values.yaml | 102 ------ .../cluster-autoscaler-values.yaml | 25 -- .../coredns-autoscaler-values.yaml | 40 --- .../kube-prometheus-amp-enable.yaml | 65 ---- .../helm-values/kube-prometheus.yaml | 36 --- .../helm-values/kubecost-values.yaml | 62 ---- .../helm-values/metrics-server-values.yaml | 52 --- .../helm-values/nvidia-operator-values.yaml | 96 ------ ai-ml/emr-spark-rapids/main.tf | 61 ---- ai-ml/emr-spark-rapids/outputs.tf | 51 --- ai-ml/emr-spark-rapids/providers.tf | 0 ai-ml/emr-spark-rapids/versions.tf | 33 -- ai-ml/emr-spark-rapids/vpc.tf | 50 --- .../terraform/monitoring/dcgm-service.yaml | 15 + .../terraform/monitoring/dcgm.yaml | 18 +- 21 files changed, 16 insertions(+), 1434 deletions(-) delete mode 100644 ai-ml/emr-spark-rapids/addons.tf delete mode 100644 ai-ml/emr-spark-rapids/amp.tf delete mode 100755 ai-ml/emr-spark-rapids/cleanup.sh delete mode 100644 ai-ml/emr-spark-rapids/eks.tf delete mode 100644 ai-ml/emr-spark-rapids/emr-eks.tf delete mode 100644 ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml delete mode 100755 ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml delete mode 100644 ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml delete mode 100644 ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml delete mode 100644 ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml delete mode 100644 ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml delete mode 100644 ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml delete mode 100644 
ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml delete mode 100644 ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml delete mode 100644 ai-ml/emr-spark-rapids/main.tf delete mode 100644 ai-ml/emr-spark-rapids/outputs.tf delete mode 100644 ai-ml/emr-spark-rapids/providers.tf delete mode 100644 ai-ml/emr-spark-rapids/versions.tf delete mode 100644 ai-ml/emr-spark-rapids/vpc.tf create mode 100644 ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml diff --git a/ai-ml/emr-spark-rapids/addons.tf b/ai-ml/emr-spark-rapids/addons.tf deleted file mode 100644 index 408277b00..000000000 --- a/ai-ml/emr-spark-rapids/addons.tf +++ /dev/null @@ -1,301 +0,0 @@ -#--------------------------------------------------------------- -# IRSA for EBS CSI Driver -#--------------------------------------------------------------- -module "ebs_csi_driver_irsa" { - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.34" - role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver") - attach_ebs_csi_policy = true - oidc_providers = { - main = { - provider_arn = module.eks.oidc_provider_arn - namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - tags = local.tags -} - -#--------------------------------------------------------------- -# EKS Blueprints Addons -#--------------------------------------------------------------- -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.2" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - #--------------------------------------- - # Amazon EKS Managed Add-ons - #--------------------------------------- - eks_addons = { - aws-ebs-csi-driver = { - service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn - } - coredns = { - preserve = true - } - vpc-cni = { - 
preserve = true - } - kube-proxy = { - preserve = true - } - } - - #--------------------------------------- - # Kubernetes Add-ons - #--------------------------------------- - #--------------------------------------------------------------- - # CoreDNS Autoscaler helps to scale for large EKS Clusters - # Further tuning for CoreDNS is to leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/ - #--------------------------------------------------------------- - enable_cluster_proportional_autoscaler = true - cluster_proportional_autoscaler = { - values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", { - target = "deployment/coredns" - })] - description = "Cluster Proportional Autoscaler for CoreDNS Service" - } - - #--------------------------------------- - # Metrics Server - #--------------------------------------- - enable_metrics_server = true - metrics_server = { - values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})] - } - - #--------------------------------------- - # Cluster Autoscaler - #--------------------------------------- - enable_cluster_autoscaler = true - cluster_autoscaler = { - values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", { - aws_region = var.region, - eks_cluster_id = module.eks.cluster_name - })] - } - - #--------------------------------------- - # Karpenter Autoscaler for EKS Cluster - #--------------------------------------- - enable_karpenter = true - karpenter_enable_spot_termination = true - karpenter_node = { - iam_role_additional_policies = { - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - karpenter = { - chart_version = "v0.34.0" - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - #--------------------------------------- - # CloudWatch 
metrics for EKS - #--------------------------------------- - enable_aws_cloudwatch_metrics = true - aws_cloudwatch_metrics = { - values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})] - } - - #--------------------------------------- - # Prommetheus and Grafana stack - #--------------------------------------- - #--------------------------------------------------------------- - # Install Kafka Monitoring Stack with Prometheus and Grafana - # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` - # 2- Grafana Admin user: admin - # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` - #--------------------------------------------------------------- - enable_kube_prometheus_stack = true - kube_prometheus_stack = { - values = [ - var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", { - region = local.region - amp_sa = local.amp_ingest_service_account - amp_irsa = module.amp_ingest_irsa[0].iam_role_arn - amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write" - amp_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}" - }) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {}) - ] - chart_version = "48.1.1" - set_sensitive = [ - { - name = "grafana.adminPassword" - value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string - } - ], - } - - tags = local.tags -} - -#--------------------------------------------------------------- -# Data on EKS Kubernetes Addons -#--------------------------------------------------------------- -module "eks_data_addons" { - source = "aws-ia/eks-data-addons/aws" - version = "1.33.0" # ensure to update this to the latest/desired 
version - - oidc_provider_arn = module.eks.oidc_provider_arn - - enable_karpenter_resources = true - karpenter_resources_helm_config = { - spark-gpu-karpenter = { - values = [ - <<-EOT - name: spark-gpu-karpenter - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - - nodePool: - labels: - - type: karpenter - - NodeGroupType: spark-executor-gpu-karpenter - taints: - - key: nvidia.com/gpu - value: "Exists" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["g5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: [ "2xlarge" ] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 - EOT - ] - } - spark-driver-cpu-karpenter = { - values = [ - <<-EOT - name: spark-driver-cpu-karpenter - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[3]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - - nodePool: - labels: - - type: karpenter - - NodeGroupType: spark-driver-cpu-karpenter - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["m5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: 
["spot", "on-demand"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 - EOT - ] - } - } - - #--------------------------------------------------------------- - # NVIDIA GPU Operator Add-on - #--------------------------------------------------------------- - enable_nvidia_gpu_operator = var.enable_nvidia_gpu_operator - - nvidia_gpu_operator_helm_config = { - version = "v23.9.1" - values = [templatefile("${path.module}/helm-values/nvidia-operator-values.yaml", {})] - } - - #--------------------------------------------------------------- - # NVIDIA Device Plugin Add-on - #--------------------------------------------------------------- - # Enable only when NVIDIA GPU Operator is disabled - enable_nvidia_device_plugin = !(var.enable_nvidia_gpu_operator) - -} - -#--------------------------------------------------------------- -# Grafana Admin credentials resources -#--------------------------------------------------------------- -data "aws_secretsmanager_secret_version" "admin_password_version" { - secret_id = aws_secretsmanager_secret.grafana.id - depends_on = [aws_secretsmanager_secret_version.grafana] -} - -resource "random_password" "grafana" { - length = 16 - special = true - override_special = "@_" -} - -#tfsec:ignore:aws-ssm-secret-use-customer-key -resource "aws_secretsmanager_secret" "grafana" { - name = "${local.name}-grafana" - recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy -} - -resource "aws_secretsmanager_secret_version" "grafana" { - secret_id = aws_secretsmanager_secret.grafana.id - secret_string = random_password.grafana.result -} - -#--------------------------------------------------------------- -# S3 bucket for Spark jobs -#--------------------------------------------------------------- -module "s3_bucket" { - source = "terraform-aws-modules/s3-bucket/aws" - version = "~> 3.0" - - bucket_prefix = "${local.name}-spark-" - - 
# For example only - please evaluate for your environment - force_destroy = true - - server_side_encryption_configuration = { - rule = { - apply_server_side_encryption_by_default = { - sse_algorithm = "AES256" - } - } - } - - tags = local.tags -} diff --git a/ai-ml/emr-spark-rapids/amp.tf b/ai-ml/emr-spark-rapids/amp.tf deleted file mode 100644 index 96df2a495..000000000 --- a/ai-ml/emr-spark-rapids/amp.tf +++ /dev/null @@ -1,137 +0,0 @@ -#IAM Policy for Amazon Prometheus & Grafana -resource "aws_iam_policy" "grafana" { - count = var.enable_amazon_prometheus ? 1 : 0 - - description = "IAM policy for Grafana Pod" - name_prefix = format("%s-%s-", local.name, "grafana") - path = "/" - policy = data.aws_iam_policy_document.grafana[0].json -} - -data "aws_iam_policy_document" "grafana" { - count = var.enable_amazon_prometheus ? 1 : 0 - - statement { - sid = "AllowReadingMetricsFromCloudWatch" - effect = "Allow" - resources = ["*"] - - actions = [ - "cloudwatch:DescribeAlarmsForMetric", - "cloudwatch:ListMetrics", - "cloudwatch:GetMetricData", - "cloudwatch:GetMetricStatistics" - ] - } - - statement { - sid = "AllowGetInsightsCloudWatch" - effect = "Allow" - resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:insight-rule/*"] - - actions = [ - "cloudwatch:GetInsightRuleReport", - ] - } - - statement { - sid = "AllowReadingAlarmHistoryFromCloudWatch" - effect = "Allow" - resources = ["arn:${local.partition}:cloudwatch:${local.region}:${local.account_id}:alarm:*"] - - actions = [ - "cloudwatch:DescribeAlarmHistory", - "cloudwatch:DescribeAlarms", - ] - } - - statement { - sid = "AllowReadingLogsFromCloudWatch" - effect = "Allow" - resources = ["arn:${local.partition}:logs:${local.region}:${local.account_id}:log-group:*:log-stream:*"] - - actions = [ - "logs:DescribeLogGroups", - "logs:GetLogGroupFields", - "logs:StartQuery", - "logs:StopQuery", - "logs:GetQueryResults", - "logs:GetLogEvents", - ] - } - - statement { - sid = 
"AllowReadingTagsInstancesRegionsFromEC2" - effect = "Allow" - resources = ["*"] - - actions = [ - "ec2:DescribeTags", - "ec2:DescribeInstances", - "ec2:DescribeRegions", - ] - } - - statement { - sid = "AllowReadingResourcesForTags" - effect = "Allow" - resources = ["*"] - actions = ["tag:GetResources"] - } - - statement { - sid = "AllowListApsWorkspaces" - effect = "Allow" - resources = [ - "arn:${local.partition}:aps:${local.region}:${local.account_id}:/*", - "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*", - "arn:${local.partition}:aps:${local.region}:${local.account_id}:workspace/*/*", - ] - actions = [ - "aps:ListWorkspaces", - "aps:DescribeWorkspace", - "aps:GetMetricMetadata", - "aps:GetSeries", - "aps:QueryMetrics", - "aps:RemoteWrite", - "aps:GetLabels" - ] - } -} - -#------------------------------------------ -# Amazon Prometheus -#------------------------------------------ -locals { - amp_ingest_service_account = "amp-iamproxy-ingest-service-account" - amp_namespace = "kube-prometheus-stack" -} - -resource "aws_prometheus_workspace" "amp" { - count = var.enable_amazon_prometheus ? 1 : 0 - - alias = format("%s-%s", "amp-ws", local.name) - tags = local.tags -} - -module "amp_ingest_irsa" { - count = var.enable_amazon_prometheus ? 
1 : 0 - - source = "aws-ia/eks-blueprints-addon/aws" - version = "~> 1.0" - create_release = false - create_role = true - create_policy = false - role_name = format("%s-%s", local.name, "amp-ingest") - role_policies = { amp_policy = aws_iam_policy.grafana[0].arn } - - oidc_providers = { - this = { - provider_arn = module.eks.oidc_provider_arn - namespace = local.amp_namespace - service_account = local.amp_ingest_service_account - } - } - - tags = local.tags -} diff --git a/ai-ml/emr-spark-rapids/cleanup.sh b/ai-ml/emr-spark-rapids/cleanup.sh deleted file mode 100755 index 1f084486e..000000000 --- a/ai-ml/emr-spark-rapids/cleanup.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -set -o errexit -set -o pipefail - -read -p "Enter the region: " region -export AWS_DEFAULT_REGION=$region - -targets=( - "module.emr_containers" - "module.eks_data_addons" - "module.eks_blueprints_addons" -) - -#------------------------------------------- -# Helpful to delete the stuck in "Terminating" namespaces -# Rerun the cleanup.sh script to detect and delete the stuck resources -#------------------------------------------- -terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name') - -# If there are no terminating namespaces, exit the script -if [[ -z $terminating_namespaces ]]; then - echo "No terminating namespaces found" -fi - -for ns in $terminating_namespaces; do - echo "Terminating namespace: $ns" - kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - -done - -#------------------------------------------- -# Terraform destroy per module target -#------------------------------------------- -for target in "${targets[@]}" -do - terraform destroy -target="$target" -auto-approve - destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1) - if [[ $? 
-eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of $target completed successfully" - else - echo "FAILED: Terraform destroy of $target failed" - exit 1 - fi -done - -#------------------------------------------- -# Terraform destroy full -#------------------------------------------- -terraform destroy -auto-approve -destroy_output=$(terraform destroy -auto-approve 2>&1) -if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of all targets completed successfully" -else - echo "FAILED: Terraform destroy of all targets failed" - exit 1 -fi diff --git a/ai-ml/emr-spark-rapids/eks.tf b/ai-ml/emr-spark-rapids/eks.tf deleted file mode 100644 index 4d01ae4fe..000000000 --- a/ai-ml/emr-spark-rapids/eks.tf +++ /dev/null @@ -1,218 +0,0 @@ -#--------------------------------------------------------------- -# EKS Cluster -#--------------------------------------------------------------- - -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.21" - - cluster_name = local.name - cluster_version = var.eks_cluster_version - - #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. - cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. - - vpc_id = module.vpc.vpc_id - # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) - - manage_aws_auth_configmap = true - aws_auth_roles = [ - { - rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - }, - { - # Required for EMR on EKS virtual cluster - rolearn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/AWSServiceRoleForAmazonEMRContainers" - username = "emr-containers" - groups = [] - }, - ] - - #--------------------------------------- - # Note: This can further restricted to specific required for each Add-on and your application - #--------------------------------------- - # Extend cluster security group rules - cluster_security_group_additional_rules = { - ingress_nodes_ephemeral_ports_tcp = { - description = "Nodes on ephemeral ports" - protocol = "tcp" - from_port = 1025 - to_port = 65535 - type = "ingress" - source_node_security_group = true - } - } - - # Extend node-to-node security group rules - node_security_group_additional_rules = { - ingress_self_all = { - description = "Node to node all ports/protocols" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - self = true - } - # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. - # This can be restricted further to specific port based on the requirement for each Add-on e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc. 
- # Change this according to your security requirements if needed - ingress_cluster_to_node_all_traffic = { - description = "Cluster API to Nodegroup all traffic" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - source_cluster_security_group = true - } - } - - eks_managed_node_group_defaults = { - iam_role_additional_policies = { - # Not required, but used in the example to access the nodes to inspect mounted volumes - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - - eks_managed_node_groups = { - # We recommend to have a MNG to place your critical workloads and add-ons - # Then rely on Karpenter to scale your workloads - # You can also make uses on nodeSelector and Taints/tolerations to spread workloads on MNG or Karpenter provisioners - core_node_group = { - name = "core-node-group" - description = "EKS managed node group example launch template" - # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) - - min_size = 3 - max_size = 9 - desired_size = 3 - - ami_type = "AL2_x86_64" - instance_types = ["m5.xlarge"] - - ebs_optimized = true - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "core" - "nvidia.com/gpu.deploy.operands" = false - } - - tags = { - Name = "core-node-grp", - "karpenter.sh/discovery" = local.name - } - } - - spark_driver_ng = { - name = "spark-driver-ng" - description = "Spark managed node group for Driver pods" - # Filtering only Secondary CIDR private subnets starting with "100.". 
Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)] - - ami_type = "AL2_x86_64" - - min_size = 1 - max_size = 8 - desired_size = 1 - - force_update_version = true - instance_types = ["m5.xlarge"] # 4 vCPU and 16GB - - ebs_optimized = true - - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "spark-driver-cpu-ca" - "nvidia.com/gpu.deploy.operands" = false - } - - tags = { - Name = "spark-driver-ca" - } - } - spark_gpu_ng = { - name = "spark-gpu-ng" - description = "Spark managed GPU node group for executor pods with launch template" - # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]), 0)] - - ami_type = "AL2_x86_64_GPU" - - # NVMe instance store volumes are automatically enumerated and assigned a device - pre_bootstrap_user_data = <<-EOT - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - # Configure NVMe volumes in RAID0 configuration - # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 - # Mount will be: /mnt/k8s-disks - export LOCAL_DISKS='raid0' - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - EOT - - # Change min_size, max_size and desired_size to 8 before running xgboost example - min_size = 0 - max_size = 8 - desired_size = 0 - - capacity_type = "ON_DEMAND" - instance_types = ["g5.2xlarge"] - - ebs_optimized = true - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "spark-executor-gpu-ca" - } - - taints = [{ - key = "nvidia.com/gpu", - value = "EXISTS", - effect = "NO_SCHEDULE" - }] - - tags = { - Name = "spark-gpu", - } - } - } - - tags = local.tags -} diff --git a/ai-ml/emr-spark-rapids/emr-eks.tf b/ai-ml/emr-spark-rapids/emr-eks.tf deleted file mode 100644 index aa1ec0282..000000000 --- a/ai-ml/emr-spark-rapids/emr-eks.tf +++ /dev/null @@ -1,22 +0,0 @@ -module "emr_containers" { - source = "terraform-aws-modules/emr/aws//modules/virtual-cluster" - version = "~> 1.0" - - for_each = toset(["ml-team-a", "ml-team-b"]) - - eks_cluster_id = module.eks.cluster_name - oidc_provider_arn = module.eks.oidc_provider_arn - - name = "${module.eks.cluster_name}-emr-${each.value}" - namespace = "emr-${each.value}" - - role_name = "${module.eks.cluster_name}-emr-${each.value}" - iam_role_use_name_prefix = false - iam_role_description = "EMR Execution Role for emr-${each.value}" - # NOTE: S3 full access added only 
for testing purpose. You should modify this policy to restrict access to S3 buckets - iam_role_additional_policies = ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] - - cloudwatch_log_group_name = "/emr-on-eks-logs/${module.eks.cluster_name}/emr-${each.value}/" - - tags = merge(local.tags, { Name = "emr-${each.value}" }) -} diff --git a/ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml b/ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml deleted file mode 100644 index ae3c41d44..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/aws-cloudwatch-metrics-values.yaml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - limits: - cpu: 500m - memory: 2Gi - requests: - cpu: 200m - memory: 1Gi - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. -tolerations: - - operator: Exists diff --git a/ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml b/ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml deleted file mode 100755 index 0bea5188d..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/aws-for-fluentbit-values.yaml +++ /dev/null @@ -1,102 +0,0 @@ -global: - -#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server -# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata -hostNetwork: true -dnsPolicy: ClusterFirstWithHostNet - -service: - parsersFiles: - - /fluent-bit/parsers/parsers.conf - extraParsers: | - [PARSER] - Name kubernetes - Format regex - Regex ^(?[^_]+)\.(?.+)\.(?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?[a-z0-9]{64})-$ - -input: - name: "tail" - enabled: true - tag: "systempods....-" - path: "/var/log/containers/*.log" - db: "/var/log/flb_kube.db" - memBufLimit: 5MB - skipLongLines: "On" - refreshInterval: 10 - extraInputs: | - multiline.parser docker, cri - Tag_Regex 
(?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)-(?[a-z0-9]{64})\.log$ - - -# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters -filter: - name: "kubernetes" - match: "systempods.*" - kubeURL: "https://kubernetes.default.svc.cluster.local:443" - mergeLog: "On" - mergeLogKey: "log_processed" - keepLog: "On" - k8sLoggingParser: "On" - k8sLoggingExclude: "Off" - bufferSize: "0" - extraFilters: | - Kube_Tag_Prefix systempods. - Regex_Parser kubernetes - Labels On - Annotations Off - Use_Kubelet true - Kubelet_Port 10250 - Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token - -# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance. -# cloudWatch: -# enabled: false - -# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch -cloudWatchLogs: - enabled: true - match: "systempods.*" - region: ${region} - logGroupName: ${cloudwatch_log_group} - autoCreateGroup: false - extraOutputs: | - log_key log - -#----------------------------------------------------------# -# OUTPUT logs to S3 -#----------------------------------------------------------# - -# This is an example for writing logs to S3 bucket. -# This example writes system pod logs and spark logs into dedicated prefix. -# This second output is using the rewrite_tag filter commented above - -additionalOutputs: | - [OUTPUT] - Name s3 - Match systempods.* - region ${region} - bucket ${s3_bucket_name} - total_file_size 100M - s3_key_format /${cluster_name}/system-pod-logs/$TAG[1]/$TAG[2]/$TAG[3]/$TAG[3]_%H%M%S_$UUID.log - s3_key_format_tag_delimiters .. 
- store_dir /home/ec2-user/buffer - upload_timeout 10m - log_key log - - -# Resource config for large clusters -resources: - limits: - cpu: 1000m - memory: 1500Mi - requests: - cpu: 500m - memory: 500Mi - -## Assign a PriorityClassName to pods if set -priorityClassName: system-node-critical - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. -tolerations: - - operator: Exists diff --git a/ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml b/ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml deleted file mode 100644 index 5a42794f2..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/cluster-autoscaler-values.yaml +++ /dev/null @@ -1,25 +0,0 @@ -autoDiscovery: - clusterName: ${eks_cluster_id} - -awsRegion: ${aws_region} - -cloudProvider: aws - -extraArgs: - aws-use-static-instance-list: true - -# Best practice to update the resource requests and limits for each add-on -resources: - limits: - cpu: 1000m - memory: 1G - requests: - cpu: 200m - memory: 512Mi - -# Best practice to updateStrategy for each add-on -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 diff --git a/ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml b/ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml deleted file mode 100644 index 64cb540bf..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/coredns-autoscaler-values.yaml +++ /dev/null @@ -1,40 +0,0 @@ -nameOverride: kube-dns-autoscaler - -# Formula for controlling the replicas. Adjust according to your needs -# replicas = max( ceil( cores * 1/coresPerReplica ) , ceil( nodes * 1/nodesPerReplica ) ) -# replicas = min(replicas, max) -# replicas = max(replicas, min) -config: - linear: - coresPerReplica: 256 - nodesPerReplica: 16 - min: 1 - max: 100 - preventSinglePointFailure: true - includeUnschedulableNodes: true - -# Target to scale. 
In format: deployment/*, replicationcontroller/* or replicaset/* (not case sensitive). -options: - target: ${target} - -serviceAccount: - create: true - name: kube-dns-autoscaler - -podSecurityContext: - seccompProfile: - type: RuntimeDefault - supplementalGroups: [ 65534 ] - fsGroup: 65534 - -resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - -tolerations: - - key: "CriticalAddonsOnly" - operator: "Exists" diff --git a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml b/ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml deleted file mode 100644 index 078f33318..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus-amp-enable.yaml +++ /dev/null @@ -1,65 +0,0 @@ -prometheus: - serviceAccount: - create: true - name: ${amp_sa} - annotations: - eks.amazonaws.com/role-arn: ${amp_irsa} - prometheusSpec: - remoteWrite: - - url: ${amp_remotewrite_url} - sigv4: - region: ${region} - queueConfig: - maxSamplesPerSend: 1000 - maxShards: 200 - capacity: 2500 - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: gp2 - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - # Scrape Cost metrics for Kubecost add-on - # additionalScrapeConfigs: - # - job_name: kubecost - # honor_labels: true - # scrape_interval: 1m - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # dns_sd_configs: - # - names: - # - kubecost-cost-analyzer.kubecost.svc - # type: 'A' - # port: 9003 -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true -# Adding AMP datasource to Grafana config - serviceAccount: - create: false - name: ${amp_sa} - grafana.ini: - auth: - sigv4_auth_enabled: true - additionalDataSources: - - name: AMP - editable: true - jsonData: - sigV4Auth: true - sigV4Region: ${region} - type: prometheus - isDefault: 
false - url: ${amp_url} diff --git a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml b/ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml deleted file mode 100644 index 54c1f690f..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/kube-prometheus.yaml +++ /dev/null @@ -1,36 +0,0 @@ -prometheus: - prometheusSpec: - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: gp2 - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - # Scrape Cost metrics for Kubecost add-on - # additionalScrapeConfigs: - # - job_name: kubecost - # honor_labels: true - # scrape_interval: 1m - # scrape_timeout: 10s - # metrics_path: /metrics - # scheme: http - # dns_sd_configs: - # - names: - # - kubecost-cost-analyzer.kubecost.svc - # type: 'A' - # port: 9003 -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true diff --git a/ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml b/ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml deleted file mode 100644 index f781ec5ce..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/kubecost-values.yaml +++ /dev/null @@ -1,62 +0,0 @@ - -# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090 - -global: - # pricingCsv: - # enabled: false - # location: - # provider: "AWS" - # region: "us-east-1" - # URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI - # csvAccessCredentials: pricing-schema-access-secret - - # This Prometheus setup is reusing the existing Prometheus deployment - # Check for more docs under https://guide.kubecost.com/hc/en-us/articles/4407595941015 - prometheus: - fqdn: http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc:9090 - enabled: false - -# If you have node-exporter and/or KSM running on your cluster, follow this step to disable the Kubecost included versions. 
-prometheus: - nodeExporter: - enabled: false - serviceAccounts: - nodeExporter: - create: false - kubeStateMetrics: - enabled: false - -#imageVersion: prod-1.96.0 # commented to use the latest - -kubecostFrontend: - image: public.ecr.aws/kubecost/frontend - resources: - requests: - cpu: "200m" - memory: "512Mi" - -kubecostMetrics: - emitPodAnnotations: true - emitNamespaceAnnotations: true - -kubecostModel: - image: public.ecr.aws/kubecost/cost-model - resources: - requests: - cpu: "500m" - memory: "512Mi" - -# Set this to false if you're bringing your own service account. -#serviceAccount: -# create: false -# name: kubecost-cost-analyzer -# annotations: -# eks.amazonaws.com/role-arn: - -# Define persistence volume for cost-analyzer -persistentVolume: - size: 32Gi - dbSize: 32.0Gi - enabled: true # Note that setting this to false means configurations will be wiped out on pod restart. - storageClass: gp2 - # existingClaim: kubecost-cost-analyzer # a claim in the same namespace as kubecost diff --git a/ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml b/ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml deleted file mode 100644 index 026d97a6a..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/metrics-server-values.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# HA config for metrics-server -image: - repository: registry.k8s.io/metrics-server/metrics-server - pullPolicy: IfNotPresent - -serviceAccount: - create: true - name: metrics-server - -rbac: - create: true - pspEnabled: false - -apiService: - create: true - -podLabels: - k8s-app: metrics-server - -# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true -replicas: 2 - -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 - -podDisruptionBudget: - enabled: true - minAvailable: 1 - -defaultArgs: - - --cert-dir=/tmp - - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname - - --kubelet-use-node-status-port - - 
--metric-resolution=15s - -resources: - requests: - cpu: 200m - memory: 512Mi - -affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - k8s-app: metrics-server - namespaces: - - kube-system - topologyKey: kubernetes.io/hostname diff --git a/ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml b/ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml deleted file mode 100644 index 532ee11ff..000000000 --- a/ai-ml/emr-spark-rapids/helm-values/nvidia-operator-values.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# Default values for gpu-operator. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -daemonsets: - labels: {} - annotations: {} - priorityClassName: system-node-critical - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes - -operator: - repository: nvcr.io/nvidia - priorityClassName: system-node-critical - defaultRuntime: containerd - image: gpu-operator - cleanupCRD: false # This option doesn't do anything even if you change this to true. NVIDIA recommends to use the manual approach of upgrading the CRDs - # upgrade CRD on chart upgrade, requires --disable-openapi-validation flag - # to be passed during helm upgrade. - upgradeCRD: false - resources: - limits: - cpu: 500m - memory: 350Mi - requests: - cpu: 200m - memory: 100Mi - -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/install-precompiled-signed-drivers.html -# Currently NVIDIA Operator takes more than 5 mins to make the node GPU ready with all the required drivers. 
-# With pre-compiled NVIDIA Drivers this process can be faster hence we are using the config values as driver.version: "515-signed" - -driver: - enabled: false # Disabled as we are using latest EKS AMI 1.29 which comes with NVIDIA drivers pre-installed - # repository: nvcr.io/nvidia - # image: driver - # # Commented this as latest Ubuntu AMIs are failing with this option enabled - # # version: "515-signed" # supported DRIVER_BRANCH value currently are 470, 510 and 515 which will install latest drivers available on that branch for current running kernel version. - # manager: - # image: k8s-driver-manager - # repository: nvcr.io/nvidia/cloud-native - -# to ensure containers can properly access GPUs -toolkit: - enabled: true - -# to discover and advertise GPU resources to kubelet -devicePlugin: - enabled: true - -dcgm: - enabled: false - -# to monitor the GPU(s) on the node -dcgmExporter: - enabled: true - -gfd: - enabled: true - -migManager: - enabled: false - -nodeStatusExporter: - enabled: false - -gds: - enabled: false - -vgpuManager: - enabled: false - -vgpuDeviceManager: - enabled: false - -vfioManager: - enabled: false - -sandboxDevicePlugin: - enabled: false - -node-feature-discovery: - enableNodeFeatureApi: true - worker: - tolerations: - - key: "node-role.kubernetes.io/master" - operator: "Equal" - value: "" - effect: "NoSchedule" - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - operator: "Exists" # Added this to ensure it can tolerate any custom Taints added to the GPU nodes diff --git a/ai-ml/emr-spark-rapids/main.tf b/ai-ml/emr-spark-rapids/main.tf deleted file mode 100644 index 809cc6343..000000000 --- a/ai-ml/emr-spark-rapids/main.tf +++ /dev/null @@ -1,61 +0,0 @@ -provider "aws" { - region = local.region -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "kubernetes" { - 
host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} - -provider "kubectl" { - apply_retry_count = 30 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - load_config_file = false - token = data.aws_eks_cluster_auth.this.token -} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -data "aws_availability_zones" "available" {} - -data "aws_caller_identity" "current" {} -data "aws_partition" "current" {} - -locals { - name = var.name - region = var.region - - # Only two AZs for this example - azs = slice(data.aws_availability_zones.available.names, 0, 2) - - account_id = data.aws_caller_identity.current.account_id - partition = data.aws_partition.current.partition - - tags = merge(var.tags, { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - }) -} diff --git a/ai-ml/emr-spark-rapids/outputs.tf b/ai-ml/emr-spark-rapids/outputs.tf deleted file mode 100644 index 8645d7977..000000000 --- a/ai-ml/emr-spark-rapids/outputs.tf +++ /dev/null @@ -1,51 +0,0 @@ -################################################################################ -# Cluster -################################################################################ - -output "cluster_arn" { - description = "The Amazon Resource Name (ARN) of the cluster" - value = module.eks.cluster_arn -} - -output "cluster_name" { - description = "The Amazon Resource Name (ARN) of the cluster" - value = module.eks.cluster_id -} - -output "oidc_provider_arn" { - description = "The ARN of the OIDC 
Provider if `enable_irsa = true`" - value = module.eks.oidc_provider_arn -} - -################################################################################ -# EKS Managed Node Group -################################################################################ - -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" -} - -output "emr_on_eks" { - description = "EMR on EKS" - value = module.emr_containers -} - -################################################################################ -# AMP -################################################################################ - -output "amp_workspace_id" { - description = "The id of amp" - value = aws_prometheus_workspace.amp[0].id -} - -output "grafana_secret_name" { - description = "Grafana password secret name" - value = aws_secretsmanager_secret.grafana.name -} - -output "s3_bucket_id" { - description = "S3 bucket for Spark input and output data" - value = module.s3_bucket.s3_bucket_id -} diff --git a/ai-ml/emr-spark-rapids/providers.tf b/ai-ml/emr-spark-rapids/providers.tf deleted file mode 100644 index e69de29bb..000000000 diff --git a/ai-ml/emr-spark-rapids/versions.tf b/ai-ml/emr-spark-rapids/versions.tf deleted file mode 100644 index 182b26581..000000000 --- a/ai-ml/emr-spark-rapids/versions.tf +++ /dev/null @@ -1,33 +0,0 @@ -terraform { - required_version = ">= 1.0" - - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 4.47" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.10" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.4" - } - kubectl = { - source = "gavinbunney/kubectl" - version = ">= 1.14" - } - random = { - source = "hashicorp/random" - version = ">= 3.3" - } - } - - # ## Used for end-to-end testing on project; 
update to suit your needs - # backend "s3" { - # bucket = "doeks-github-actions-e2e-test-state" - # region = "us-west-2" - # key = "e2e/emr-spark-rapids/terraform.tfstate" - # } -} diff --git a/ai-ml/emr-spark-rapids/vpc.tf b/ai-ml/emr-spark-rapids/vpc.tf deleted file mode 100644 index e7e6473ee..000000000 --- a/ai-ml/emr-spark-rapids/vpc.tf +++ /dev/null @@ -1,50 +0,0 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - -#--------------------------------------------------------------- -# VPC -#--------------------------------------------------------------- -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" - - name = local.name - cidr = var.vpc_cidr - azs = local.azs - - # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods - secondary_cidr_blocks = var.secondary_cidr_blocks - - # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods - # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + 
NLB + Airflow + EC2 Jumphost etc. - private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) - - # ------------------------------ - # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments - # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = local.public_subnets - enable_nat_gateway = true - single_nat_gateway = true - #------------------------------- - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - # Tags subnets for Karpenter auto-discovery - "karpenter.sh/discovery" = local.name - } - - tags = local.tags -} diff --git a/ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml b/ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml new file mode 100644 index 000000000..9217d4437 --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/dcgm-service.yaml @@ -0,0 +1,15 @@ +kind: Service +apiVersion: v1 +metadata: + name: "dcgm-exporter" + namespace: nvidia-device-plugin + labels: + app.kubernetes.io/name: "dcgm-exporter" + app.kubernetes.io/version: "3.6.1" +spec: + selector: + app.kubernetes.io/name: "dcgm-exporter" + app.kubernetes.io/version: "3.6.1" + ports: + - name: "metrics" + port: 9400 diff --git a/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml index c3ffe67d3..0fd459865 100644 --- a/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml +++ b/ai-ml/infrastructure/terraform/monitoring/dcgm.yaml @@ -16,7 +16,7 @@ apiVersion: apps/v1 kind: DaemonSet metadata: name: "dcgm-exporter" - namespace: kube-system + namespace: nvidia-device-plugin labels: app.kubernetes.io/name: "dcgm-exporter" app.kubernetes.io/version: "3.6.1" @@ -64,19 +64,3 @@ spec: - key: nvidia.com/gpu operator: Exists effect: NoSchedule ---- -kind: Service -apiVersion: v1 -metadata: - name: "dcgm-exporter" - namespace: kube-system - labels: 
- app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" -spec: - selector: - app.kubernetes.io/name: "dcgm-exporter" - app.kubernetes.io/version: "3.6.1" - ports: - - name: "metrics" - port: 9400 From b79fd2c10bd517bb0375375cc3ac85e80a75fab6 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Tue, 28 Jan 2025 20:56:19 -0800 Subject: [PATCH 07/16] initial move from genai to ai --- .../excalidraw/llama2-raytrain.excalidraw | 0 website/docs/{gen-ai => ai}/excalidraw/llama3.svg | 0 .../excalidraw/nvidia-triton-vllm.excalidraw | 0 .../excalidraw/stable-diffusion-inf2.excalidraw | 0 website/docs/{gen-ai => ai}/index.md | 6 +++--- .../inference/GPUs/nvidia-nim-llama3.md | 0 .../inference/GPUs/stablediffusion-gpus.md | 0 .../inference/GPUs/vLLM-NVIDIATritonServer.md | 0 .../{gen-ai => ai}/inference/GPUs/vLLM-rayserve.md | 0 .../inference/Neuron/Mistral-7b-inf2.md | 0 .../{gen-ai => ai}/inference/Neuron/llama2-inf2.md | 0 .../{gen-ai => ai}/inference/Neuron/llama3-inf2.md | 0 .../{gen-ai => ai}/inference/Neuron/rayserve-ha.md | 0 .../inference/Neuron/stablediffusion-inf2.md | 0 .../inference/Neuron/vllm-ray-inf2.md | 0 .../docs/{gen-ai => ai}/inference/_category_.json | 0 .../{gen-ai => ai}/inference/img/answer-1-contd.png | Bin .../docs/{gen-ai => ai}/inference/img/answer-1.png | Bin .../inference/img/excali-draw-sdxl-inf2.png | Bin .../{gen-ai => ai}/inference/img/gradio-app-gpu.png | Bin .../inference/img/gradio-llama-ai-chat.png | Bin .../inference/img/gradio-llama2-13b-chat.png | Bin .../{gen-ai => ai}/inference/img/gradio-test-ft.png | Bin .../inference/img/head-pod-deleted.png | Bin .../inference/img/llama-2-chat-ouput.png | Bin .../inference/img/llama2-13b-response.png | Bin .../{gen-ai => ai}/inference/img/llama2-inf2.png | Bin .../docs/{gen-ai => ai}/inference/img/llama3.png | Bin .../{gen-ai => ai}/inference/img/llma27b-hg.png | Bin .../{gen-ai => ai}/inference/img/mistral-conv-1.png | Bin .../{gen-ai => 
ai}/inference/img/mistral-conv-2.png | Bin .../{gen-ai => ai}/inference/img/mistral-gradio.png | Bin .../inference/img/mistral-sample-prompt-1.png | Bin .../{gen-ai => ai}/inference/img/mistral7b-hg.png | Bin .../inference/img/neuron-monitor-cwci.png | Bin .../inference/img/nim-dashboard-2.png | Bin .../{gen-ai => ai}/inference/img/nim-dashboard.png | Bin .../inference/img/nim-ngc-api-key.png | Bin .../inference/img/nim-on-eks-arch.png | Bin .../inference/img/openweb-ui-nim-1.png | Bin .../inference/img/openweb-ui-nim-2.png | Bin .../inference/img/openweb-ui-ray-vllm-inf2-1.png | Bin .../inference/img/openweb-ui-ray-vllm-inf2-2.png | Bin .../img/ray-dashboard-deployed-mistral-inf2.png | Bin .../img/ray-dashboard-deploying-mistral-inf2.png | Bin .../inference/img/ray-dashboard-sdxl.png | Bin .../img/ray-dashboard-vllm-llama3-inf2.png | Bin .../inference/img/ray-dashboard-vllm-mistral.png | Bin .../{gen-ai => ai}/inference/img/ray-dashboard.png | Bin .../inference/img/ray-deplo-logs-vllm-mistral.png | Bin .../inference/img/ray-grafana-dashboard.png | Bin .../{gen-ai => ai}/inference/img/ray-head-ha-1.png | Bin .../{gen-ai => ai}/inference/img/ray-head-ha-2.png | Bin .../inference/img/ray-logs-vllm-llama3-inf2.png | Bin .../{gen-ai => ai}/inference/img/ray-prometheus.png | Bin .../inference/img/ray-serve-gpu-sd-cluster.png | Bin .../inference/img/ray-serve-gpu-sd.png | Bin .../img/ray-serve-inf2-mistral-cluster.png | Bin .../{gen-ai => ai}/inference/img/ray-vllm-inf2.png | Bin .../inference/img/rayserve-llama2-13b-dashboard.png | Bin .../inference/img/stable-diffusion-xl-gradio.png | Bin .../inference/img/stable-diffusion-xl-prompt_3.png | Bin .../inference/img/triton-architecture.png | Bin .../inference/img/triton-grafana-dash2.png | Bin .../inference/img/triton-internals.png | Bin .../inference/img/triton-observability.png | Bin .../inference/img/worker-pod-running.png | Bin website/docs/{gen-ai => ai}/ml-platforms-eks.png | Bin .../docs/{gen-ai => 
ai}/training/GPUs/bionemo.md | 0 .../{gen-ai => ai}/training/Neuron/BERT-Large.md | 0 .../docs/{gen-ai => ai}/training/Neuron/Llama2.md | 0 .../training/Neuron/RayTrain-Llama2.md | 0 .../docs/{gen-ai => ai}/training/_category_.json | 0 .../training/img/Llama2-RayTrain-Trn1.png | Bin .../{gen-ai => ai}/training/img/llama2-trainium.png | Bin .../training/img/raytrain-precomplilation1.png | Bin .../training/img/raytrain-precomplilation2.png | Bin .../training/img/raytrain-precomplilation3.png | Bin .../training/img/raytrain-testdata-lens.png | Bin .../training/img/raytrain-testdata-raydash.png | Bin .../training/img/raytrain-testdata-raydash1.png | Bin .../training/img/raytrain-testdata-raydash2.png | Bin .../training/img/raytrain-testdata-raydash3.png | Bin .../training/img/raytrain-training-progress1.png | Bin .../training/img/raytrain-training-progress2.png | Bin .../training/img/raytrain-training-progress3.png | Bin website/docs/blueprints/ai-ml/index.md | 11 ++--------- website/docusaurus.config.js | 2 +- website/sidebars.js | 2 +- website/src/pages/index.js | 4 ++-- 90 files changed, 9 insertions(+), 16 deletions(-) rename website/docs/{gen-ai => ai}/excalidraw/llama2-raytrain.excalidraw (100%) rename website/docs/{gen-ai => ai}/excalidraw/llama3.svg (100%) rename website/docs/{gen-ai => ai}/excalidraw/nvidia-triton-vllm.excalidraw (100%) rename website/docs/{gen-ai => ai}/excalidraw/stable-diffusion-inf2.excalidraw (100%) rename website/docs/{gen-ai => ai}/index.md (81%) rename website/docs/{gen-ai => ai}/inference/GPUs/nvidia-nim-llama3.md (100%) rename website/docs/{gen-ai => ai}/inference/GPUs/stablediffusion-gpus.md (100%) rename website/docs/{gen-ai => ai}/inference/GPUs/vLLM-NVIDIATritonServer.md (100%) rename website/docs/{gen-ai => ai}/inference/GPUs/vLLM-rayserve.md (100%) rename website/docs/{gen-ai => ai}/inference/Neuron/Mistral-7b-inf2.md (100%) rename website/docs/{gen-ai => ai}/inference/Neuron/llama2-inf2.md (100%) rename website/docs/{gen-ai => 
ai}/inference/Neuron/llama3-inf2.md (100%) rename website/docs/{gen-ai => ai}/inference/Neuron/rayserve-ha.md (100%) rename website/docs/{gen-ai => ai}/inference/Neuron/stablediffusion-inf2.md (100%) rename website/docs/{gen-ai => ai}/inference/Neuron/vllm-ray-inf2.md (100%) rename website/docs/{gen-ai => ai}/inference/_category_.json (100%) rename website/docs/{gen-ai => ai}/inference/img/answer-1-contd.png (100%) rename website/docs/{gen-ai => ai}/inference/img/answer-1.png (100%) rename website/docs/{gen-ai => ai}/inference/img/excali-draw-sdxl-inf2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/gradio-app-gpu.png (100%) rename website/docs/{gen-ai => ai}/inference/img/gradio-llama-ai-chat.png (100%) rename website/docs/{gen-ai => ai}/inference/img/gradio-llama2-13b-chat.png (100%) rename website/docs/{gen-ai => ai}/inference/img/gradio-test-ft.png (100%) rename website/docs/{gen-ai => ai}/inference/img/head-pod-deleted.png (100%) rename website/docs/{gen-ai => ai}/inference/img/llama-2-chat-ouput.png (100%) rename website/docs/{gen-ai => ai}/inference/img/llama2-13b-response.png (100%) rename website/docs/{gen-ai => ai}/inference/img/llama2-inf2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/llama3.png (100%) rename website/docs/{gen-ai => ai}/inference/img/llma27b-hg.png (100%) rename website/docs/{gen-ai => ai}/inference/img/mistral-conv-1.png (100%) rename website/docs/{gen-ai => ai}/inference/img/mistral-conv-2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/mistral-gradio.png (100%) rename website/docs/{gen-ai => ai}/inference/img/mistral-sample-prompt-1.png (100%) rename website/docs/{gen-ai => ai}/inference/img/mistral7b-hg.png (100%) rename website/docs/{gen-ai => ai}/inference/img/neuron-monitor-cwci.png (100%) rename website/docs/{gen-ai => ai}/inference/img/nim-dashboard-2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/nim-dashboard.png (100%) rename website/docs/{gen-ai => 
ai}/inference/img/nim-ngc-api-key.png (100%) rename website/docs/{gen-ai => ai}/inference/img/nim-on-eks-arch.png (100%) rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-nim-1.png (100%) rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-nim-2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-ray-vllm-inf2-1.png (100%) rename website/docs/{gen-ai => ai}/inference/img/openweb-ui-ray-vllm-inf2-2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-deployed-mistral-inf2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-deploying-mistral-inf2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-sdxl.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-vllm-llama3-inf2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard-vllm-mistral.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-dashboard.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-deplo-logs-vllm-mistral.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-grafana-dashboard.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-head-ha-1.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-head-ha-2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-logs-vllm-llama3-inf2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-prometheus.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-serve-gpu-sd-cluster.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-serve-gpu-sd.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-serve-inf2-mistral-cluster.png (100%) rename website/docs/{gen-ai => ai}/inference/img/ray-vllm-inf2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/rayserve-llama2-13b-dashboard.png (100%) rename website/docs/{gen-ai => ai}/inference/img/stable-diffusion-xl-gradio.png (100%) rename website/docs/{gen-ai => 
ai}/inference/img/stable-diffusion-xl-prompt_3.png (100%) rename website/docs/{gen-ai => ai}/inference/img/triton-architecture.png (100%) rename website/docs/{gen-ai => ai}/inference/img/triton-grafana-dash2.png (100%) rename website/docs/{gen-ai => ai}/inference/img/triton-internals.png (100%) rename website/docs/{gen-ai => ai}/inference/img/triton-observability.png (100%) rename website/docs/{gen-ai => ai}/inference/img/worker-pod-running.png (100%) rename website/docs/{gen-ai => ai}/ml-platforms-eks.png (100%) rename website/docs/{gen-ai => ai}/training/GPUs/bionemo.md (100%) rename website/docs/{gen-ai => ai}/training/Neuron/BERT-Large.md (100%) rename website/docs/{gen-ai => ai}/training/Neuron/Llama2.md (100%) rename website/docs/{gen-ai => ai}/training/Neuron/RayTrain-Llama2.md (100%) rename website/docs/{gen-ai => ai}/training/_category_.json (100%) rename website/docs/{gen-ai => ai}/training/img/Llama2-RayTrain-Trn1.png (100%) rename website/docs/{gen-ai => ai}/training/img/llama2-trainium.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-precomplilation1.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-precomplilation2.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-precomplilation3.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-lens.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash1.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash2.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-testdata-raydash3.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-training-progress1.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-training-progress2.png (100%) rename website/docs/{gen-ai => ai}/training/img/raytrain-training-progress3.png (100%) diff --git 
a/website/docs/gen-ai/excalidraw/llama2-raytrain.excalidraw b/website/docs/ai/excalidraw/llama2-raytrain.excalidraw similarity index 100% rename from website/docs/gen-ai/excalidraw/llama2-raytrain.excalidraw rename to website/docs/ai/excalidraw/llama2-raytrain.excalidraw diff --git a/website/docs/gen-ai/excalidraw/llama3.svg b/website/docs/ai/excalidraw/llama3.svg similarity index 100% rename from website/docs/gen-ai/excalidraw/llama3.svg rename to website/docs/ai/excalidraw/llama3.svg diff --git a/website/docs/gen-ai/excalidraw/nvidia-triton-vllm.excalidraw b/website/docs/ai/excalidraw/nvidia-triton-vllm.excalidraw similarity index 100% rename from website/docs/gen-ai/excalidraw/nvidia-triton-vllm.excalidraw rename to website/docs/ai/excalidraw/nvidia-triton-vllm.excalidraw diff --git a/website/docs/gen-ai/excalidraw/stable-diffusion-inf2.excalidraw b/website/docs/ai/excalidraw/stable-diffusion-inf2.excalidraw similarity index 100% rename from website/docs/gen-ai/excalidraw/stable-diffusion-inf2.excalidraw rename to website/docs/ai/excalidraw/stable-diffusion-inf2.excalidraw diff --git a/website/docs/gen-ai/index.md b/website/docs/ai/index.md similarity index 81% rename from website/docs/gen-ai/index.md rename to website/docs/ai/index.md index 7b531f2f9..1e5bed78d 100644 --- a/website/docs/gen-ai/index.md +++ b/website/docs/ai/index.md @@ -3,9 +3,9 @@ sidebar_position: 1 sidebar_label: Overview --- -# Generative AI on EKS +# AI on EKS -Welcome to generative AI on [Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks/), your gateway to harnessing the power of Large Language Models (LLMs) for a wide range of applications. This introduction page serves as your starting point to explore our offerings for Training, Fine-tuning, and Inference using various LLMs, including BERT-Large, Llama2, Stable Diffusion, and more. 
+Welcome to AI on [Amazon Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks/), your gateway to harnessing the power of Large Language Models (LLMs) for a wide range of applications. This introduction page serves as your starting point to explore our offerings for Training, Fine-tuning, and Inference using various LLMs, including BERT-Large, Llama2, Stable Diffusion, and more. Our platform provides multiple patterns for users to scale their generative AI workloads on EKS using a comprehensive suite of open-source ML tools/frameworks. @@ -28,4 +28,4 @@ Unlock the potential of LLMs for powerful inference tasks. Our Inference resourc ## Storage and Data Management Efficient data storage and management are fundamental to successful AI/ML operations. Our platform integrates with AWS storage solutions such as S3, EBS, EFS, and FSx to ensure scalable and reliable data handling. Utilize MLflow for model registry and versioning, and manage container images with Amazon ECR. This ensures a seamless workflow from model development to deployment, with robust data management practices to support your ML lifecycle. -Whether you're an experienced practitioner or new to the field, our generative AI on EKS capabilities empower you to harness the latest advancements in language modeling. Dive into each section to begin your journey, and explore how you can leverage these tools and frameworks to build, fine-tune, and deploy powerful AI models on Amazon EKS. +Whether you're an experienced practitioner or new to the field, our AI on EKS capabilities empower you to harness the latest advancements in language modeling. Dive into each section to begin your journey, and explore how you can leverage these tools and frameworks to build, fine-tune, and deploy powerful AI models on Amazon EKS. 
diff --git a/website/docs/gen-ai/inference/GPUs/nvidia-nim-llama3.md b/website/docs/ai/inference/GPUs/nvidia-nim-llama3.md similarity index 100% rename from website/docs/gen-ai/inference/GPUs/nvidia-nim-llama3.md rename to website/docs/ai/inference/GPUs/nvidia-nim-llama3.md diff --git a/website/docs/gen-ai/inference/GPUs/stablediffusion-gpus.md b/website/docs/ai/inference/GPUs/stablediffusion-gpus.md similarity index 100% rename from website/docs/gen-ai/inference/GPUs/stablediffusion-gpus.md rename to website/docs/ai/inference/GPUs/stablediffusion-gpus.md diff --git a/website/docs/gen-ai/inference/GPUs/vLLM-NVIDIATritonServer.md b/website/docs/ai/inference/GPUs/vLLM-NVIDIATritonServer.md similarity index 100% rename from website/docs/gen-ai/inference/GPUs/vLLM-NVIDIATritonServer.md rename to website/docs/ai/inference/GPUs/vLLM-NVIDIATritonServer.md diff --git a/website/docs/gen-ai/inference/GPUs/vLLM-rayserve.md b/website/docs/ai/inference/GPUs/vLLM-rayserve.md similarity index 100% rename from website/docs/gen-ai/inference/GPUs/vLLM-rayserve.md rename to website/docs/ai/inference/GPUs/vLLM-rayserve.md diff --git a/website/docs/gen-ai/inference/Neuron/Mistral-7b-inf2.md b/website/docs/ai/inference/Neuron/Mistral-7b-inf2.md similarity index 100% rename from website/docs/gen-ai/inference/Neuron/Mistral-7b-inf2.md rename to website/docs/ai/inference/Neuron/Mistral-7b-inf2.md diff --git a/website/docs/gen-ai/inference/Neuron/llama2-inf2.md b/website/docs/ai/inference/Neuron/llama2-inf2.md similarity index 100% rename from website/docs/gen-ai/inference/Neuron/llama2-inf2.md rename to website/docs/ai/inference/Neuron/llama2-inf2.md diff --git a/website/docs/gen-ai/inference/Neuron/llama3-inf2.md b/website/docs/ai/inference/Neuron/llama3-inf2.md similarity index 100% rename from website/docs/gen-ai/inference/Neuron/llama3-inf2.md rename to website/docs/ai/inference/Neuron/llama3-inf2.md diff --git a/website/docs/gen-ai/inference/Neuron/rayserve-ha.md 
b/website/docs/ai/inference/Neuron/rayserve-ha.md similarity index 100% rename from website/docs/gen-ai/inference/Neuron/rayserve-ha.md rename to website/docs/ai/inference/Neuron/rayserve-ha.md diff --git a/website/docs/gen-ai/inference/Neuron/stablediffusion-inf2.md b/website/docs/ai/inference/Neuron/stablediffusion-inf2.md similarity index 100% rename from website/docs/gen-ai/inference/Neuron/stablediffusion-inf2.md rename to website/docs/ai/inference/Neuron/stablediffusion-inf2.md diff --git a/website/docs/gen-ai/inference/Neuron/vllm-ray-inf2.md b/website/docs/ai/inference/Neuron/vllm-ray-inf2.md similarity index 100% rename from website/docs/gen-ai/inference/Neuron/vllm-ray-inf2.md rename to website/docs/ai/inference/Neuron/vllm-ray-inf2.md diff --git a/website/docs/gen-ai/inference/_category_.json b/website/docs/ai/inference/_category_.json similarity index 100% rename from website/docs/gen-ai/inference/_category_.json rename to website/docs/ai/inference/_category_.json diff --git a/website/docs/gen-ai/inference/img/answer-1-contd.png b/website/docs/ai/inference/img/answer-1-contd.png similarity index 100% rename from website/docs/gen-ai/inference/img/answer-1-contd.png rename to website/docs/ai/inference/img/answer-1-contd.png diff --git a/website/docs/gen-ai/inference/img/answer-1.png b/website/docs/ai/inference/img/answer-1.png similarity index 100% rename from website/docs/gen-ai/inference/img/answer-1.png rename to website/docs/ai/inference/img/answer-1.png diff --git a/website/docs/gen-ai/inference/img/excali-draw-sdxl-inf2.png b/website/docs/ai/inference/img/excali-draw-sdxl-inf2.png similarity index 100% rename from website/docs/gen-ai/inference/img/excali-draw-sdxl-inf2.png rename to website/docs/ai/inference/img/excali-draw-sdxl-inf2.png diff --git a/website/docs/gen-ai/inference/img/gradio-app-gpu.png b/website/docs/ai/inference/img/gradio-app-gpu.png similarity index 100% rename from website/docs/gen-ai/inference/img/gradio-app-gpu.png rename to 
website/docs/ai/inference/img/gradio-app-gpu.png diff --git a/website/docs/gen-ai/inference/img/gradio-llama-ai-chat.png b/website/docs/ai/inference/img/gradio-llama-ai-chat.png similarity index 100% rename from website/docs/gen-ai/inference/img/gradio-llama-ai-chat.png rename to website/docs/ai/inference/img/gradio-llama-ai-chat.png diff --git a/website/docs/gen-ai/inference/img/gradio-llama2-13b-chat.png b/website/docs/ai/inference/img/gradio-llama2-13b-chat.png similarity index 100% rename from website/docs/gen-ai/inference/img/gradio-llama2-13b-chat.png rename to website/docs/ai/inference/img/gradio-llama2-13b-chat.png diff --git a/website/docs/gen-ai/inference/img/gradio-test-ft.png b/website/docs/ai/inference/img/gradio-test-ft.png similarity index 100% rename from website/docs/gen-ai/inference/img/gradio-test-ft.png rename to website/docs/ai/inference/img/gradio-test-ft.png diff --git a/website/docs/gen-ai/inference/img/head-pod-deleted.png b/website/docs/ai/inference/img/head-pod-deleted.png similarity index 100% rename from website/docs/gen-ai/inference/img/head-pod-deleted.png rename to website/docs/ai/inference/img/head-pod-deleted.png diff --git a/website/docs/gen-ai/inference/img/llama-2-chat-ouput.png b/website/docs/ai/inference/img/llama-2-chat-ouput.png similarity index 100% rename from website/docs/gen-ai/inference/img/llama-2-chat-ouput.png rename to website/docs/ai/inference/img/llama-2-chat-ouput.png diff --git a/website/docs/gen-ai/inference/img/llama2-13b-response.png b/website/docs/ai/inference/img/llama2-13b-response.png similarity index 100% rename from website/docs/gen-ai/inference/img/llama2-13b-response.png rename to website/docs/ai/inference/img/llama2-13b-response.png diff --git a/website/docs/gen-ai/inference/img/llama2-inf2.png b/website/docs/ai/inference/img/llama2-inf2.png similarity index 100% rename from website/docs/gen-ai/inference/img/llama2-inf2.png rename to website/docs/ai/inference/img/llama2-inf2.png diff --git 
a/website/docs/gen-ai/inference/img/llama3.png b/website/docs/ai/inference/img/llama3.png similarity index 100% rename from website/docs/gen-ai/inference/img/llama3.png rename to website/docs/ai/inference/img/llama3.png diff --git a/website/docs/gen-ai/inference/img/llma27b-hg.png b/website/docs/ai/inference/img/llma27b-hg.png similarity index 100% rename from website/docs/gen-ai/inference/img/llma27b-hg.png rename to website/docs/ai/inference/img/llma27b-hg.png diff --git a/website/docs/gen-ai/inference/img/mistral-conv-1.png b/website/docs/ai/inference/img/mistral-conv-1.png similarity index 100% rename from website/docs/gen-ai/inference/img/mistral-conv-1.png rename to website/docs/ai/inference/img/mistral-conv-1.png diff --git a/website/docs/gen-ai/inference/img/mistral-conv-2.png b/website/docs/ai/inference/img/mistral-conv-2.png similarity index 100% rename from website/docs/gen-ai/inference/img/mistral-conv-2.png rename to website/docs/ai/inference/img/mistral-conv-2.png diff --git a/website/docs/gen-ai/inference/img/mistral-gradio.png b/website/docs/ai/inference/img/mistral-gradio.png similarity index 100% rename from website/docs/gen-ai/inference/img/mistral-gradio.png rename to website/docs/ai/inference/img/mistral-gradio.png diff --git a/website/docs/gen-ai/inference/img/mistral-sample-prompt-1.png b/website/docs/ai/inference/img/mistral-sample-prompt-1.png similarity index 100% rename from website/docs/gen-ai/inference/img/mistral-sample-prompt-1.png rename to website/docs/ai/inference/img/mistral-sample-prompt-1.png diff --git a/website/docs/gen-ai/inference/img/mistral7b-hg.png b/website/docs/ai/inference/img/mistral7b-hg.png similarity index 100% rename from website/docs/gen-ai/inference/img/mistral7b-hg.png rename to website/docs/ai/inference/img/mistral7b-hg.png diff --git a/website/docs/gen-ai/inference/img/neuron-monitor-cwci.png b/website/docs/ai/inference/img/neuron-monitor-cwci.png similarity index 100% rename from 
website/docs/gen-ai/inference/img/neuron-monitor-cwci.png rename to website/docs/ai/inference/img/neuron-monitor-cwci.png diff --git a/website/docs/gen-ai/inference/img/nim-dashboard-2.png b/website/docs/ai/inference/img/nim-dashboard-2.png similarity index 100% rename from website/docs/gen-ai/inference/img/nim-dashboard-2.png rename to website/docs/ai/inference/img/nim-dashboard-2.png diff --git a/website/docs/gen-ai/inference/img/nim-dashboard.png b/website/docs/ai/inference/img/nim-dashboard.png similarity index 100% rename from website/docs/gen-ai/inference/img/nim-dashboard.png rename to website/docs/ai/inference/img/nim-dashboard.png diff --git a/website/docs/gen-ai/inference/img/nim-ngc-api-key.png b/website/docs/ai/inference/img/nim-ngc-api-key.png similarity index 100% rename from website/docs/gen-ai/inference/img/nim-ngc-api-key.png rename to website/docs/ai/inference/img/nim-ngc-api-key.png diff --git a/website/docs/gen-ai/inference/img/nim-on-eks-arch.png b/website/docs/ai/inference/img/nim-on-eks-arch.png similarity index 100% rename from website/docs/gen-ai/inference/img/nim-on-eks-arch.png rename to website/docs/ai/inference/img/nim-on-eks-arch.png diff --git a/website/docs/gen-ai/inference/img/openweb-ui-nim-1.png b/website/docs/ai/inference/img/openweb-ui-nim-1.png similarity index 100% rename from website/docs/gen-ai/inference/img/openweb-ui-nim-1.png rename to website/docs/ai/inference/img/openweb-ui-nim-1.png diff --git a/website/docs/gen-ai/inference/img/openweb-ui-nim-2.png b/website/docs/ai/inference/img/openweb-ui-nim-2.png similarity index 100% rename from website/docs/gen-ai/inference/img/openweb-ui-nim-2.png rename to website/docs/ai/inference/img/openweb-ui-nim-2.png diff --git a/website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-1.png b/website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-1.png similarity index 100% rename from website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-1.png rename to 
website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-1.png diff --git a/website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-2.png b/website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-2.png similarity index 100% rename from website/docs/gen-ai/inference/img/openweb-ui-ray-vllm-inf2-2.png rename to website/docs/ai/inference/img/openweb-ui-ray-vllm-inf2-2.png diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-deployed-mistral-inf2.png b/website/docs/ai/inference/img/ray-dashboard-deployed-mistral-inf2.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-dashboard-deployed-mistral-inf2.png rename to website/docs/ai/inference/img/ray-dashboard-deployed-mistral-inf2.png diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-deploying-mistral-inf2.png b/website/docs/ai/inference/img/ray-dashboard-deploying-mistral-inf2.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-dashboard-deploying-mistral-inf2.png rename to website/docs/ai/inference/img/ray-dashboard-deploying-mistral-inf2.png diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-sdxl.png b/website/docs/ai/inference/img/ray-dashboard-sdxl.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-dashboard-sdxl.png rename to website/docs/ai/inference/img/ray-dashboard-sdxl.png diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-vllm-llama3-inf2.png b/website/docs/ai/inference/img/ray-dashboard-vllm-llama3-inf2.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-dashboard-vllm-llama3-inf2.png rename to website/docs/ai/inference/img/ray-dashboard-vllm-llama3-inf2.png diff --git a/website/docs/gen-ai/inference/img/ray-dashboard-vllm-mistral.png b/website/docs/ai/inference/img/ray-dashboard-vllm-mistral.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-dashboard-vllm-mistral.png rename to website/docs/ai/inference/img/ray-dashboard-vllm-mistral.png diff --git 
a/website/docs/gen-ai/inference/img/ray-dashboard.png b/website/docs/ai/inference/img/ray-dashboard.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-dashboard.png rename to website/docs/ai/inference/img/ray-dashboard.png diff --git a/website/docs/gen-ai/inference/img/ray-deplo-logs-vllm-mistral.png b/website/docs/ai/inference/img/ray-deplo-logs-vllm-mistral.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-deplo-logs-vllm-mistral.png rename to website/docs/ai/inference/img/ray-deplo-logs-vllm-mistral.png diff --git a/website/docs/gen-ai/inference/img/ray-grafana-dashboard.png b/website/docs/ai/inference/img/ray-grafana-dashboard.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-grafana-dashboard.png rename to website/docs/ai/inference/img/ray-grafana-dashboard.png diff --git a/website/docs/gen-ai/inference/img/ray-head-ha-1.png b/website/docs/ai/inference/img/ray-head-ha-1.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-head-ha-1.png rename to website/docs/ai/inference/img/ray-head-ha-1.png diff --git a/website/docs/gen-ai/inference/img/ray-head-ha-2.png b/website/docs/ai/inference/img/ray-head-ha-2.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-head-ha-2.png rename to website/docs/ai/inference/img/ray-head-ha-2.png diff --git a/website/docs/gen-ai/inference/img/ray-logs-vllm-llama3-inf2.png b/website/docs/ai/inference/img/ray-logs-vllm-llama3-inf2.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-logs-vllm-llama3-inf2.png rename to website/docs/ai/inference/img/ray-logs-vllm-llama3-inf2.png diff --git a/website/docs/gen-ai/inference/img/ray-prometheus.png b/website/docs/ai/inference/img/ray-prometheus.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-prometheus.png rename to website/docs/ai/inference/img/ray-prometheus.png diff --git 
a/website/docs/gen-ai/inference/img/ray-serve-gpu-sd-cluster.png b/website/docs/ai/inference/img/ray-serve-gpu-sd-cluster.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-serve-gpu-sd-cluster.png rename to website/docs/ai/inference/img/ray-serve-gpu-sd-cluster.png diff --git a/website/docs/gen-ai/inference/img/ray-serve-gpu-sd.png b/website/docs/ai/inference/img/ray-serve-gpu-sd.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-serve-gpu-sd.png rename to website/docs/ai/inference/img/ray-serve-gpu-sd.png diff --git a/website/docs/gen-ai/inference/img/ray-serve-inf2-mistral-cluster.png b/website/docs/ai/inference/img/ray-serve-inf2-mistral-cluster.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-serve-inf2-mistral-cluster.png rename to website/docs/ai/inference/img/ray-serve-inf2-mistral-cluster.png diff --git a/website/docs/gen-ai/inference/img/ray-vllm-inf2.png b/website/docs/ai/inference/img/ray-vllm-inf2.png similarity index 100% rename from website/docs/gen-ai/inference/img/ray-vllm-inf2.png rename to website/docs/ai/inference/img/ray-vllm-inf2.png diff --git a/website/docs/gen-ai/inference/img/rayserve-llama2-13b-dashboard.png b/website/docs/ai/inference/img/rayserve-llama2-13b-dashboard.png similarity index 100% rename from website/docs/gen-ai/inference/img/rayserve-llama2-13b-dashboard.png rename to website/docs/ai/inference/img/rayserve-llama2-13b-dashboard.png diff --git a/website/docs/gen-ai/inference/img/stable-diffusion-xl-gradio.png b/website/docs/ai/inference/img/stable-diffusion-xl-gradio.png similarity index 100% rename from website/docs/gen-ai/inference/img/stable-diffusion-xl-gradio.png rename to website/docs/ai/inference/img/stable-diffusion-xl-gradio.png diff --git a/website/docs/gen-ai/inference/img/stable-diffusion-xl-prompt_3.png b/website/docs/ai/inference/img/stable-diffusion-xl-prompt_3.png similarity index 100% rename from 
website/docs/gen-ai/inference/img/stable-diffusion-xl-prompt_3.png rename to website/docs/ai/inference/img/stable-diffusion-xl-prompt_3.png diff --git a/website/docs/gen-ai/inference/img/triton-architecture.png b/website/docs/ai/inference/img/triton-architecture.png similarity index 100% rename from website/docs/gen-ai/inference/img/triton-architecture.png rename to website/docs/ai/inference/img/triton-architecture.png diff --git a/website/docs/gen-ai/inference/img/triton-grafana-dash2.png b/website/docs/ai/inference/img/triton-grafana-dash2.png similarity index 100% rename from website/docs/gen-ai/inference/img/triton-grafana-dash2.png rename to website/docs/ai/inference/img/triton-grafana-dash2.png diff --git a/website/docs/gen-ai/inference/img/triton-internals.png b/website/docs/ai/inference/img/triton-internals.png similarity index 100% rename from website/docs/gen-ai/inference/img/triton-internals.png rename to website/docs/ai/inference/img/triton-internals.png diff --git a/website/docs/gen-ai/inference/img/triton-observability.png b/website/docs/ai/inference/img/triton-observability.png similarity index 100% rename from website/docs/gen-ai/inference/img/triton-observability.png rename to website/docs/ai/inference/img/triton-observability.png diff --git a/website/docs/gen-ai/inference/img/worker-pod-running.png b/website/docs/ai/inference/img/worker-pod-running.png similarity index 100% rename from website/docs/gen-ai/inference/img/worker-pod-running.png rename to website/docs/ai/inference/img/worker-pod-running.png diff --git a/website/docs/gen-ai/ml-platforms-eks.png b/website/docs/ai/ml-platforms-eks.png similarity index 100% rename from website/docs/gen-ai/ml-platforms-eks.png rename to website/docs/ai/ml-platforms-eks.png diff --git a/website/docs/gen-ai/training/GPUs/bionemo.md b/website/docs/ai/training/GPUs/bionemo.md similarity index 100% rename from website/docs/gen-ai/training/GPUs/bionemo.md rename to website/docs/ai/training/GPUs/bionemo.md diff 
--git a/website/docs/gen-ai/training/Neuron/BERT-Large.md b/website/docs/ai/training/Neuron/BERT-Large.md similarity index 100% rename from website/docs/gen-ai/training/Neuron/BERT-Large.md rename to website/docs/ai/training/Neuron/BERT-Large.md diff --git a/website/docs/gen-ai/training/Neuron/Llama2.md b/website/docs/ai/training/Neuron/Llama2.md similarity index 100% rename from website/docs/gen-ai/training/Neuron/Llama2.md rename to website/docs/ai/training/Neuron/Llama2.md diff --git a/website/docs/gen-ai/training/Neuron/RayTrain-Llama2.md b/website/docs/ai/training/Neuron/RayTrain-Llama2.md similarity index 100% rename from website/docs/gen-ai/training/Neuron/RayTrain-Llama2.md rename to website/docs/ai/training/Neuron/RayTrain-Llama2.md diff --git a/website/docs/gen-ai/training/_category_.json b/website/docs/ai/training/_category_.json similarity index 100% rename from website/docs/gen-ai/training/_category_.json rename to website/docs/ai/training/_category_.json diff --git a/website/docs/gen-ai/training/img/Llama2-RayTrain-Trn1.png b/website/docs/ai/training/img/Llama2-RayTrain-Trn1.png similarity index 100% rename from website/docs/gen-ai/training/img/Llama2-RayTrain-Trn1.png rename to website/docs/ai/training/img/Llama2-RayTrain-Trn1.png diff --git a/website/docs/gen-ai/training/img/llama2-trainium.png b/website/docs/ai/training/img/llama2-trainium.png similarity index 100% rename from website/docs/gen-ai/training/img/llama2-trainium.png rename to website/docs/ai/training/img/llama2-trainium.png diff --git a/website/docs/gen-ai/training/img/raytrain-precomplilation1.png b/website/docs/ai/training/img/raytrain-precomplilation1.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-precomplilation1.png rename to website/docs/ai/training/img/raytrain-precomplilation1.png diff --git a/website/docs/gen-ai/training/img/raytrain-precomplilation2.png b/website/docs/ai/training/img/raytrain-precomplilation2.png similarity index 100% rename 
from website/docs/gen-ai/training/img/raytrain-precomplilation2.png rename to website/docs/ai/training/img/raytrain-precomplilation2.png diff --git a/website/docs/gen-ai/training/img/raytrain-precomplilation3.png b/website/docs/ai/training/img/raytrain-precomplilation3.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-precomplilation3.png rename to website/docs/ai/training/img/raytrain-precomplilation3.png diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-lens.png b/website/docs/ai/training/img/raytrain-testdata-lens.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-testdata-lens.png rename to website/docs/ai/training/img/raytrain-testdata-lens.png diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash.png b/website/docs/ai/training/img/raytrain-testdata-raydash.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash.png rename to website/docs/ai/training/img/raytrain-testdata-raydash.png diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash1.png b/website/docs/ai/training/img/raytrain-testdata-raydash1.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash1.png rename to website/docs/ai/training/img/raytrain-testdata-raydash1.png diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash2.png b/website/docs/ai/training/img/raytrain-testdata-raydash2.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash2.png rename to website/docs/ai/training/img/raytrain-testdata-raydash2.png diff --git a/website/docs/gen-ai/training/img/raytrain-testdata-raydash3.png b/website/docs/ai/training/img/raytrain-testdata-raydash3.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-testdata-raydash3.png rename to website/docs/ai/training/img/raytrain-testdata-raydash3.png diff --git 
a/website/docs/gen-ai/training/img/raytrain-training-progress1.png b/website/docs/ai/training/img/raytrain-training-progress1.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-training-progress1.png rename to website/docs/ai/training/img/raytrain-training-progress1.png diff --git a/website/docs/gen-ai/training/img/raytrain-training-progress2.png b/website/docs/ai/training/img/raytrain-training-progress2.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-training-progress2.png rename to website/docs/ai/training/img/raytrain-training-progress2.png diff --git a/website/docs/gen-ai/training/img/raytrain-training-progress3.png b/website/docs/ai/training/img/raytrain-training-progress3.png similarity index 100% rename from website/docs/gen-ai/training/img/raytrain-training-progress3.png rename to website/docs/ai/training/img/raytrain-training-progress3.png diff --git a/website/docs/blueprints/ai-ml/index.md b/website/docs/blueprints/ai-ml/index.md index 379fa8f73..15e6ea212 100644 --- a/website/docs/blueprints/ai-ml/index.md +++ b/website/docs/blueprints/ai-ml/index.md @@ -32,15 +32,8 @@ By choosing Amazon EKS, you gain access to a robust infrastructure that can hand ## Deploying Generative AI Models on Amazon EKS -Deploying Generative AI models on Amazon EKS is supported through two major blueprints: - -- **For GPUs**: Use the [JARK stack blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jark). -- **For Neuron**: Start with the [Trainium on EKS blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/trainium). - -In addition to these, this section provides other valuable ML blueprints: +Deploying an AI stack on EKS starts with infrastructure [JARK stack blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jark). This blueprint provides a customizable environment on which to build an AI platform. 
For task specific workloads, AI on EKS has a few blueprints that preconfigures the environment: +- **For Ray**: Use the [JARK stack blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jark). - **NVIDIA Spark RAPIDS**: For Spark on GPU workloads, refer to the [NVIDIA Spark RAPIDS blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/emr-spark-rapids). - -- **JupyterHub on EKS**: Explore the [JupyterHub blueprint](https://awslabs.github.io/data-on-eks/docs/blueprints/ai-ml/jupyterhub), which showcases Time Slicing and MIG features, as well as multi-tenant configurations with profiles. This is ideal for deploying large-scale JupyterHub platforms on EKS. - - **Additional Patterns**: For other patterns using NVIDIA Triton server, NVIDIA NGC, and more, refer to the [Gen AI page](https://awslabs.github.io/data-on-eks/docs/gen-ai). diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 845b739da..6a06345c0 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -54,7 +54,7 @@ const config = { }, items: [ { type: 'doc', docId: 'introduction/intro', position: 'left', label: 'Introduction' }, - { type: 'doc', docId: 'gen-ai/index', position: 'left', label: 'Gen AI' }, + { type: 'doc', docId: 'ai/index', position: 'left', label: 'Gen AI' }, { type: 'doc', docId: 'blueprints/amazon-emr-on-eks/index', position: 'left', label: 'Blueprints' }, { type: 'doc', docId: 'bestpractices/intro', position: 'left', label: 'Best Practices' }, { type: 'doc', docId: 'benchmarks/emr-on-eks', position: 'left', label: 'Benchmarks' }, diff --git a/website/sidebars.js b/website/sidebars.js index e69016bab..01144305f 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -16,7 +16,7 @@ const sidebars = { // By default, Docusaurus generates a sidebar from the docs folder structure // docs: [{type: 'autogenerated', dirName: '.'}], // But you can create a sidebar manually - genai: [{type: 'autogenerated', 
dirName: 'gen-ai'}], + ai: [{type: 'autogenerated', dirName: 'ai'}], blueprints: [{type: 'autogenerated', dirName: 'blueprints'}], bestpractices: [{type: 'autogenerated', dirName: 'bestpractices'}], benchmarks: [{type: 'autogenerated', dirName: 'benchmarks'}], diff --git a/website/src/pages/index.js b/website/src/pages/index.js index d9b7d05d0..e0ed10e81 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -35,8 +35,8 @@ function HomepageHeader() { - Explore Gen AI + to="/docs/ai"> + AI on EKS From c66a3514efd141c16982568072b7de8c235162e7 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Wed, 29 Jan 2025 11:10:45 -0800 Subject: [PATCH 08/16] set ai stack defaults and add jupyterhub --- ai-ml/infrastructure/terraform/addons.tf | 72 ++--- ai-ml/infrastructure/terraform/cognito.tf | 224 +++++++++++++++ .../terraform/helm-values/efs/Chart.yaml | 5 + .../helm-values/efs/templates/efs-pv.yaml | 14 + .../helm-values/efs/templates/efs-pvc.yaml | 11 + .../terraform/helm-values/efs/values.yaml | 5 + .../jupyterhub-values-cognito.yaml | 264 ++++++++++++++++++ .../helm-values/jupyterhub-values-dummy.yaml | 219 +++++++++++++++ .../helm-values/jupyterhub-values-oauth.yaml | 232 +++++++++++++++ ai-ml/infrastructure/terraform/jupyterhub.tf | 163 +++++++++++ ai-ml/infrastructure/terraform/variables.tf | 74 ++++- 11 files changed, 1245 insertions(+), 38 deletions(-) create mode 100644 ai-ml/infrastructure/terraform/cognito.tf create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml create mode 100644 ai-ml/infrastructure/terraform/helm-values/efs/values.yaml create mode 100755 ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml create mode 100755 
ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml create mode 100755 ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml create mode 100644 ai-ml/infrastructure/terraform/jupyterhub.tf diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf index 9ba662a22..57e63d062 100644 --- a/ai-ml/infrastructure/terraform/addons.tf +++ b/ai-ml/infrastructure/terraform/addons.tf @@ -1,3 +1,14 @@ +# Use this data source to get the ARN of a certificate in AWS Certificate Manager (ACM) +data "aws_acm_certificate" "issued" { + count = var.jupyter_hub_auth_mechanism != "dummy" ? 1 : 0 + domain = var.acm_certificate_domain + statuses = ["ISSUED"] +} + +locals { + cognito_custom_domain = var.cognito_custom_domain +} + #--------------------------------------------------------------- # GP3 Encrypted Storage Class #--------------------------------------------------------------- @@ -83,7 +94,7 @@ module "eks_blueprints_addons" { preserve = true } } - + enable_aws_efs_csi_driver = var.enable_aws_efs_csi_driver #--------------------------------------- # AWS Load Balancer Controller Add-on #--------------------------------------- @@ -204,9 +215,21 @@ module "data_addons" { #--------------------------------------------------------------- enable_jupyterhub = var.enable_jupyterhub jupyterhub_helm_config = { - namespace = kubernetes_namespace_v1.jupyterhub.id - create_namespace = false - values = [file("${path.module}/helm-values/jupyterhub-values.yaml")] + values = [templatefile("${path.module}/helm-values/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", { + ssl_cert_arn = try(data.aws_acm_certificate.issued[0].arn, "") + jupyterdomain = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "") + authorize_url = var.oauth_domain != "" ? 
"${var.oauth_domain}/auth" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "") + token_url = var.oauth_domain != "" ? "${var.oauth_domain}/token" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/token", "") + userdata_url = var.oauth_domain != "" ? "${var.oauth_domain}/userinfo" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "") + username_key = try(var.oauth_username_key, "") + client_id = var.oauth_jupyter_client_id != "" ? var.oauth_jupyter_client_id : try(aws_cognito_user_pool_client.user_pool_client[0].id, "") + client_secret = var.oauth_jupyter_client_secret != "" ? var.oauth_jupyter_client_secret : try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "") + user_pool_id = try(aws_cognito_user_pool.pool[0].id, "") + identity_pool_id = try(aws_cognito_identity_pool.identity_pool[0].id, "") + jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name + region = var.region + })] + version = "3.2.1" } enable_volcano = var.enable_volcano @@ -551,36 +574,6 @@ module "data_addons" { ] } - -#--------------------------------------------------------------- -# Additional Resources -#--------------------------------------------------------------- - -resource "kubernetes_namespace_v1" "jupyterhub" { - metadata { - name = "jupyterhub" - } -} - - -resource "kubernetes_secret_v1" "huggingface_token" { - metadata { - name = "hf-token" - namespace = kubernetes_namespace_v1.jupyterhub.id - } - - data = { - token = var.huggingface_token - } -} - -resource "kubernetes_config_map_v1" "notebook" { - metadata { - name = "notebook" - namespace = kubernetes_namespace_v1.jupyterhub.id - } -} - #--------------------------------------------------------------- # MLflow Tracking Add-on #--------------------------------------------------------------- @@ -648,6 +641,17 @@ resource 
"kubectl_manifest" "dcgm_service" { yaml_body = file("${path.module}/monitoring/dcgm-service.yaml") } +resource "kubectl_manifest" "efs_sc" { + count = var.enable_aws_efs_csi_driver ? 1 : 0 + yaml_body = < { + event.response = { + claimsOverrideDetails: { + claimsToAddOrOverride: { + department: "engineering", + }, + }, + }; + + return event; + }; + + EOF + } +} + +resource "aws_lambda_function" "pretoken_trigger" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + function_name = "pretoken-trigger-function" + filename = data.archive_file.lambda.output_path + source_code_hash = data.archive_file.lambda.output_base64sha256 + + runtime = "nodejs18.x" + handler = "index.handler" + + role = aws_iam_role.iam_for_lambda[0].arn +} + +#--------------------------------------------------------------- +# Cognito pool, domain and client creation. +# This can be used +# Auth integration later. +#---------------------------------------------------------------- +resource "aws_cognito_user_pool" "pool" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + name = "jupyterhub-userpool" + + username_attributes = ["email"] + auto_verified_attributes = ["email"] + + password_policy { + minimum_length = 6 + } + + lambda_config { + pre_token_generation = aws_lambda_function.pretoken_trigger[0].arn + } +} + +resource "aws_cognito_user_pool_domain" "domain" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + domain = local.cognito_custom_domain + user_pool_id = aws_cognito_user_pool.pool[0].id +} + +resource "aws_cognito_user_pool_client" "user_pool_client" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 + name = "jupyter-client" + access_token_validity = 1 + token_validity_units { + access_token = "days" + } + callback_urls = ["https://${var.jupyterhub_domain}/hub/oauth_callback"] + user_pool_id = aws_cognito_user_pool.pool[0].id + allowed_oauth_flows_user_pool_client = true + allowed_oauth_flows = ["code"] + allowed_oauth_scopes = ["openid", "email"] + generate_secret = true + supported_identity_providers = [ + "COGNITO" + ] + + depends_on = [aws_cognito_user_pool_domain.domain] +} + +#--------------------------------------------------------------- +# Cognito identity pool creation. +#---------------------------------------------------------------- +resource "aws_cognito_identity_pool" "identity_pool" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + identity_pool_name = "jupyterhub-identity-pool" + allow_unauthenticated_identities = false + cognito_identity_providers { + client_id = aws_cognito_user_pool_client.user_pool_client[0].id + provider_name = aws_cognito_user_pool.pool[0].endpoint + server_side_token_check = true + } + + depends_on = [aws_cognito_user_pool_client.user_pool_client] +} + +resource "aws_s3_bucket" "jupyterhub_bucket" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + bucket_prefix = "jupyterhub-test-bucket-" +} + +resource "aws_s3_object" "engineering_object" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + bucket = aws_s3_bucket.jupyterhub_bucket[0].id + key = "engineering/" +} + +resource "aws_s3_object" "legal_object" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 + bucket = aws_s3_bucket.jupyterhub_bucket[0].id + key = "legal/" +} + +#--------------------------------------------------------------- +# IAM role for a team member from the engineering department +# In theory there would be other departments such as "legal" +#---------------------------------------------------------------- +resource "aws_iam_role" "cognito_authenticated_engineering_role" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + + name = "EngineeringTeamRole" + + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Action = ["sts:AssumeRoleWithWebIdentity", "sts:TagSession"], + Effect = "Allow", + Principal = { + Federated = "cognito-identity.amazonaws.com" + }, + Condition = { + StringEquals = { + "cognito-identity.amazonaws.com:aud" = aws_cognito_identity_pool.identity_pool[0].id + }, + "ForAnyValue:StringLike" : { + "cognito-identity.amazonaws.com:amr" : "authenticated" + } + } + } + ] + }) +} + +resource "aws_iam_role_policy" "s3_cognito_engineering_policy" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + name = "s3_cognito_engineering_policy" + role = aws_iam_role.cognito_authenticated_engineering_role[0].id + + policy = <<-EOF +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["s3:List*"], + "Resource": "*", + "Condition": { + "StringEquals": { + "s3:prefix": "$${aws:PrincipalTag/department}" + } + } + } + ] +} +EOF +} + +resource "aws_cognito_identity_pool_provider_principal_tag" "example" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id + identity_provider_name = aws_cognito_user_pool.pool[0].endpoint + use_defaults = false + principal_tags = { + department = "department" + } +} + +resource "aws_iam_policy_attachment" "s3_readonly_policy_attachment" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 + name = "S3ReadOnlyAccessAttachment" + policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" + roles = [aws_iam_role.cognito_authenticated_engineering_role[0].name] +} + +resource "aws_cognito_identity_pool_roles_attachment" "identity_pool_roles" { + count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 + identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id + roles = { + authenticated = aws_iam_role.cognito_authenticated_engineering_role[0].arn + } +} diff --git a/ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml new file mode 100644 index 000000000..e69ed7f3d --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/efs/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +name: efs +description: Helm chart for efs options on the cluster +version: 0.0.1 +appVersion: 0.0.1 diff --git a/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml new file mode 100644 index 000000000..3098ce85e --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pv.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: {{ .Values.pv.name }} +spec: + capacity: + storage: 123Gi + accessModes: + - ReadWriteMany + storageClassName: efs-sc + persistentVolumeReclaimPolicy: Retain + csi: + driver: efs.csi.aws.com + volumeHandle: {{ .Values.pv.volumeHandle }} diff --git a/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml new file mode 100644 index 000000000..1ab334f6d --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/efs/templates/efs-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.pvc.name }} +spec: + accessModes: + - ReadWriteMany + storageClassName: efs-sc + resources: + requests: + storage: 1Gi diff --git 
a/ai-ml/infrastructure/terraform/helm-values/efs/values.yaml b/ai-ml/infrastructure/terraform/helm-values/efs/values.yaml new file mode 100644 index 000000000..c0fee0a22 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/efs/values.yaml @@ -0,0 +1,5 @@ +pv: + name: efs-persist + volumeHandle: +pvc: + name: efs-persist diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml new file mode 100755 index 000000000..4e2073836 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml @@ -0,0 +1,264 @@ +hub: + db: + pvc: + storage: 50Gi + storageClassName: gp3 + authenticatePrometheus: false + command: ["sh", "-c", "pip install boto3 && jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py"] + config: + GenericOAuthenticator: + oauth_callback_url: ${jupyterdomain} + client_id: ${client_id} + client_secret: ${client_secret} + authorize_url: ${authorize_url} + token_url: ${token_url} + userdata_url: ${userdata_url} + scope: + - openid + - email + username_key: "username" + login_service : "AWS Cognito" + userdata_method: "POST" + JupyterHub: + authenticator_class: generic-oauth + extraConfig: + jupyterhub_config.py: |- + c.KubeSpawner.start_timeout = 1200 + c.Authenticator.enable_auth_state = True + + cognito_config.py: |- + import boto3 + def auth_state_hook(spawner, auth_state): + client_idp = boto3.client('cognito-idp', region_name="${region}") + auth_response = client_idp.initiate_auth( + AuthFlow="REFRESH_TOKEN_AUTH", + AuthParameters={ + "REFRESH_TOKEN": auth_state['refresh_token'], + "SECRET_HASH": "${client_secret}" + }, + ClientId="${client_id}" + ) + id_token = auth_response["AuthenticationResult"]["IdToken"] + client_identity = boto3.client("cognito-identity", region_name="${region}") + identity_response = client_identity.get_id( + IdentityPoolId="${identity_pool_id}", + Logins={ + 
f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token + } + ) + identity_id = identity_response['IdentityId'] + credentials = client_identity.get_credentials_for_identity( + IdentityId=identity_id, + Logins={ + f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token + } + ) + key = credentials["Credentials"]["AccessKeyId"] + secret = credentials["Credentials"]["SecretKey"] + token = credentials["Credentials"]["SessionToken"] + spawner.environment['AWS_ACCESS_KEY_ID'] = key + spawner.environment['AWS_SECRET_ACCESS_KEY'] = secret + spawner.environment['AWS_SESSION_TOKEN'] = token + + c.Spawner.auth_state_hook = auth_state_hook + +proxy: + https: + enabled: true + type: offload + service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} + service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' + service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 + +singleuser: + startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull + profileList: + - display_name: Data Engineering (CPU) + description: "PySpark Notebooks | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pyspark350: + display_name: "PySpark 3.5.0 + Python 3.11" + default: true + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.5.0 + pyspark341: + display_name: "PySpark 3.4.1 + Python 3.11" + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.4.1 + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 
8G + cmd: null + # NOTE: + - display_name: Trainium (trn1) + description: "Trainium | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch 1.13.1 + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + tolerations: + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: "start-singleuser.sh" + - display_name: Inferentia (inf2) + description: "Inferentia | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + tolerations: + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + cpu_guarantee: 20 + mem_guarantee: 100G + cpu_limit: 20 + mem_limit: 100G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: null + - display_name: Data Science (GPU + Time-Slicing - G5) + default: true + description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" + kubespawner_override: + # namespace: data-team-a + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_limit: 2 + mem_limit: 4G 
+ cpu_guarantee: 2 + mem_guarantee: 4G + cmd: "start-singleuser.sh" + # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. + # Hence, this profile relies on Managed node groups with GPU MIG enabled + - display_name: Data Science (GPU + MIG on P4d.24xlarge) + description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_guarantees: + nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb + # extra_resource_limits: + # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + - display_name: Data Science (GPU - P4d.24xlarge) + description: "GPU with P4d instances | Karpenter Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "8" + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + storage: + type: "static" + static: + pvcName: "efs-persist" + subPath: "{username}" + extraVolumes: + - name: jupyterhub-shared + persistentVolumeClaim: + claimName: efs-persist-shared + extraVolumeMounts: + - name: jupyterhub-shared + mountPath: /home/shared + readOnly: false + serviceAccountName: ${jupyter_single_user_sa_name} + allowPrivilegeEscalation: true + extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account + securityContext: + fsGroup: 100 + extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance + GRANT_SUDO: "yes" + 
NOTEBOOK_ARGS: "--allow-root" + CHOWN_HOME: "yes" + CHOWN_HOME_OPTS: "-R" + CHOWN_EXTRA: "/home/shared" + HUGGING_FACE_HUB_TOKEN: + valueFrom: + secretKeyRef: + name: hf-token + key: token + uid: 0 + fsGid: 0 + cmd: null + +# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html +scheduling: + userScheduler: + enabled: true + podPriority: + enabled: true + userPlaceholder: + enabled: false + replicas: 1 +# userPods: +# nodeAffinity: +# matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner + +prePuller: + hook: + enabled: false + continuous: + # NOTE: if used with Karpenter, also add user-placeholders + enabled: false + +global: + safeToShowValues: false diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml new file mode 100755 index 000000000..0d1fcdc4e --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml @@ -0,0 +1,219 @@ +hub: + db: + pvc: + storage: 50Gi + storageClassName: gp3 + authenticatePrometheus: false + +proxy: + https: + enabled: false + type: offload + service: + type: ClusterIP + # Disabled LoadBalancer type +# annotations: +# service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "ssl_cert_arn" +# service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" +# service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" +# service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" +# service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip +# service.beta.kubernetes.io/aws-load-balancer-scheme: internal +# service.beta.kubernetes.io/aws-load-balancer-type: external +# service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' +# service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 +singleuser: + startTimeout: 1200 # 20 mins 
to spin up a notebook server for GPU including the image pull + profileList: + - display_name: Elyra (CPU) + description: "Elyra Notebooks | Karpenter Autoscaling" + kubespawner_override: + image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0 + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 8G + cmd: null + - display_name: Data Engineering (CPU) + description: "PySpark Notebooks | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pyspark350: + display_name: "PySpark 3.5.0 + Python 3.11" + default: true + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.5.0 + pyspark341: + display_name: "PySpark 3.4.1 + Python 3.11" + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.4.1 + kubespawner_override: + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 8G + cmd: null + # NOTE: + - display_name: Trainium (trn1) + description: "Trainium | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch 1.13.1 + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + tolerations: + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: "start-singleuser.sh" + - display_name: Inferentia (inf2) + description: "Inferentia | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow + tensorflow-neuronx" + 
kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + tolerations: + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + cpu_guarantee: 20 + mem_guarantee: 100G + cpu_limit: 20 + mem_limit: 100G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: null + - display_name: Data Science (GPU + Time-Slicing - G5) + default: true + description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" + kubespawner_override: + # namespace: data-team-a + image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only + node_selector: + NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_limit: 2 + mem_limit: 4G + cpu_guarantee: 2 + mem_guarantee: 4G + cmd: "start-singleuser.sh" + # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. 
+ # Hence, this profile relies on Managed node groups with GPU MIG enabled + - display_name: Data Science (GPU + MIG on P4d.24xlarge) + description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_guarantees: + nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb + # extra_resource_limits: + # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + - display_name: Data Science (GPU - P4d.24xlarge) + description: "GPU with P4d instances | Karpenter Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "8" + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + storage: + type: "static" + static: + pvcName: "efs-persist" + subPath: "{username}" + extraVolumes: + - name: jupyterhub-shared + persistentVolumeClaim: + claimName: efs-persist-shared + extraVolumeMounts: + - name: jupyterhub-shared + mountPath: /home/shared + serviceAccountName: ${jupyter_single_user_sa_name} + allowPrivilegeEscalation: true + extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account + securityContext: + fsGroup: 100 + extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance + GRANT_SUDO: "yes" + NOTEBOOK_ARGS: "--allow-root" + CHOWN_HOME: "yes" + CHOWN_HOME_OPTS: "-R" + CHOWN_EXTRA: "/home/shared" + HUGGING_FACE_HUB_TOKEN: + valueFrom: + secretKeyRef: + name: hf-token + key: token + uid: 0 + fsGid: 0 + cmd: null + +# 
Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html +scheduling: + userScheduler: + enabled: true + podPriority: + enabled: true + userPlaceholder: + enabled: false + replicas: 1 +# userPods: +# nodeAffinity: +# matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner + +prePuller: + hook: + enabled: false + continuous: + # NOTE: if used with Karpenter, also add user-placeholders + enabled: false + +global: + safeToShowValues: false diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml new file mode 100755 index 000000000..486a750a8 --- /dev/null +++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml @@ -0,0 +1,232 @@ +hub: + db: + pvc: + storage: 50Gi + storageClassName: gp3 + authenticatePrometheus: false + config: + GenericOAuthenticator: + oauth_callback_url: ${jupyterdomain} + client_id: ${client_id} + client_secret: ${client_secret} + authorize_url: ${authorize_url} + token_url: ${token_url} + userdata_url: ${userdata_url} + scope: + - openid + - profile + username_key: "${username_key}" + login_service: "oauth" + allow_all: true # Allows all oauth authenticated users to use Jupyterhub. 
For finer grained control, you can use `allowed_users`: https://jupyterhub.readthedocs.io/en/stable/tutorial/getting-started/authenticators-users-basics.html#deciding-who-is-allowed + JupyterHub: + authenticator_class: generic-oauth +proxy: + https: + enabled: true + type: offload + service: + annotations: + service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} + service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" + service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' + service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 + +singleuser: + startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull + profileList: + - display_name: Elyra (CPU) + description: "Elyra Notebooks | Karpenter Autoscaling" + kubespawner_override: + image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0 + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 8G + cmd: null + - display_name: Data Engineering (CPU) + description: "PySpark Notebooks | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pyspark350: + display_name: "PySpark 3.5.0 + Python 3.11" + default: true + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.5.0 + pyspark341: + display_name: "PySpark 3.4.1 + Python 3.11" + kubespawner_override: + image: jupyter/pyspark-notebook:spark-3.4.1 + kubespawner_override: + cpu_guarantee: 2 + mem_guarantee: 8G + cpu_limit: 4 + mem_limit: 8G + cmd: null + # NOTE: + - display_name: Trainium (trn1) + description: "Trainium | Karpenter AutoScaling" + profile_options: + image: + 
display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch 1.13.1 + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + tolerations: + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: "start-singleuser.sh" + - display_name: Inferentia (inf2) + description: "Inferentia | Karpenter AutoScaling" + profile_options: + image: + display_name: "Image" + choices: + pytorch1131: + display_name: "PyTorch + torch-neuronx" + default: true + kubespawner_override: + image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest + tflow2101: + display_name: "Tensorflow + tensorflow-neuronx" + kubespawner_override: + image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest + kubespawner_override: + tolerations: + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule + cpu_guarantee: 20 + mem_guarantee: 100G + cpu_limit: 20 + mem_limit: 100G + extra_resource_limits: + aws.amazon.com/neuron: "1" + cmd: null + - display_name: Data Science (GPU + Time-Slicing - G5) + default: true + description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" + kubespawner_override: + # namespace: data-team-a + image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_limit: 2 + mem_limit: 4G + cpu_guarantee: 2 + mem_guarantee: 4G + cmd: "start-singleuser.sh" + # Karpenter doesn't support for requesting resources with MIG slices 
e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. + # Hence, this profile relies on Managed node groups with GPU MIG enabled + - display_name: Data Science (GPU + MIG on P4d.24xlarge) + description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_guarantees: + nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb + # extra_resource_limits: + # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + - display_name: Data Science (GPU - P4d.24xlarge) + description: "GPU with P4d instances | Karpenter Autoscaler" + kubespawner_override: + image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + extra_resource_limits: + nvidia.com/gpu: "8" + cpu_guarantee: 2 + mem_guarantee: 10G + cpu_limit: 2 + mem_limit: 10G + cmd: "start-singleuser.sh" + storage: + type: "static" + static: + pvcName: "efs-persist" + subPath: "home/{username}" + extraVolumes: + - name: jupyterhub-shared + persistentVolumeClaim: + claimName: efs-persist-shared + extraVolumeMounts: + - name: jupyterhub-shared + mountPath: /home/shared + readOnly: false + serviceAccountName: ${jupyter_single_user_sa_name} + allowPrivilegeEscalation: true + extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account + securityContext: + fsGroup: 100 + extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance + GRANT_SUDO: "yes" + NOTEBOOK_ARGS: "--allow-root" + CHOWN_HOME: "yes" + CHOWN_HOME_OPTS: "-R" + CHOWN_EXTRA: "/home/shared" + HUGGING_FACE_HUB_TOKEN: + 
valueFrom: + secretKeyRef: + name: hf-token + key: token + uid: 0 + fsGid: 0 + cmd: null + +# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html +scheduling: + userScheduler: + enabled: true + podPriority: + enabled: true + userPlaceholder: + enabled: false + replicas: 1 + userPods: + nodeAffinity: + matchNodePurpose: require # This will force single-user pods to use a specific karpenter provisioner + +prePuller: + hook: + enabled: false + continuous: + # NOTE: if used with Karpenter, also add user-placeholders + enabled: false + +global: + safeToShowValues: false diff --git a/ai-ml/infrastructure/terraform/jupyterhub.tf b/ai-ml/infrastructure/terraform/jupyterhub.tf new file mode 100644 index 000000000..7170c9ff9 --- /dev/null +++ b/ai-ml/infrastructure/terraform/jupyterhub.tf @@ -0,0 +1,163 @@ +#----------------------------------------------------------------------------------------- +# JupyterHub Single User IRSA, maybe that block could be incorporated in add-on registry +#----------------------------------------------------------------------------------------- +resource "kubernetes_namespace" "jupyterhub" { + metadata { + name = "jupyterhub" + } +} + +module "jupyterhub_single_user_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + + role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa" + + role_policy_arns = { + policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based on what you need to give access to your notebook instances.
+ } + + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["${kubernetes_namespace.jupyterhub.metadata[0].name}:${module.eks.cluster_name}-jupyterhub-single-user"] # Must equal "<namespace>:<name>" of kubernetes_service_account_v1.jupyterhub_single_user_sa below; spelled out rather than referenced because that service account depends on this module's role ARN + } + } +} + +resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" { + metadata { + name = "${module.eks.cluster_name}-jupyterhub-single-user" + namespace = kubernetes_namespace.jupyterhub.metadata[0].name + annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa.iam_role_arn } + } + + automount_service_account_token = true +} + +resource "kubernetes_secret_v1" "jupyterhub_single_user" { + metadata { + name = "${module.eks.cluster_name}-jupyterhub-single-user-secret" + namespace = kubernetes_namespace.jupyterhub.metadata[0].name + annotations = { + "kubernetes.io/service-account.name" = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name + "kubernetes.io/service-account.namespace" = kubernetes_namespace.jupyterhub.metadata[0].name + } + } + + type = "kubernetes.io/service-account-token" +} + +#--------------------------------------------------------------- +# EFS Filesystem for private volumes per user +# This will be replaced with Dynamic EFS provision using EFS CSI Driver +#--------------------------------------------------------------- +resource "aws_efs_file_system" "efs" { + encrypted = true + + tags = local.tags +} + +#--------------------------------------------------------------- +# module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] +# We use index 2 and 3 to select the subnets in AZ1 and AZ2 with the 100.x CIDR: +# Create EFS mount target for the 3rd subnet +resource "aws_efs_mount_target" "efs_mt_1" { + file_system_id = aws_efs_file_system.efs.id + subnet_id = module.vpc.private_subnets[2] + security_groups = [aws_security_group.efs.id] +} + +# Create EFS mount target for the 4th subnet +resource "aws_efs_mount_target" "efs_mt_2" { + file_system_id = aws_efs_file_system.efs.id + subnet_id =
module.vpc.private_subnets[3] + security_groups = [aws_security_group.efs.id] +} + +resource "aws_security_group" "efs" { + name = "${local.name}-efs" + description = "Allow inbound NFS traffic from private subnets of the VPC" + vpc_id = module.vpc.vpc_id + + ingress { + description = "Allow NFS 2049/tcp" + cidr_blocks = module.vpc.vpc_secondary_cidr_blocks + from_port = 2049 + to_port = 2049 + protocol = "tcp" + } + + tags = local.tags +} + +#--------------------------------------- +# EFS Configuration +#--------------------------------------- +module "efs_config" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "~> 1.2" + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + cluster_version = module.eks.cluster_version + oidc_provider_arn = module.eks.oidc_provider_arn + + helm_releases = { + efs = { + name = "efs" + description = "A Helm chart for storage configurations" + namespace = "jupyterhub" + create_namespace = false + chart = "${path.module}/helm-values/efs" + chart_version = "0.0.1" + values = [ + <<-EOT + pv: + name: efs-persist + volumeHandle: ${aws_efs_file_system.efs.id}:/home + pvc: + name: efs-persist + EOT + ] + } + efs-shared = { + name = "efs-shared" + description = "A Helm chart for shared storage configurations" + namespace = "jupyterhub" + create_namespace = false + chart = "${path.module}/helm-values/efs" + chart_version = "0.0.1" + values = [ + <<-EOT + pv: + name: efs-persist-shared + volumeHandle: ${aws_efs_file_system.efs.id}:/shared + pvc: + name: efs-persist-shared + EOT + ] + } + } + + depends_on = [kubernetes_namespace.jupyterhub] +} +#--------------------------------------------------------------- +# Additional Resources +#--------------------------------------------------------------- +resource "kubernetes_secret_v1" "huggingface_token" { + metadata { + name = "hf-token" + namespace = kubernetes_namespace.jupyterhub.metadata[0].name + } + + data = { + token = var.huggingface_token + 
} +} + +resource "kubernetes_config_map_v1" "notebook" { + metadata { + name = "notebook" + namespace = kubernetes_namespace.jupyterhub.metadata[0].name + } +} diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf index 60618324a..d768410c7 100644 --- a/ai-ml/infrastructure/terraform/variables.tf +++ b/ai-ml/infrastructure/terraform/variables.tf @@ -1,6 +1,6 @@ variable "name" { description = "Name of the VPC and EKS Cluster" - default = "ml-stack" + default = "ai-stack" type = string } @@ -35,13 +35,18 @@ variable "secondary_cidr_blocks" { variable "enable_aws_cloudwatch_metrics" { description = "Enable AWS Cloudwatch Metrics addon" type = bool - default = true + default = false } variable "bottlerocket_data_disk_snapshot_id" { description = "Bottlerocket Data Disk Snapshot ID" type = string default = "" } +variable "enable_aws_efs_csi_driver" { + description = "Enable AWS EFS CSI Driver" + type = bool + default = false +} variable "enable_aws_efa_k8s_device_plugin" { description = "Enable AWS EFA K8s Device Plugin" type = bool @@ -101,12 +106,12 @@ variable "enable_jupyterhub" { variable "enable_volcano" { description = "Enable Volcano" type = bool - default = true + default = false } variable "enable_kuberay_operator" { description = "Enable KubeRay Operator" type = bool - default = true + default = false } variable "huggingface_token" { description = "Hugging Face Secret Token" @@ -114,3 +119,64 @@ variable "huggingface_token" { default = "DUMMY_TOKEN_REPLACE_ME" sensitive = true } + +# Jupyterhub Specific Variables + +# NOTE: You need to use a private domain or a public domain name with an ACM certificate +# The Data-on-EKS website docs show how to create a free public domain name with an ACM certificate, for testing purposes only +# Example of a public domain name (subdomain.domain.com): eks.jupyter-doeks.dynamic-dns.com +variable "jupyter_hub_auth_mechanism" { + type = string + description = "Allowed values: cognito, dummy, oauth" + 
default = "dummy" +} + +# The domain name is public, so make sure you use a unique one when deploying. Only needed if auth mechanism is set to cognito +variable "cognito_custom_domain" { + description = "Cognito domain prefix for Hosted UI authentication endpoints" + type = string + default = "eks" +} + +# Only needed if auth mechanism is set to cognito +variable "acm_certificate_domain" { + type = string + description = "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com" + default = "" +} + +# Only needed if auth mechanism is set to cognito or oauth. This is the domain for jupyterhub +variable "jupyterhub_domain" { + type = string + description = "Enter domain name for jupyterhub to be hosted, e.g. eks.example.com. Only needed if auth mechanism is set to cognito or oauth" + default = "" +} + +# Only needed if auth mechanism is set to oauth. This is the root path for the oidc endpoints +variable "oauth_domain" { + type = string + description = "Enter oauth domain and endpoint, e.g. https://keycloak.example.com/realms/master/protocol/openid-connect. Only needed if auth mechanism is set to oauth" + default = "" +} + +# Only needed if auth mechanism is set to oauth. This is the id of the client +variable "oauth_jupyter_client_id" { + type = string + description = "Enter oauth client id for jupyterhub, e.g. jupyterhub. Only needed if auth mechanism is set to oauth" + default = "" +} + +# Only needed if auth mechanism is set to oauth. This is the secret for the client +variable "oauth_jupyter_client_secret" { + type = string + description = "Enter oauth client secret. Only needed if auth mechanism is set to oauth" + default = "" + sensitive = true +} + +# Only needed if auth mechanism is set to oauth. This is the key to use for looking up the username. +variable "oauth_username_key" { + type = string + description = "oauth field for the username. e.g. 
'preferred_username' Only needed if auth mechanism is set to oauth" + default = "" +} From 07cffe3d7e3624deb34ee7c4c946eae460247ae5 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Wed, 29 Jan 2025 12:40:20 -0800 Subject: [PATCH 09/16] jark stack consolidation --- ai-ml/infrastructure/terraform/cleanup.sh | 10 +++- ai-ml/infrastructure/terraform/install.sh | 10 +++- ai-ml/jark-stack/install.sh | 6 ++ ai-ml/jark-stack/terraform/cleanup.sh | 71 ----------------------- ai-ml/jark-stack/terraform/install.sh | 33 ----------- ai-ml/jark-stack/terraform/variables.tf | 60 ------------------- 6 files changed, 22 insertions(+), 168 deletions(-) create mode 100755 ai-ml/jark-stack/install.sh delete mode 100755 ai-ml/jark-stack/terraform/cleanup.sh delete mode 100755 ai-ml/jark-stack/terraform/install.sh delete mode 100644 ai-ml/jark-stack/terraform/variables.tf diff --git a/ai-ml/infrastructure/terraform/cleanup.sh b/ai-ml/infrastructure/terraform/cleanup.sh index b09efd384..bbf91142d 100755 --- a/ai-ml/infrastructure/terraform/cleanup.sh +++ b/ai-ml/infrastructure/terraform/cleanup.sh @@ -1,5 +1,11 @@ #!/bin/bash +TERRAFORM_COMMAND="terraform destroy -auto-approve" +# Check if blueprint.tfvars exists +if [ -f "blueprint.tfvars" ]; then + TERRAFORM_COMMAND="$TERRAFORM_COMMAND -var-file=blueprint.tfvars" +fi + echo "Destroying RayService..." # Delete the Ingress/SVC before removing the addons @@ -25,7 +31,7 @@ targets=( for target in "${targets[@]}" do echo "Destroying module $target..." 
- destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1 | tee /dev/tty) + destroy_output=$($TERRAFORM_COMMAND -target="$target" 2>&1 | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then echo "SUCCESS: Terraform destroy of $target completed successfully" else @@ -62,7 +68,7 @@ for sg in $(aws ec2 describe-security-groups \ ## Final destroy to catch any remaining resources echo "Destroying remaining resources..." -destroy_output=$(terraform destroy -var="region=$region" -auto-approve 2>&1 | tee /dev/tty) +destroy_output=$($TERRAFORM_COMMAND -var="region=$region" 2>&1 | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then echo "SUCCESS: Terraform destroy of all modules completed successfully" else diff --git a/ai-ml/infrastructure/terraform/install.sh b/ai-ml/infrastructure/terraform/install.sh index 1814a9044..af8345f6a 100755 --- a/ai-ml/infrastructure/terraform/install.sh +++ b/ai-ml/infrastructure/terraform/install.sh @@ -9,11 +9,17 @@ targets=( # Initialize Terraform terraform init -upgrade +TERRAFORM_COMMAND="terraform apply -auto-approve" +# Check if blueprint.tfvars exists +if [ -f "blueprint.tfvars" ]; then + TERRAFORM_COMMAND="$TERRAFORM_COMMAND -var-file=blueprint.tfvars" +fi + # Apply modules in sequence for target in "${targets[@]}" do echo "Applying module $target..." - apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) + apply_output=$( $TERRAFORM_COMMAND -target="$target" 2>&1 | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then echo "SUCCESS: Terraform apply of $target completed successfully" else @@ -24,7 +30,7 @@ done # Final apply to catch any remaining resources echo "Applying remaining resources..." 
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) +apply_output=$( $TERRAFORM_COMMAND 2>&1 | tee /dev/tty) if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then echo "SUCCESS: Terraform apply of all modules completed successfully" else diff --git a/ai-ml/jark-stack/install.sh b/ai-ml/jark-stack/install.sh new file mode 100755 index 000000000..77838a56e --- /dev/null +++ b/ai-ml/jark-stack/install.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copy the base infrastructure into the folder +cp -r ../infrastructure/terraform/* ./terraform + +cd terraform +source ./install.sh diff --git a/ai-ml/jark-stack/terraform/cleanup.sh b/ai-ml/jark-stack/terraform/cleanup.sh deleted file mode 100755 index b09efd384..000000000 --- a/ai-ml/jark-stack/terraform/cleanup.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -echo "Destroying RayService..." - -# Delete the Ingress/SVC before removing the addons -TMPFILE=$(mktemp) -terraform output -raw configure_kubectl > "$TMPFILE" -# check if TMPFILE contains the string "No outputs found" -if [[ ! $(cat $TMPFILE) == *"No outputs found"* ]]; then - echo "No outputs found, skipping kubectl delete" - source "$TMPFILE" - kubectl delete -f src/service/ray-service.yaml -fi - - -# List of Terraform modules to apply in sequence -targets=( - "module.data_addons" - "module.eks_blueprints_addons" - "module.eks" - "module.vpc" -) - -# Destroy modules in sequence -for target in "${targets[@]}" -do - echo "Destroying module $target..." - destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1 | tee /dev/tty) - if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then - echo "SUCCESS: Terraform destroy of $target completed successfully" - else - echo "FAILED: Terraform destroy of $target failed" - exit 1 - fi -done - -echo "Destroying Load Balancers..." 
- -for arn in $(aws resourcegroupstaggingapi get-resources \ - --resource-type-filters elasticloadbalancing:loadbalancer \ - --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \ - --query 'ResourceTagMappingList[].ResourceARN' \ - --output text); do \ - aws elbv2 delete-load-balancer --load-balancer-arn "$arn"; \ - done - -echo "Destroying Target Groups..." -for arn in $(aws resourcegroupstaggingapi get-resources \ - --resource-type-filters elasticloadbalancing:targetgroup \ - --tag-filters "Key=elbv2.k8s.aws/cluster,Values=jark-stack" \ - --query 'ResourceTagMappingList[].ResourceARN' \ - --output text); do \ - aws elbv2 delete-target-group --target-group-arn "$arn"; \ - done - -echo "Destroying Security Groups..." -for sg in $(aws ec2 describe-security-groups \ - --filters "Name=tag:elbv2.k8s.aws/cluster,Values=jark-stack" \ - --query 'SecurityGroups[].GroupId' --output text); do \ - aws ec2 delete-security-group --group-id "$sg"; \ - done - -## Final destroy to catch any remaining resources -echo "Destroying remaining resources..." -destroy_output=$(terraform destroy -var="region=$region" -auto-approve 2>&1 | tee /dev/tty) -if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete"* ]]; then - echo "SUCCESS: Terraform destroy of all modules completed successfully" -else - echo "FAILED: Terraform destroy of all modules failed" - exit 1 -fi diff --git a/ai-ml/jark-stack/terraform/install.sh b/ai-ml/jark-stack/terraform/install.sh deleted file mode 100755 index 1814a9044..000000000 --- a/ai-ml/jark-stack/terraform/install.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# List of Terraform modules to apply in sequence -targets=( - "module.vpc" - "module.eks" -) - -# Initialize Terraform -terraform init -upgrade - -# Apply modules in sequence -for target in "${targets[@]}" -do - echo "Applying module $target..." 
- apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) - if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of $target completed successfully" - else - echo "FAILED: Terraform apply of $target failed" - exit 1 - fi -done - -# Final apply to catch any remaining resources -echo "Applying remaining resources..." -apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) -if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of all modules completed successfully" -else - echo "FAILED: Terraform apply of all modules failed" - exit 1 -fi diff --git a/ai-ml/jark-stack/terraform/variables.tf b/ai-ml/jark-stack/terraform/variables.tf deleted file mode 100644 index cfc27f17c..000000000 --- a/ai-ml/jark-stack/terraform/variables.tf +++ /dev/null @@ -1,60 +0,0 @@ -variable "name" { - description = "Name of the VPC and EKS Cluster" - default = "jark-stack" - type = string -} - -# NOTE: Trainium and Inferentia are only available in us-west-2 and us-east-1 regions -variable "region" { - description = "region" - default = "us-west-2" - type = string -} - -variable "eks_cluster_version" { - description = "EKS Cluster version" - default = "1.30" - type = string -} - -# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs -variable "vpc_cidr" { - description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range" - default = "10.1.0.0/21" - type = string -} - -# RFC6598 range 100.64.0.0/10 -# Note you can only /16 range to VPC. 
You can add multiples of /16 if required -variable "secondary_cidr_blocks" { - description = "Secondary CIDR blocks to be attached to VPC" - default = ["100.64.0.0/16"] - type = list(string) -} - -variable "huggingface_token" { - description = "Hugging Face Secret Token" - type = string - default = "DUMMY_TOKEN_REPLACE_ME" - sensitive = true -} - -variable "enable_aws_efa_k8s_device_plugin" { - description = "Enable AWS EFA K8s Device Plugin" - type = bool - default = false -} - -variable "enable_kubecost" { - description = "Enable Kubecost addon" - type = bool - default = false -} - - -variable "bottlerocket_data_disk_snpashot_id" { - description = "Bottlerocket Data Disk Snapshot ID" - type = string - default = "" - -} From 5566f806f1ef70dcddb466214beeea271234e604 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Wed, 29 Jan 2025 13:04:56 -0800 Subject: [PATCH 10/16] consolidated blueprints --- ai-ml/bionemo/install.sh | 36 +- ai-ml/emr-spark-rapids/install.sh | 35 +- .../monitoring/serviceMonitor-dcgm.yaml | 26 + .../{terraform => }/src/app/Dockerfile | 0 .../jark-stack/{terraform => }/src/app/run.sh | 0 .../{terraform => }/src/app/streamlit.py | 0 .../{terraform => }/src/app/streamlit.yaml | 0 .../{terraform => }/src/notebook/Dockerfile | 0 .../src/notebook/dogbooth.ipynb | 0 .../{terraform => }/src/service/Dockerfile | 0 .../{terraform => }/src/service/dogbooth.py | 0 .../src/service/ray-service.yaml | 0 ai-ml/jupyterhub/addons.tf | 539 ------------------ ai-ml/jupyterhub/cleanup.sh | 51 -- ai-ml/jupyterhub/cognito.tf | 224 -------- .../helm/aws-for-fluentbit/values.yaml | 80 --- .../helm/cluster-autoscaler/values.yaml | 25 - ai-ml/jupyterhub/helm/efs/Chart.yaml | 5 - .../jupyterhub/helm/efs/templates/efs-pv.yaml | 12 - .../helm/efs/templates/efs-pvc.yaml | 11 - ai-ml/jupyterhub/helm/efs/values.yaml | 5 - .../jupyterhub/jupyterhub-values-cognito.yaml | 304 ---------- .../jupyterhub/jupyterhub-values-dummy.yaml | 259 
--------- .../jupyterhub/jupyterhub-values-oauth.yaml | 273 --------- .../helm/kube-prometheus-stack/values.yaml | 80 --- ai-ml/jupyterhub/helm/kubecost/values.yaml | 65 --- .../helm/metrics-server/values.yaml | 52 -- ai-ml/jupyterhub/install.sh | 35 +- ai-ml/jupyterhub/jupyterhub.tf | 143 ----- ai-ml/jupyterhub/main.tf | 157 ----- ai-ml/jupyterhub/outputs.tf | 4 - ai-ml/jupyterhub/variables.tf | 91 --- ai-ml/jupyterhub/versions.tf | 27 - ai-ml/jupyterhub/vpc.tf | 53 -- ai-ml/mlflow/cleanup.sh | 45 -- ai-ml/mlflow/install.sh | 39 +- ai-ml/mlflow/variables.tf | 44 -- 37 files changed, 42 insertions(+), 2678 deletions(-) create mode 100644 ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml rename ai-ml/jark-stack/{terraform => }/src/app/Dockerfile (100%) rename ai-ml/jark-stack/{terraform => }/src/app/run.sh (100%) rename ai-ml/jark-stack/{terraform => }/src/app/streamlit.py (100%) rename ai-ml/jark-stack/{terraform => }/src/app/streamlit.yaml (100%) rename ai-ml/jark-stack/{terraform => }/src/notebook/Dockerfile (100%) rename ai-ml/jark-stack/{terraform => }/src/notebook/dogbooth.ipynb (100%) rename ai-ml/jark-stack/{terraform => }/src/service/Dockerfile (100%) rename ai-ml/jark-stack/{terraform => }/src/service/dogbooth.py (100%) rename ai-ml/jark-stack/{terraform => }/src/service/ray-service.yaml (100%) delete mode 100755 ai-ml/jupyterhub/addons.tf delete mode 100755 ai-ml/jupyterhub/cleanup.sh delete mode 100644 ai-ml/jupyterhub/cognito.tf delete mode 100644 ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml delete mode 100644 ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml delete mode 100644 ai-ml/jupyterhub/helm/efs/Chart.yaml delete mode 100644 ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml delete mode 100644 ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml delete mode 100644 ai-ml/jupyterhub/helm/efs/values.yaml delete mode 100755 ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml delete mode 100755 
ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml delete mode 100755 ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml delete mode 100644 ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml delete mode 100644 ai-ml/jupyterhub/helm/kubecost/values.yaml delete mode 100644 ai-ml/jupyterhub/helm/metrics-server/values.yaml delete mode 100644 ai-ml/jupyterhub/jupyterhub.tf delete mode 100755 ai-ml/jupyterhub/main.tf delete mode 100755 ai-ml/jupyterhub/outputs.tf delete mode 100755 ai-ml/jupyterhub/variables.tf delete mode 100755 ai-ml/jupyterhub/versions.tf delete mode 100755 ai-ml/jupyterhub/vpc.tf delete mode 100755 ai-ml/mlflow/cleanup.sh delete mode 100644 ai-ml/mlflow/variables.tf diff --git a/ai-ml/bionemo/install.sh b/ai-ml/bionemo/install.sh index 8430565fc..77838a56e 100755 --- a/ai-ml/bionemo/install.sh +++ b/ai-ml/bionemo/install.sh @@ -1,34 +1,6 @@ #!/bin/bash +# Copy the base infrastructure into the folder +cp -r ../infrastructure/terraform/* ./terraform -# List of Terraform modules to apply in sequence -targets=( - "module.vpc" - "module.eks" -) - -# Initialize Terraform -echo "Initializing ..." -terraform init --upgrade || echo "\"terraform init\" failed" - -# Apply modules in sequence -for target in "${targets[@]}" -do - echo "Applying module $target..." - apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) - if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of $target completed successfully" - else - echo "FAILED: Terraform apply of $target failed" - exit 1 - fi -done - -# Final apply to catch any remaining resources -echo "Applying remaining resources..." 
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) -if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of all modules completed successfully" -else - echo "FAILED: Terraform apply of all modules failed" - exit 1 -fi +cd terraform +source ./install.sh diff --git a/ai-ml/emr-spark-rapids/install.sh b/ai-ml/emr-spark-rapids/install.sh index b87db5117..77838a56e 100755 --- a/ai-ml/emr-spark-rapids/install.sh +++ b/ai-ml/emr-spark-rapids/install.sh @@ -1,33 +1,6 @@ #!/bin/bash +# Copy the base infrastructure into the folder +cp -r ../infrastructure/terraform/* ./terraform -echo "Initializing ..." -terraform init || echo "\"terraform init\" failed" - -# List of Terraform modules to apply in sequence -targets=( - "module.vpc" - "module.eks" -) - -# Apply modules in sequence -for target in "${targets[@]}" -do - echo "Applying module $target..." - apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) - if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of $target completed successfully" - else - echo "FAILED: Terraform apply of $target failed" - exit 1 - fi -done - -# Final apply to catch any remaining resources -echo "Applying remaining resources..." 
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) -if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of all modules completed successfully" -else - echo "FAILED: Terraform apply of all modules failed" - exit 1 -fi +cd terraform +source ./install.sh diff --git a/ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml new file mode 100644 index 000000000..681298d40 --- /dev/null +++ b/ai-ml/infrastructure/terraform/monitoring/serviceMonitor-dcgm.yaml @@ -0,0 +1,26 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + release: kube-prometheus-stack + name: dcgm-exporter + namespace: nvidia-device-plugin +spec: + endpoints: + - honorLabels: false + interval: 15s + path: /metrics + port: metrics + relabelings: + - action: replace + sourceLabels: + - __meta_ec2_instance_id + targetLabel: instance + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/component: dcgm-exporter + app.kubernetes.io/instance: dcgm-exporter + app.kubernetes.io/name: dcgm-exporter diff --git a/ai-ml/jark-stack/terraform/src/app/Dockerfile b/ai-ml/jark-stack/src/app/Dockerfile similarity index 100% rename from ai-ml/jark-stack/terraform/src/app/Dockerfile rename to ai-ml/jark-stack/src/app/Dockerfile diff --git a/ai-ml/jark-stack/terraform/src/app/run.sh b/ai-ml/jark-stack/src/app/run.sh similarity index 100% rename from ai-ml/jark-stack/terraform/src/app/run.sh rename to ai-ml/jark-stack/src/app/run.sh diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.py b/ai-ml/jark-stack/src/app/streamlit.py similarity index 100% rename from ai-ml/jark-stack/terraform/src/app/streamlit.py rename to ai-ml/jark-stack/src/app/streamlit.py diff --git a/ai-ml/jark-stack/terraform/src/app/streamlit.yaml b/ai-ml/jark-stack/src/app/streamlit.yaml similarity index 100% rename from 
ai-ml/jark-stack/terraform/src/app/streamlit.yaml rename to ai-ml/jark-stack/src/app/streamlit.yaml diff --git a/ai-ml/jark-stack/terraform/src/notebook/Dockerfile b/ai-ml/jark-stack/src/notebook/Dockerfile similarity index 100% rename from ai-ml/jark-stack/terraform/src/notebook/Dockerfile rename to ai-ml/jark-stack/src/notebook/Dockerfile diff --git a/ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb b/ai-ml/jark-stack/src/notebook/dogbooth.ipynb similarity index 100% rename from ai-ml/jark-stack/terraform/src/notebook/dogbooth.ipynb rename to ai-ml/jark-stack/src/notebook/dogbooth.ipynb diff --git a/ai-ml/jark-stack/terraform/src/service/Dockerfile b/ai-ml/jark-stack/src/service/Dockerfile similarity index 100% rename from ai-ml/jark-stack/terraform/src/service/Dockerfile rename to ai-ml/jark-stack/src/service/Dockerfile diff --git a/ai-ml/jark-stack/terraform/src/service/dogbooth.py b/ai-ml/jark-stack/src/service/dogbooth.py similarity index 100% rename from ai-ml/jark-stack/terraform/src/service/dogbooth.py rename to ai-ml/jark-stack/src/service/dogbooth.py diff --git a/ai-ml/jark-stack/terraform/src/service/ray-service.yaml b/ai-ml/jark-stack/src/service/ray-service.yaml similarity index 100% rename from ai-ml/jark-stack/terraform/src/service/ray-service.yaml rename to ai-ml/jark-stack/src/service/ray-service.yaml diff --git a/ai-ml/jupyterhub/addons.tf b/ai-ml/jupyterhub/addons.tf deleted file mode 100755 index ed8f7db44..000000000 --- a/ai-ml/jupyterhub/addons.tf +++ /dev/null @@ -1,539 +0,0 @@ -# Use this data source to get the ARN of a certificate in AWS Certificate Manager (ACM) -data "aws_acm_certificate" "issued" { - count = var.jupyter_hub_auth_mechanism != "dummy" ? 
1 : 0 - domain = var.acm_certificate_domain - statuses = ["ISSUED"] -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -locals { - cognito_custom_domain = var.cognito_custom_domain -} - -#--------------------------------------------------------------- -# GP3 Encrypted Storage Class -#--------------------------------------------------------------- -resource "kubernetes_annotations" "disable_gp2" { - annotations = { - "storageclass.kubernetes.io/is-default-class" : "false" - } - api_version = "storage.k8s.io/v1" - kind = "StorageClass" - metadata { - name = "gp2" - } - force = true - - depends_on = [module.eks.eks_cluster_id] -} - -resource "kubernetes_storage_class" "default_gp3" { - metadata { - name = "gp3" - annotations = { - "storageclass.kubernetes.io/is-default-class" : "true" - } - } - - storage_provisioner = "ebs.csi.aws.com" - reclaim_policy = "Delete" - allow_volume_expansion = true - volume_binding_mode = "WaitForFirstConsumer" - parameters = { - fsType = "ext4" - encrypted = true - type = "gp3" - } - - depends_on = [kubernetes_annotations.disable_gp2] -} - -#--------------------------------------------------------------- -# IRSA for EBS CSI Driver -#--------------------------------------------------------------- -module "ebs_csi_driver_irsa" { - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.20" - role_name_prefix = format("%s-%s", local.name, "ebs-csi-driver-") - attach_ebs_csi_policy = true - oidc_providers = { - main = { - provider_arn = module.eks.oidc_provider_arn - namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - tags = local.tags -} - -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.2" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - 
#--------------------------------------- - # Amazon EKS Managed Add-ons - #--------------------------------------- - eks_addons = { - aws-ebs-csi-driver = { - service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn - } - coredns = {} - kube-proxy = {} - # VPC CNI uses worker node IAM role policies - vpc-cni = {} - } - - #--------------------------------------- - # Metrics Server - #--------------------------------------- - enable_metrics_server = true - metrics_server = { - timeout = "300" - values = [templatefile("${path.module}/helm/metrics-server/values.yaml", {})] - } - - #--------------------------------------- - # Cluster Autoscaler - #--------------------------------------- - enable_cluster_autoscaler = true - cluster_autoscaler = { - timeout = "300" - create_role = true - values = [templatefile("${path.module}/helm/cluster-autoscaler/values.yaml", { - aws_region = var.region, - eks_cluster_id = module.eks.cluster_name - })] - } - - #--------------------------------------- - # Karpenter Autoscaler for EKS Cluster - #--------------------------------------- - enable_karpenter = true - karpenter_enable_spot_termination = true - karpenter_node = { - iam_role_additional_policies = { - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - karpenter = { - chart_version = "0.37.0" - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - #--------------------------------------- - # AWS Load Balancer Controller Add-on - #--------------------------------------- - enable_aws_load_balancer_controller = true - # turn off the mutating webhook for services because we are using - # service.beta.kubernetes.io/aws-load-balancer-type: external - aws_load_balancer_controller = { - set = [{ - name = "enableServiceMutatorWebhook" - value = "false" - }] - } - - #--------------------------------------- - # Prometheus and 
Grafana stack - #--------------------------------------- - #--------------------------------------------------------------- - # Install Monitoring Stack with Prometheus and Grafana - # 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack` - # 2- Grafana Admin user: admin - # 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id --region $AWS_REGION --query "SecretString" --output text` - #--------------------------------------------------------------- - enable_kube_prometheus_stack = true - kube_prometheus_stack = { - values = [templatefile("${path.module}/helm/kube-prometheus-stack/values.yaml", {})] - chart_version = "48.1.1" - set_sensitive = [ - { - name = "grafana.adminPassword" - value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string - } - ], - } - #--------------------------------------- - # AWS for FluentBit - #--------------------------------------- - enable_aws_for_fluentbit = true - aws_for_fluentbit_cw_log_group = { - use_name_prefix = false - name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group - retention_in_days = 30 - } - aws_for_fluentbit = { - values = [templatefile("${path.module}/helm/aws-for-fluentbit/values.yaml", { - region = local.region, - cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs" - cluster_name = module.eks.cluster_name - })] - } - - tags = local.tags -} - -#--------------------------------------------------------------- -# Data on EKS Kubernetes Addons -#--------------------------------------------------------------- -module "eks_data_addons" { - source = "aws-ia/eks-data-addons/aws" - version = "1.33.0" # ensure to update this to the latest/desired version - - oidc_provider_arn = module.eks.oidc_provider_arn - - #--------------------------------------------------------------- - # Enable Neuron Device Plugin - #--------------------------------------------------------------- - 
enable_aws_neuron_device_plugin = true - - #--------------------------------------------------------------- - # NVIDIA Device Plugin Add-on - #--------------------------------------------------------------- - enable_nvidia_device_plugin = true - nvidia_device_plugin_helm_config = { - version = "v0.15.0" - name = "nvidia-device-plugin" - values = [ - <<-EOT - mixedStrategy: "mixed" - config: - map: - default: |- - version: v1 - flags: - migStrategy: none - sharing: - timeSlicing: - resources: - - name: nvidia.com/gpu - replicas: 4 - nvidia-a100g: |- - version: v1 - flags: - migStrategy: mixed - sharing: - timeSlicing: - resources: - - name: nvidia.com/gpu - replicas: 8 - - name: nvidia.com/mig-1g.5gb - replicas: 2 - - name: nvidia.com/mig-2g.10gb - replicas: 2 - - name: nvidia.com/mig-3g.20gb - replicas: 3 - - name: nvidia.com/mig-7g.40gb - replicas: 7 - gfd: - enabled: true - nfd: - worker: - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - operator: "Exists" - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - EOT - ] - } - - #--------------------------------------------------------------- - # JupyterHub Add-on - #--------------------------------------------------------------- - enable_jupyterhub = true - jupyterhub_helm_config = { - values = [templatefile("${path.module}/helm/jupyterhub/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", { - ssl_cert_arn = try(data.aws_acm_certificate.issued[0].arn, "") - jupyterdomain = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "") - authorize_url = var.oauth_domain != "" ? 
"${var.oauth_domain}/auth" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "") - token_url = var.oauth_domain != "" ? "${var.oauth_domain}/token" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/token", "") - userdata_url = var.oauth_domain != "" ? "${var.oauth_domain}/userinfo" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "") - username_key = try(var.oauth_username_key, "") - client_id = var.oauth_jupyter_client_id != "" ? var.oauth_jupyter_client_id : try(aws_cognito_user_pool_client.user_pool_client[0].id, "") - client_secret = var.oauth_jupyter_client_secret != "" ? var.oauth_jupyter_client_secret : try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "") - user_pool_id = try(aws_cognito_user_pool.pool[0].id, "") - identity_pool_id = try(aws_cognito_identity_pool.identity_pool[0].id, "") - jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name - region = var.region - })] - version = "3.2.1" - } - - #--------------------------------------------------------------- - # Kubecost Add-on - #--------------------------------------------------------------- - enable_kubecost = true - kubecost_helm_config = { - values = [templatefile("${path.module}/helm/kubecost/values.yaml", {})] - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } - - #--------------------------------------------------------------- - # Karpenter Resources Add-on - #--------------------------------------------------------------- - enable_karpenter_resources = true - karpenter_resources_helm_config = { - karpenter-resources-ts = { - values = [ - <<-EOT - name: gpu-ts - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", 
module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - - nodePool: - labels: - - type: karpenter - - NodePool: gpu-ts - - hub.jupyter.org/node-purpose: user - taints: - - key: hub.jupyter.org/dedicated - value: "user" - effect: "NoSchedule" - - key: nvidia.com/gpu - value: "Exists" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["g5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 60s - expireAfter: 720h - weight: 100 - EOT - ] - } - karpenter-resources-mig = { - values = [ - <<-EOT - name: gpu-mig - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - - nodePool: - labels: - - type: karpenter - - NodePool: gpu-mig - - hub.jupyter.org/node-purpose: user - taints: - - key: hub.jupyter.org/dedicated - value: "user" - effect: "NoSchedule" - - key: nvidia.com/gpu - value: "Exists" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["p4d"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["24xlarge"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 60s - expireAfter: 720h - weight: 100 - EOT - ] - } - karpenter-resources-inf = { - values = [ - <<-EOT - name: inferentia - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", 
module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - - nodePool: - labels: - - type: karpenter - - NodePool: inferentia - - hub.jupyter.org/node-purpose: user - taints: - - key: aws.amazon.com/neuroncore - value: "true" - effect: "NoSchedule" - - key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated - value: "user" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["inf2"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["8xlarge", "24xlarge"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 60s - expireAfter: 720h - weight: 100 - EOT - ] - } - karpenter-resources-trn = { - values = [ - <<-EOT - name: trainium - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[2]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - - nodePool: - labels: - - type: karpenter - - NodePool: trainium - - hub.jupyter.org/node-purpose: user - taints: - - key: aws.amazon.com/neuroncore - value: "true" - effect: "NoSchedule" - - key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated - value: "user" - effect: "NoSchedule" - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["trn1"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["32xlarge"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 60s - expireAfter: 720h - weight: 100 - EOT - ] - } - x86-cpu-karpenter = { - values = [ - <<-EOT - name: 
x86-cpu-karpenter - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - id: ${module.vpc.private_subnets[3]} - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - instanceStorePolicy: RAID0 - - nodePool: - labels: - - type: karpenter - - NodePool: default - - hub.jupyter.org/node-purpose: user - requirements: - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["m5"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: [ "xlarge", "2xlarge", "4xlarge", "8xlarge"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 60s - expireAfter: 720h - weight: 100 - EOT - ] - } - } -} - -#--------------------------------------------------------------- -# Grafana Admin credentials resources -#--------------------------------------------------------------- -data "aws_secretsmanager_secret_version" "admin_password_version" { - secret_id = aws_secretsmanager_secret.grafana.id - depends_on = [aws_secretsmanager_secret_version.grafana] -} - -resource "random_password" "grafana" { - length = 16 - special = true - override_special = "@_" -} - -#tfsec:ignore:aws-ssm-secret-use-customer-key -resource "aws_secretsmanager_secret" "grafana" { - name_prefix = "${local.name}-grafana-" - recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy -} - -resource "aws_secretsmanager_secret_version" "grafana" { - secret_id = aws_secretsmanager_secret.grafana.id - secret_string = random_password.grafana.result -} diff --git a/ai-ml/jupyterhub/cleanup.sh b/ai-ml/jupyterhub/cleanup.sh deleted file mode 100755 index 8438ddf84..000000000 --- a/ai-ml/jupyterhub/cleanup.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -set -o errexit -set -o pipefail - -targets=( - "module.eks_data_addons" - "module.eks_blueprints_addons" - "module.eks" - "module.vpc" 
-) - -#------------------------------------------- -# Helpful to delete the stuck in "Terminating" namespaces -# Rerun the cleanup.sh script to detect and delete the stuck resources -#------------------------------------------- -terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name') - -# If there are no terminating namespaces, exit the script -if [[ -z $terminating_namespaces ]]; then - echo "No terminating namespaces found" -fi - -for ns in $terminating_namespaces; do - echo "Terminating namespace: $ns" - kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - -done - -#------------------------------------------- -# Terraform destroy per module target -#------------------------------------------- -for target in "${targets[@]}" -do - destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty) - if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of $target completed successfully" - else - echo "FAILED: Terraform destroy of $target failed" - exit 1 - fi -done - -#------------------------------------------- -# Terraform destroy full -#------------------------------------------- -destroy_output=$(terraform destroy -target="$target" -auto-approve | tee /dev/tty) -if [[ ${PIPESTATUS[0]} -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of all targets completed successfully" -else - echo "FAILED: Terraform destroy of all targets failed" - exit 1 -fi diff --git a/ai-ml/jupyterhub/cognito.tf b/ai-ml/jupyterhub/cognito.tf deleted file mode 100644 index 57338986b..000000000 --- a/ai-ml/jupyterhub/cognito.tf +++ /dev/null @@ -1,224 +0,0 @@ -#--------------------------------------------------------------- -# Lambda function for pre token generation 
-#---------------------------------------------------------------- - -data "aws_iam_policy_document" "assume_role" { - statement { - effect = "Allow" - principals { - type = "Service" - identifiers = ["lambda.amazonaws.com", "cognito-idp.amazonaws.com"] - } - actions = ["sts:AssumeRole"] - } -} - -data "aws_iam_policy" "lambda_execution_policy" { - arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" -} - -resource "aws_iam_role" "iam_for_lambda" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - name = "iam_for_lambda" - assume_role_policy = data.aws_iam_policy_document.assume_role.json -} - -resource "aws_iam_role_policy_attachment" "lambda_policy_attachment" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - role = aws_iam_role.iam_for_lambda[0].name - policy_arn = data.aws_iam_policy.lambda_execution_policy.arn -} - -data "archive_file" "lambda" { - type = "zip" - output_path = "/tmp/lambda.zip" - source { - filename = "index.mjs" - content = <<-EOF - export const handler = async (event) => { - event.response = { - claimsOverrideDetails: { - claimsToAddOrOverride: { - department: "engineering", - }, - }, - }; - - return event; - }; - - EOF - } -} - -resource "aws_lambda_function" "pretoken_trigger" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - function_name = "pretoken-trigger-function" - filename = data.archive_file.lambda.output_path - source_code_hash = data.archive_file.lambda.output_base64sha256 - - runtime = "nodejs18.x" - handler = "index.handler" - - role = aws_iam_role.iam_for_lambda[0].arn -} - -#--------------------------------------------------------------- -# Cognito pool, domain and client creation. -# This can be used -# Auth integration later. -#---------------------------------------------------------------- -resource "aws_cognito_user_pool" "pool" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 - name = "jupyterhub-userpool" - - username_attributes = ["email"] - auto_verified_attributes = ["email"] - - password_policy { - minimum_length = 6 - } - - lambda_config { - pre_token_generation = aws_lambda_function.pretoken_trigger[0].arn - } -} - -resource "aws_cognito_user_pool_domain" "domain" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - domain = local.cognito_custom_domain - user_pool_id = aws_cognito_user_pool.pool[0].id -} - -resource "aws_cognito_user_pool_client" "user_pool_client" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - name = "jupyter-client" - access_token_validity = 1 - token_validity_units { - access_token = "days" - } - callback_urls = ["https://${var.jupyterhub_domain}/hub/oauth_callback"] - user_pool_id = aws_cognito_user_pool.pool[0].id - allowed_oauth_flows_user_pool_client = true - allowed_oauth_flows = ["code"] - allowed_oauth_scopes = ["openid", "email"] - generate_secret = true - supported_identity_providers = [ - "COGNITO" - ] - - depends_on = [aws_cognito_user_pool_domain.domain] -} - -#--------------------------------------------------------------- -# Cognito identity pool creation. -#---------------------------------------------------------------- -resource "aws_cognito_identity_pool" "identity_pool" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - identity_pool_name = "jupyterhub-identity-pool" - allow_unauthenticated_identities = false - cognito_identity_providers { - client_id = aws_cognito_user_pool_client.user_pool_client[0].id - provider_name = aws_cognito_user_pool.pool[0].endpoint - server_side_token_check = true - } - - depends_on = [aws_cognito_user_pool_client.user_pool_client] -} - -resource "aws_s3_bucket" "jupyterhub_bucket" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - bucket_prefix = "jupyterhub-test-bucket-" -} - -resource "aws_s3_object" "engineering_object" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 - bucket = aws_s3_bucket.jupyterhub_bucket[0].id - key = "engineering/" -} - -resource "aws_s3_object" "legal_object" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - bucket = aws_s3_bucket.jupyterhub_bucket[0].id - key = "legal/" -} - -#--------------------------------------------------------------- -# IAM role for a team member from the engineering department -# In theory there would be other departments such as "legal" -#---------------------------------------------------------------- -resource "aws_iam_role" "cognito_authenticated_engineering_role" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - - name = "EngineeringTeamRole" - - assume_role_policy = jsonencode({ - Version = "2012-10-17", - Statement = [ - { - Action = ["sts:AssumeRoleWithWebIdentity", "sts:TagSession"], - Effect = "Allow", - Principal = { - Federated = "cognito-identity.amazonaws.com" - }, - Condition = { - StringEquals = { - "cognito-identity.amazonaws.com:aud" = aws_cognito_identity_pool.identity_pool[0].id - }, - "ForAnyValue:StringLike" : { - "cognito-identity.amazonaws.com:amr" : "authenticated" - } - } - } - ] - }) -} - -resource "aws_iam_role_policy" "s3_cognito_engineering_policy" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - name = "s3_cognito_engineering_policy" - role = aws_iam_role.cognito_authenticated_engineering_role[0].id - - policy = <<-EOF -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": ["s3:List*"], - "Resource": "*", - "Condition": { - "StringEquals": { - "s3:prefix": "$${aws:PrincipalTag/department}" - } - } - } - ] -} -EOF -} - -resource "aws_cognito_identity_pool_provider_principal_tag" "example" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 
1 : 0 - identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id - identity_provider_name = aws_cognito_user_pool.pool[0].endpoint - use_defaults = false - principal_tags = { - department = "department" - } -} - -resource "aws_iam_policy_attachment" "s3_readonly_policy_attachment" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - name = "S3ReadOnlyAccessAttachment" - policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" - roles = [aws_iam_role.cognito_authenticated_engineering_role[0].name] -} - -resource "aws_cognito_identity_pool_roles_attachment" "identity_pool_roles" { - count = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0 - identity_pool_id = aws_cognito_identity_pool.identity_pool[0].id - roles = { - authenticated = aws_iam_role.cognito_authenticated_engineering_role[0].arn - } -} diff --git a/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml b/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml deleted file mode 100644 index 0f05a308b..000000000 --- a/ai-ml/jupyterhub/helm/aws-for-fluentbit/values.yaml +++ /dev/null @@ -1,80 +0,0 @@ -global: - -#hostNetwork and dnsPolicy are critical for enabling large clusters to avoid making calls to API server -# see this link https://docs.fluentbit.io/manual/pipeline/filters/kubernetes#optional-feature-using-kubelet-to-get-metadata -hostNetwork: true -dnsPolicy: ClusterFirstWithHostNet - -service: - parsersFiles: - - /fluent-bit/parsers/parsers.conf - extraParsers: | - [PARSER] - Name kubernetes - Format regex - Regex ^(?[^_]+)\.(?.+)\.(?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)\.(?[a-z0-9]{64})-$ - -input: - name: "tail" - enabled: true - tag: "systempods....-" - path: "/var/log/containers/*.log" - db: "/var/log/flb_kube.db" - memBufLimit: 5MB - skipLongLines: "On" - refreshInterval: 10 - extraInputs: | - multiline.parser docker, cri - Tag_Regex (?[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)-(?[a-z0-9]{64})\.log$ 
- - -# NOTE: extraFilters config for using Kubelet to get the Metadata instead of talking to API server for large clusters -filter: - name: "kubernetes" - match: "systempods.*" - kubeURL: "https://kubernetes.default.svc.cluster.local:443" - mergeLog: "On" - mergeLogKey: "log_processed" - keepLog: "On" - k8sLoggingParser: "On" - k8sLoggingExclude: "Off" - bufferSize: "0" - extraFilters: | - Kube_Tag_Prefix systempods. - Regex_Parser kubernetes - Labels On - Annotations Off - Use_Kubelet true - Kubelet_Port 10250 - Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token - -# CATION: Do not use `cloudwatch` plugin. This Golang Plugin is not recommended by AWS anymore instead use C plugin(`cloudWatchLogs`) for better performance. -# cloudWatch: -# enabled: false - -# This is a new high performance C Plugin for CloudWatchLogs. See docs here https://docs.fluentbit.io/manual/pipeline/outputs/cloudwatch -cloudWatchLogs: - enabled: true - match: "systempods.*" - region: ${region} - logGroupName: ${cloudwatch_log_group} - autoCreateGroup: false - extraOutputs: | - log_key log - -# Resource config for large clusters -resources: - limits: - cpu: 1000m - memory: 1500Mi - requests: - cpu: 500m - memory: 500Mi - -## Assign a PriorityClassName to pods if set -priorityClassName: system-node-critical - -# This toleration allows Daemonset pod to be scheduled on any node, regardless of their Taints. 
-tolerations: - - operator: Exists diff --git a/ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml b/ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml deleted file mode 100644 index 5a42794f2..000000000 --- a/ai-ml/jupyterhub/helm/cluster-autoscaler/values.yaml +++ /dev/null @@ -1,25 +0,0 @@ -autoDiscovery: - clusterName: ${eks_cluster_id} - -awsRegion: ${aws_region} - -cloudProvider: aws - -extraArgs: - aws-use-static-instance-list: true - -# Best practice to update the resource requests and limits for each add-on -resources: - limits: - cpu: 1000m - memory: 1G - requests: - cpu: 200m - memory: 512Mi - -# Best practice to updateStrategy for each add-on -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 diff --git a/ai-ml/jupyterhub/helm/efs/Chart.yaml b/ai-ml/jupyterhub/helm/efs/Chart.yaml deleted file mode 100644 index e69ed7f3d..000000000 --- a/ai-ml/jupyterhub/helm/efs/Chart.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v2 -name: efs -description: Helm chart for efs options on the cluster -version: 0.0.1 -appVersion: 0.0.1 diff --git a/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml b/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml deleted file mode 100644 index c10646f80..000000000 --- a/ai-ml/jupyterhub/helm/efs/templates/efs-pv.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: {{ .Values.pv.name }} -spec: - capacity: - storage: 123Gi - accessModes: - - ReadWriteMany - nfs: - server: {{ .Values.pv.dnsName }} - path: "/" diff --git a/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml b/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml deleted file mode 100644 index cd0a962d9..000000000 --- a/ai-ml/jupyterhub/helm/efs/templates/efs-pvc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: {{ .Values.pvc.name }} -spec: - accessModes: - - ReadWriteMany - storageClassName: "" - resources: - requests: - storage: 1Gi diff --git 
a/ai-ml/jupyterhub/helm/efs/values.yaml b/ai-ml/jupyterhub/helm/efs/values.yaml deleted file mode 100644 index 703735ddd..000000000 --- a/ai-ml/jupyterhub/helm/efs/values.yaml +++ /dev/null @@ -1,5 +0,0 @@ -pv: - name: efs-persist - dnsName: -pvc: - name: efs-persist diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml deleted file mode 100755 index 5088511c0..000000000 --- a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-cognito.yaml +++ /dev/null @@ -1,304 +0,0 @@ -hub: - db: - pvc: - storage: 50Gi - storageClassName: gp3 - authenticatePrometheus: false - command: ["sh", "-c", "pip install boto3 && jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py"] - config: - GenericOAuthenticator: - oauth_callback_url: ${jupyterdomain} - client_id: ${client_id} - client_secret: ${client_secret} - authorize_url: ${authorize_url} - token_url: ${token_url} - userdata_url: ${userdata_url} - scope: - - openid - - email - username_key: "username" - login_service : "AWS Cognito" - userdata_method: "POST" - JupyterHub: - authenticator_class: generic-oauth - extraConfig: - jupyterhub_config.py: |- - c.KubeSpawner.start_timeout = 1200 - c.Authenticator.enable_auth_state = True - - cognito_config.py: |- - import boto3 - def auth_state_hook(spawner, auth_state): - client_idp = boto3.client('cognito-idp', region_name="${region}") - auth_response = client_idp.initiate_auth( - AuthFlow="REFRESH_TOKEN_AUTH", - AuthParameters={ - "REFRESH_TOKEN": auth_state['refresh_token'], - "SECRET_HASH": "${client_secret}" - }, - ClientId="${client_id}" - ) - id_token = auth_response["AuthenticationResult"]["IdToken"] - client_identity = boto3.client("cognito-identity", region_name="${region}") - identity_response = client_identity.get_id( - IdentityPoolId="${identity_pool_id}", - Logins={ - f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token - } - ) - identity_id = 
identity_response['IdentityId'] - credentials = client_identity.get_credentials_for_identity( - IdentityId=identity_id, - Logins={ - f"cognito-idp.${region}.amazonaws.com/${user_pool_id}": id_token - } - ) - key = credentials["Credentials"]["AccessKeyId"] - secret = credentials["Credentials"]["SecretKey"] - token = credentials["Credentials"]["SessionToken"] - spawner.environment['AWS_ACCESS_KEY_ID'] = key - spawner.environment['AWS_SECRET_ACCESS_KEY'] = secret - spawner.environment['AWS_SESSION_TOKEN'] = token - - c.Spawner.auth_state_hook = auth_state_hook - -proxy: - https: - enabled: true - type: offload - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} - service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' - service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 - -singleuser: - startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull - profileList: - - display_name: Data Engineering (CPU) - description: "PySpark Notebooks | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pyspark350: - display_name: "PySpark 3.5.0 + Python 3.11" - default: true - kubespawner_override: - image: jupyter/pyspark-notebook:spark-3.5.0 - pyspark341: - display_name: "PySpark 3.4.1 + Python 3.11" - kubespawner_override: - image: jupyter/pyspark-notebook:spark-3.4.1 - kubespawner_override: - node_selector: - NodePool: default - cpu_guarantee: 2 - mem_guarantee: 8G - cpu_limit: 4 - mem_limit: 8G - cmd: null - # NOTE: - - 
display_name: Trainium (trn1) - description: "Trainium | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch 1.13.1 + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - NodePool: trainium - hub.jupyter.org/node-purpose: user - tolerations: - - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - extra_resource_limits: - aws.amazon.com/neuron: "1" - cmd: "start-singleuser.sh" - - display_name: Inferentia (inf2) - description: "Inferentia | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - NodePool: inferentia - hub.jupyter.org/node-purpose: user - tolerations: - - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - 
cpu_guarantee: 20 - mem_guarantee: 100G - cpu_limit: 20 - mem_limit: 100G - extra_resource_limits: - aws.amazon.com/neuron: "1" - cmd: null - - display_name: Data Science (GPU + Time-Slicing - G5) - default: true - description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" - kubespawner_override: - # namespace: data-team-a - image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only - node_selector: - NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_limits: - nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode - cpu_limit: 2 - mem_limit: 4G - cpu_guarantee: 2 - mem_guarantee: 4G - cmd: "start-singleuser.sh" - # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. 
- # Hence, this profile relies on Managed node groups with GPU MIG enabled - - display_name: Data Science (GPU + MIG on P4d.24xlarge) - description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" - kubespawner_override: - image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only - node_selector: - provisioner: cluster-autoscaler - node.kubernetes.io/instance-type: p4d.24xlarge - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_guarantees: - nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb - # extra_resource_limits: - # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - cmd: "start-singleuser.sh" - - display_name: Data Science (GPU - P4d.24xlarge) - description: "GPU with P4d instances | Karpenter Autoscaler" - kubespawner_override: - image: cschranz/gpu-jupyter:v1.5_cuda-11.6_ubuntu-20.04_python-only - node_selector: - NodePool: gpu-mig - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_limits: - nvidia.com/gpu: "8" - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - cmd: "start-singleuser.sh" - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - extraVolumeMounts: - - name: 
jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: null - -# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html -scheduling: - userScheduler: - enabled: true - podPriority: - enabled: true - userPlaceholder: - enabled: false - replicas: 1 - userPods: - nodeAffinity: - matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner - -prePuller: - hook: - enabled: false - continuous: - # NOTE: if used with Karpenter, also add user-placeholders - enabled: false - -global: - safeToShowValues: false diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml deleted file mode 100755 index 24320e22a..000000000 --- a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-dummy.yaml +++ /dev/null @@ -1,259 +0,0 @@ -hub: - db: - pvc: - storage: 50Gi - storageClassName: gp3 - authenticatePrometheus: false - -proxy: - https: - enabled: false - type: offload - service: - type: ClusterIP - # Disabled LoadBalancer type -# annotations: -# service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "ssl_cert_arn" -# service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" -# service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" -# service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" -# service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip -# 
service.beta.kubernetes.io/aws-load-balancer-scheme: internal -# service.beta.kubernetes.io/aws-load-balancer-type: external -# service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' -# service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 -singleuser: - startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull - profileList: - - display_name: Elyra (CPU) - description: "Elyra Notebooks | Karpenter Autoscaling" - kubespawner_override: - image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0 - node_selector: - NodePool: default - cpu_guarantee: 2 - mem_guarantee: 8G - cpu_limit: 4 - mem_limit: 8G - cmd: null - - display_name: Data Engineering (CPU) - description: "PySpark Notebooks | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pyspark350: - display_name: "PySpark 3.5.0 + Python 3.11" - default: true - kubespawner_override: - image: jupyter/pyspark-notebook:spark-3.5.0 - pyspark341: - display_name: "PySpark 3.4.1 + Python 3.11" - kubespawner_override: - image: jupyter/pyspark-notebook:spark-3.4.1 - kubespawner_override: - node_selector: - NodePool: default - cpu_guarantee: 2 - mem_guarantee: 8G - cpu_limit: 4 - mem_limit: 8G - cmd: null - # NOTE: - - display_name: Trainium (trn1) - description: "Trainium | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch 1.13.1 + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - NodePool: trainium - hub.jupyter.org/node-purpose: user - tolerations: - - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: 
NoSchedule - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - extra_resource_limits: - aws.amazon.com/neuron: "1" - cmd: "start-singleuser.sh" - - display_name: Inferentia (inf2) - description: "Inferentia | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - NodePool: inferentia - hub.jupyter.org/node-purpose: user - tolerations: - - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - cpu_guarantee: 20 - mem_guarantee: 100G - cpu_limit: 20 - mem_limit: 100G - extra_resource_limits: - aws.amazon.com/neuron: "1" - cmd: null - - display_name: Data Science (GPU + Time-Slicing - G5) - default: true - description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" - kubespawner_override: - # namespace: data-team-a - image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only - node_selector: - NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to 
optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_limits: - nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode - cpu_limit: 2 - mem_limit: 4G - cpu_guarantee: 2 - mem_guarantee: 4G - cmd: "start-singleuser.sh" - # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. - # Hence, this profile relies on Managed node groups with GPU MIG enabled - - display_name: Data Science (GPU + MIG on P4d.24xlarge) - description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" - kubespawner_override: - image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only - node_selector: - provisioner: cluster-autoscaler - node.kubernetes.io/instance-type: p4d.24xlarge - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_guarantees: - nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb - # extra_resource_limits: - # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - cmd: "start-singleuser.sh" - - display_name: Data Science (GPU - P4d.24xlarge) - description: "GPU with P4d instances | Karpenter Autoscaler" - kubespawner_override: - image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only - node_selector: - NodePool: gpu-mig - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs 
https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_limits: - nvidia.com/gpu: "8" - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - cmd: "start-singleuser.sh" - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - extraVolumeMounts: - - name: jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: null - -# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html -scheduling: - userScheduler: - enabled: true - podPriority: - enabled: true - userPlaceholder: - enabled: false - replicas: 1 - userPods: - nodeAffinity: - matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner - -prePuller: - hook: - enabled: false - continuous: - # NOTE: if used with Karpenter, also add user-placeholders - enabled: false - -global: - safeToShowValues: false diff --git a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml b/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml deleted file mode 100755 index 869163d22..000000000 --- a/ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml +++ /dev/null @@ -1,273 +0,0 @@ -hub: - db: - pvc: - storage: 50Gi - storageClassName: gp3 - authenticatePrometheus: false - config: - 
GenericOAuthenticator: - oauth_callback_url: ${jupyterdomain} - client_id: ${client_id} - client_secret: ${client_secret} - authorize_url: ${authorize_url} - token_url: ${token_url} - userdata_url: ${userdata_url} - scope: - - openid - - profile - username_key: "${username_key}" - login_service: "oauth" - allow_all: true # Allows all oauth authenticated users to use Jupyterhub. For finer grained control, you can use `allowed_users`: https://jupyterhub.readthedocs.io/en/stable/tutorial/getting-started/authenticators-users-basics.html#deciding-who-is-allowed - JupyterHub: - authenticator_class: generic-oauth -proxy: - https: - enabled: true - type: offload - service: - annotations: - service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn} - service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https" - service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp" - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600" - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal - service.beta.kubernetes.io/aws-load-balancer-type: external - service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: 'true' - service.beta.kubernetes.io/aws-load-balancer-ip-address-type: ipv4 - -singleuser: - startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull - profileList: - - display_name: Elyra (CPU) - description: "Elyra Notebooks | Karpenter Autoscaling" - kubespawner_override: - image: public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0 - node_selector: - NodePool: default - cpu_guarantee: 2 - mem_guarantee: 8G - cpu_limit: 4 - mem_limit: 8G - cmd: null - - display_name: Data Engineering (CPU) - description: "PySpark Notebooks | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pyspark350: - display_name: "PySpark 3.5.0 + Python 3.11" - default: true - 
kubespawner_override: - image: jupyter/pyspark-notebook:spark-3.5.0 - pyspark341: - display_name: "PySpark 3.4.1 + Python 3.11" - kubespawner_override: - image: jupyter/pyspark-notebook:spark-3.4.1 - kubespawner_override: - node_selector: - NodePool: default - cpu_guarantee: 2 - mem_guarantee: 8G - cpu_limit: 4 - mem_limit: 8G - cmd: null - # NOTE: - - display_name: Trainium (trn1) - description: "Trainium | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch 1.13.1 + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow 2.10.1 + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - NodePool: trainium - hub.jupyter.org/node-purpose: user - tolerations: - - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - extra_resource_limits: - aws.amazon.com/neuron: "1" - cmd: "start-singleuser.sh" - - display_name: Inferentia (inf2) - description: "Inferentia | Karpenter AutoScaling" - profile_options: - image: - display_name: "Image" - choices: - pytorch1131: - display_name: "PyTorch + torch-neuronx" - default: true - kubespawner_override: - image: public.ecr.aws/data-on-eks/pytorch-neuronx:latest - tflow2101: - display_name: "Tensorflow + tensorflow-neuronx" - kubespawner_override: - image: public.ecr.aws/data-on-eks/tensorflow-neuronx:latest - kubespawner_override: - node_selector: - NodePool: inferentia - hub.jupyter.org/node-purpose: user - tolerations: 
- - key: aws.amazon.com/neuroncore - operator: Exists - effect: NoSchedule - - key: aws.amazon.com/neuron - operator: Exists - effect: NoSchedule - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - cpu_guarantee: 20 - mem_guarantee: 100G - cpu_limit: 20 - mem_limit: 100G - extra_resource_limits: - aws.amazon.com/neuron: "1" - cmd: null - - display_name: Data Science (GPU + Time-Slicing - G5) - default: true - description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling" - kubespawner_override: - # namespace: data-team-a - image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only - node_selector: - NodePool: gpu-ts # TIME-SLICING: Use this config with time-slicing mode - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_limits: - nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode - cpu_limit: 2 - mem_limit: 4G - cpu_guarantee: 2 - mem_guarantee: 4G - cmd: "start-singleuser.sh" - # Karpenter doesn't support for requesting resources with MIG slices e.g., nvidia.com/mig-1g.5gb: 1, or nvidia.com/mig-2g.20gb: 1 etc. 
- # Hence, this profile relies on Managed node groups with GPU MIG enabled - - display_name: Data Science (GPU + MIG on P4d.24xlarge) - description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler" - kubespawner_override: - image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only - node_selector: - provisioner: cluster-autoscaler - node.kubernetes.io/instance-type: p4d.24xlarge - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_guarantees: - nvidia.com/mig-1g.5gb: 1 # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb - # extra_resource_limits: - # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - cmd: "start-singleuser.sh" - - display_name: Data Science (GPU - P4d.24xlarge) - description: "GPU with P4d instances | Karpenter Autoscaler" - kubespawner_override: - image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only - node_selector: - NodePool: gpu-mig - hub.jupyter.org/node-purpose: user - tolerations: - - key: "nvidia.com/gpu" - operator: "Exists" - effect: "NoSchedule" - - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - extra_resource_limits: - nvidia.com/gpu: "8" - cpu_guarantee: 2 - mem_guarantee: 10G - cpu_limit: 2 - mem_limit: 10G - cmd: "start-singleuser.sh" - storage: - type: "static" - static: - pvcName: "efs-persist" - subPath: "home/{username}" - extraVolumes: - - name: jupyterhub-shared - persistentVolumeClaim: - claimName: efs-persist-shared - extraVolumeMounts: - - name: 
jupyterhub-shared - mountPath: /home/shared - readOnly: false - serviceAccountName: ${jupyter_single_user_sa_name} - allowPrivilegeEscalation: true - extraPodConfig: # This is needed for Jovyan user running in every single pod, access the Service Account - securityContext: - fsGroup: 100 - extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance - GRANT_SUDO: "yes" - NOTEBOOK_ARGS: "--allow-root" - CHOWN_HOME: "yes" - CHOWN_HOME_OPTS: "-R" - CHOWN_EXTRA: "/home/shared" - uid: 0 - fsGid: 0 - cmd: null - -# Optimizations configured according to this doc https://z2jh.jupyter.org/en/latest/administrator/optimization.html -scheduling: - userScheduler: - enabled: true - podPriority: - enabled: true - userPlaceholder: - enabled: false - replicas: 1 - userPods: - nodeAffinity: - matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner - -prePuller: - hook: - enabled: false - continuous: - # NOTE: if used with Karpenter, also add user-placeholders - enabled: false - -global: - safeToShowValues: false diff --git a/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml b/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml deleted file mode 100644 index 1b13f6dec..000000000 --- a/ai-ml/jupyterhub/helm/kube-prometheus-stack/values.yaml +++ /dev/null @@ -1,80 +0,0 @@ -prometheus: - prometheusSpec: - resources: - requests: - memory: 4Gi - cpu: 2 - retention: 5h - scrapeInterval: 30s - evaluationInterval: 30s - scrapeTimeout: 10s - storageSpec: - volumeClaimTemplate: - metadata: - name: data - spec: - storageClassName: gp3 - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 100Gi - # Scrape Cost metrics for Kubecost and JupyterHub add-ons - additionalScrapeConfigs: - - job_name: kubecost - honor_labels: true - scrape_interval: 1m - scrape_timeout: 10s - metrics_path: /metrics - scheme: http - dns_sd_configs: - - names: - - kubecost-cost-analyzer.kubecost.svc - type: 'A' - port: 
9003 - - job_name: jupyterhub - honor_labels: true - scrape_interval: 1m - scrape_timeout: 10s - metrics_path: /hub/metrics - scheme: http - dns_sd_configs: - - names: - - hub.jupyterhub.svc - type: 'A' - port: 8081 - - job_name: gpu-metrics - scrape_interval: 1m - metrics_path: /metrics - scheme: http - kubernetes_sd_configs: - - role: endpoints - namespaces: - names: - - gpu-operator - relabel_configs: - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: kubernetes_node - -alertmanager: - enabled: false - -grafana: - enabled: true - defaultDashboardsEnabled: true - resources: - requests: - memory: 4Gi - cpu: 2 - sidecar: - datasources: - alertmanager: - enabled: false - -kube-state-metrics: - metricLabelsAllowlist: - # to select jupyterhub component pods and get the hub usernames - - pods=[app,component,hub.jupyter.org/username] - # allowing all labels is probably fine for nodes, since they don't churn much, unlike pods - - nodes=[*] diff --git a/ai-ml/jupyterhub/helm/kubecost/values.yaml b/ai-ml/jupyterhub/helm/kubecost/values.yaml deleted file mode 100644 index 0f9441497..000000000 --- a/ai-ml/jupyterhub/helm/kubecost/values.yaml +++ /dev/null @@ -1,65 +0,0 @@ - -# KubeCost WebUI -> kubectl port-forward --namespace kubecost deployment/kubecost-cost-analyzer 9090 - -global: - # pricingCsv: - # enabled: false - # location: - # provider: "AWS" - # region: "us-east-1" - # URI: s3://kc-csv-test/pricing_schema.csv # a valid file URI - # csvAccessCredentials: pricing-schema-access-secret - - # This Prometheus setup is reusing the existing Prometheus deployment - # Check for more docs under https://guide.kubecost.com/hc/en-us/articles/4407595941015 - prometheus: - fqdn: http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc:9090 - enabled: false - -# If you have node-exporter and/or KSM running on your cluster, follow this step to disable the Kubecost included versions. 
-prometheus: - nodeExporter: - enabled: false - serviceAccounts: - nodeExporter: - create: false - kubeStateMetrics: - enabled: false - -#imageVersion: prod-1.96.0 # commented to use the latest - -kubecostFrontend: - image: public.ecr.aws/kubecost/frontend - resources: - requests: - cpu: "200m" - memory: "512Mi" - -kubecostMetrics: - emitPodAnnotations: true - emitNamespaceAnnotations: true - -kubecostModel: - image: public.ecr.aws/kubecost/cost-model - resources: - requests: - cpu: "500m" - memory: "512Mi" - -# Set this to false if you're bringing your own service account. -#serviceAccount: -# create: false -# name: kubecost-cost-analyzer -# annotations: -# eks.amazonaws.com/role-arn: - -# Define persistence volume for cost-analyzer -persistentVolume: - size: 32Gi - dbSize: 32.0Gi - enabled: true # Note that setting this to false means configurations will be wiped out on pod restart. - storageClass: gp3 - # existingClaim: kubecost-cost-analyzer # a claim in the same namespace as kubecost - -grafana: - enabled: false diff --git a/ai-ml/jupyterhub/helm/metrics-server/values.yaml b/ai-ml/jupyterhub/helm/metrics-server/values.yaml deleted file mode 100644 index 026d97a6a..000000000 --- a/ai-ml/jupyterhub/helm/metrics-server/values.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# HA config for metrics-server -image: - repository: registry.k8s.io/metrics-server/metrics-server - pullPolicy: IfNotPresent - -serviceAccount: - create: true - name: metrics-server - -rbac: - create: true - pspEnabled: false - -apiService: - create: true - -podLabels: - k8s-app: metrics-server - -# HA enabled by enabling replicas to 2, updateStrategy and podDisruptionBudget to true -replicas: 2 - -updateStrategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 0 - maxUnavailable: 1 - -podDisruptionBudget: - enabled: true - minAvailable: 1 - -defaultArgs: - - --cert-dir=/tmp - - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname - - --kubelet-use-node-status-port - - 
--metric-resolution=15s - -resources: - requests: - cpu: 200m - memory: 512Mi - -affinity: - podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - k8s-app: metrics-server - namespaces: - - kube-system - topologyKey: kubernetes.io/hostname diff --git a/ai-ml/jupyterhub/install.sh b/ai-ml/jupyterhub/install.sh index b87db5117..77838a56e 100755 --- a/ai-ml/jupyterhub/install.sh +++ b/ai-ml/jupyterhub/install.sh @@ -1,33 +1,6 @@ #!/bin/bash +# Copy the base infrastructure into the folder +cp -r ../infrastructure/terraform/* ./terraform -echo "Initializing ..." -terraform init || echo "\"terraform init\" failed" - -# List of Terraform modules to apply in sequence -targets=( - "module.vpc" - "module.eks" -) - -# Apply modules in sequence -for target in "${targets[@]}" -do - echo "Applying module $target..." - apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) - if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of $target completed successfully" - else - echo "FAILED: Terraform apply of $target failed" - exit 1 - fi -done - -# Final apply to catch any remaining resources -echo "Applying remaining resources..." 
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) -if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of all modules completed successfully" -else - echo "FAILED: Terraform apply of all modules failed" - exit 1 -fi +cd terraform +source ./install.sh diff --git a/ai-ml/jupyterhub/jupyterhub.tf b/ai-ml/jupyterhub/jupyterhub.tf deleted file mode 100644 index 30809aeef..000000000 --- a/ai-ml/jupyterhub/jupyterhub.tf +++ /dev/null @@ -1,143 +0,0 @@ -#----------------------------------------------------------------------------------------- -# JupyterHub Sinlgle User IRSA, maybe that block could be incorporated in add-on registry -#----------------------------------------------------------------------------------------- -resource "kubernetes_namespace" "jupyterhub" { - metadata { - name = "jupyterhub" - } -} - -module "jupyterhub_single_user_irsa" { - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - - role_name = "${module.eks.cluster_name}-jupyterhub-single-user-sa" - - role_policy_arns = { - policy = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess" # Policy needs to be defined based in what you need to give access to your notebook instances. 
- } - - oidc_providers = { - main = { - provider_arn = module.eks.oidc_provider_arn - namespace_service_accounts = ["${kubernetes_namespace.jupyterhub.metadata[0].name}:jupyterhub-single-user"] - } - } -} - -resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" { - metadata { - name = "${module.eks.cluster_name}-jupyterhub-single-user" - namespace = kubernetes_namespace.jupyterhub.metadata[0].name - annotations = { "eks.amazonaws.com/role-arn" : module.jupyterhub_single_user_irsa.iam_role_arn } - } - - automount_service_account_token = true -} - -resource "kubernetes_secret_v1" "jupyterhub_single_user" { - metadata { - name = "${module.eks.cluster_name}-jupyterhub-single-user-secret" - namespace = kubernetes_namespace.jupyterhub.metadata[0].name - annotations = { - "kubernetes.io/service-account.name" = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name - "kubernetes.io/service-account.namespace" = kubernetes_namespace.jupyterhub.metadata[0].name - } - } - - type = "kubernetes.io/service-account-token" -} - -#--------------------------------------------------------------- -# EFS Filesystem for private volumes per user -# This will be replaced with Dynamic EFS provision using EFS CSI Driver -#--------------------------------------------------------------- -resource "aws_efs_file_system" "efs" { - encrypted = true - - tags = local.tags -} - -#--------------------------------------------------------------- -# module.vpc.private_subnets = [AZ1_10.x, AZ2_10.x, AZ1_100.x, AZ2_100.x] -# We use index 2 and 3 to select the subnet in AZ1 with the 100.x CIDR: -# Create EFS mount targets for the 3rd subnet -resource "aws_efs_mount_target" "efs_mt_1" { - file_system_id = aws_efs_file_system.efs.id - subnet_id = module.vpc.private_subnets[2] - security_groups = [aws_security_group.efs.id] -} - -# Create EFS mount target for the 4th subnet -resource "aws_efs_mount_target" "efs_mt_2" { - file_system_id = aws_efs_file_system.efs.id - subnet_id = 
module.vpc.private_subnets[3] - security_groups = [aws_security_group.efs.id] -} - -resource "aws_security_group" "efs" { - name = "${local.name}-efs" - description = "Allow inbound NFS traffic from private subnets of the VPC" - vpc_id = module.vpc.vpc_id - - ingress { - description = "Allow NFS 2049/tcp" - cidr_blocks = module.vpc.vpc_secondary_cidr_blocks - from_port = 2049 - to_port = 2049 - protocol = "tcp" - } - - tags = local.tags -} - -#--------------------------------------- -# EFS Configuration -#--------------------------------------- -module "efs_config" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.2" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - helm_releases = { - efs = { - name = "efs" - description = "A Helm chart for storage configurations" - namespace = "jupyterhub" - create_namespace = false - chart = "${path.module}/helm/efs" - chart_version = "0.0.1" - values = [ - <<-EOT - pv: - name: efs-persist - dnsName: ${aws_efs_file_system.efs.dns_name} - pvc: - name: efs-persist - EOT - ] - } - efs-shared = { - name = "efs-shared" - description = "A Helm chart for shared storage configurations" - namespace = "jupyterhub" - create_namespace = false - chart = "${path.module}/helm/efs" - chart_version = "0.0.1" - values = [ - <<-EOT - pv: - name: efs-persist-shared - dnsName: ${aws_efs_file_system.efs.dns_name} - pvc: - name: efs-persist-shared - EOT - ] - } - } - - depends_on = [kubernetes_namespace.jupyterhub] -} diff --git a/ai-ml/jupyterhub/main.tf b/ai-ml/jupyterhub/main.tf deleted file mode 100755 index dcccf3de4..000000000 --- a/ai-ml/jupyterhub/main.tf +++ /dev/null @@ -1,157 +0,0 @@ -provider "aws" { - region = local.region -} - -# Removed exec plugin as this doesn't work with Terraform Cloud and TOFU controller plugin with backstage -provider "kubernetes" { - host = 
module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} - -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_availability_zones" "available" {} - -locals { - name = var.name - region = var.region - azs = slice(data.aws_availability_zones.available.names, 0, 2) - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} - -#--------------------------------------------------------------- -# EKS Cluster -#--------------------------------------------------------------- -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.15" - - cluster_name = local.name - cluster_version = var.eks_cluster_version - - #WARNING: Avoid using this option (cluster_endpoint_public_access = true) in preprod or prod accounts. This feature is designed for sandbox accounts, simplifying cluster deployment and testing. - cluster_endpoint_public_access = true # if true, Your cluster API server is accessible from the internet. You can, optionally, limit the CIDR blocks that can access the public endpoint. - - vpc_id = module.vpc.vpc_id - # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) - - - manage_aws_auth_configmap = true - aws_auth_roles = [ - # We need to add in the Karpenter node IAM role for nodes launched by Karpenter - { - rolearn = module.eks_blueprints_addons.karpenter.node_iam_role_arn - username = "system:node:{{EC2PrivateDNSName}}" - groups = [ - "system:bootstrappers", - "system:nodes", - ] - } - ] - #--------------------------------------- - # Note: This can further restricted to specific required for each Add-on and your application - #--------------------------------------- - # Extend cluster security group rules - cluster_security_group_additional_rules = { - ingress_nodes_ephemeral_ports_tcp = { - description = "Nodes on ephemeral ports" - protocol = "tcp" - from_port = 1025 - to_port = 65535 - type = "ingress" - source_node_security_group = true - } - } - - # Extend node-to-node security group rules - node_security_group_additional_rules = { - ingress_self_all = { - description = "Node to node all ports/protocols" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - self = true - } - - # Allows Control Plane Nodes to talk to Worker nodes on all ports. Added this to simplify the example and further avoid issues with Add-ons communication with Control plane. - # This can be restricted further to specific port based on the requirement for each Add-on e.g., coreDNS 53, metrics-server 4443, spark-operator 8080, karpenter 8443 etc. 
- # Update this according to your security requirements if needed - ingress_cluster_to_node_all_traffic = { - description = "Cluster API to Nodegroup all traffic" - protocol = "-1" - from_port = 0 - to_port = 0 - type = "ingress" - source_cluster_security_group = true - } - } - - eks_managed_node_group_defaults = { - iam_role_additional_policies = { - # Not required, but used in the example to access the nodes to inspect mounted volumes - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - - eks_managed_node_groups = { - # It's recommended to have a Managed Node group for hosting critical add-ons - # It's recommended to use Karpenter to place your workloads instead of using Managed Node groups - # You can leverage nodeSelector and Taints/tolerations to distribute workloads across Managed Node group or Karpenter nodes. - core_node_group = { - name = "jupyterhub-node-group" - description = "EKS Core node group for hosting critical add-ons" - # Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the nodes/node groups will be provisioned - subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]) - - min_size = 4 - max_size = 8 - desired_size = 4 - - instance_types = ["m5.xlarge"] - - ebs_optimized = true - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 100 - volume_type = "gp3" - } - } - } - - labels = { - WorkerType = "ON_DEMAND" - NodeGroupType = "core" - } - - tags = merge(local.tags, { - Name = "core-node-grp", - "karpenter.sh/discovery" = local.name - }) - } - } -} diff --git a/ai-ml/jupyterhub/outputs.tf b/ai-ml/jupyterhub/outputs.tf deleted file mode 100755 index f6444daab..000000000 --- a/ai-ml/jupyterhub/outputs.tf +++ /dev/null @@ -1,4 +0,0 @@ -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${var.region} update-kubeconfig --name ${var.name}" -} diff --git a/ai-ml/jupyterhub/variables.tf b/ai-ml/jupyterhub/variables.tf deleted file mode 100755 index b2a2a0600..000000000 --- a/ai-ml/jupyterhub/variables.tf +++ /dev/null @@ -1,91 +0,0 @@ -variable "name" { - description = "Name of the VPC and EKS Cluster" - default = "jupyterhub-on-eks" - type = string -} - -variable "region" { - description = "Region" - type = string - default = "us-west-2" -} - -variable "eks_cluster_version" { - description = "EKS Cluster version" - default = "1.30" - type = string -} - -# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs -variable "vpc_cidr" { - description = "VPC CIDR. This should be a valid private (RFC 1918) CIDR range" - default = "10.1.0.0/21" - type = string -} - -# RFC6598 range 100.64.0.0/10 -# Note you can only /16 range to VPC. 
You can add multiples of /16 if required -variable "secondary_cidr_blocks" { - description = "Secondary CIDR blocks to be attached to VPC" - default = ["100.64.0.0/16"] - type = list(string) -} - -# NOTE: You need to use private domain or public domain name with ACM certificate -# Data-on-EKS website docs will show you how to create free public domain name with ACM certificate for testing purpose only -# Example of public domain name(..com): eks.jupyter-doeks.dynamic-dns.com -variable "jupyter_hub_auth_mechanism" { - type = string - description = "Allowed values: cognito, dummy, oauth" - default = "dummy" -} - -# Domain name is public so make sure you use a unique while deploying, Only needed if auth mechanism is set to cognito -variable "cognito_custom_domain" { - description = "Cognito domain prefix for Hosted UI authentication endpoints" - type = string - default = "eks" -} - -# Only needed if auth mechanism is set to cognito -variable "acm_certificate_domain" { - type = string - description = "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com" - default = "" -} - -# Only needed if auth mechanism is set to cognito or oauth. This is the domain for jupyterhub -variable "jupyterhub_domain" { - type = string - description = "Enter domain name for jupyterhub to be hosted, e.g. eks.example.com. Only needed if auth mechanism is set to cognito or oauth" - default = "" -} - -# Only needed if auth mechanism is set to oauth. This is the root path for the oidc endpoints -variable "oauth_domain" { - type = string - description = "Enter oauth domain and endpoint, e.g. https://keycloak.example.com/realms/master/protocol/openid-connect. Only needed if auth mechanism is set to oauth" - default = "" -} - -# Only needed if auth mechanism is set to oauth. This is the id of the client -variable "oauth_jupyter_client_id" { - type = string - description = "Enter oauth client id for jupyterhub, e.g. jupyterhub. 
Only needed if auth mechanism is set to oauth" - default = "" -} - -# Only needed if auth mechanism is set to oauth. This is the secret for the client -variable "oauth_jupyter_client_secret" { - type = string - description = "Enter oauth client secret. Only needed if auth mechanism is set to oauth" - default = "" - sensitive = true -} - -# Only needed if auth mechanism is set to oauth. This is the key to use for looking up the username. -variable "oauth_username_key" { - type = string - description = "oauth field for the username. e.g. 'preferred_username' Only needed if auth mechanism is set to oauth" - default = "" -} diff --git a/ai-ml/jupyterhub/versions.tf b/ai-ml/jupyterhub/versions.tf deleted file mode 100755 index 9b6678a5f..000000000 --- a/ai-ml/jupyterhub/versions.tf +++ /dev/null @@ -1,27 +0,0 @@ -terraform { - required_version = ">= 1.0.0" - - required_providers { - aws = { - source = "hashicorp/aws" - version = ">= 3.72" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.10" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.12.1" - } - random = { - source = "hashicorp/random" - version = "3.1.0" # Replace with the appropriate version of the random provider - } - - archive = { - source = "hashicorp/archive" - version = "2.4.0" - } - } -} diff --git a/ai-ml/jupyterhub/vpc.tf b/ai-ml/jupyterhub/vpc.tf deleted file mode 100755 index 59c3da89c..000000000 --- a/ai-ml/jupyterhub/vpc.tf +++ /dev/null @@ -1,53 +0,0 @@ -locals { - # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ - private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)] - # Routable Public subnets with NAT Gateway and Internet Gateway - # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ - 
public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)] - # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods - # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ - secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)] -} - -#--------------------------------------------------------------- -# VPC -#--------------------------------------------------------------- -# WARNING: This VPC module includes the creation of an Internet Gateway and NAT Gateway, which simplifies cluster deployment and testing, primarily intended for sandbox accounts. -# IMPORTANT: For preprod and prod use cases, it is crucial to consult with your security team and AWS architects to design a private infrastructure solution that aligns with your security requirements - -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" - - name = local.name - cidr = var.vpc_cidr - azs = local.azs - - # Secondary CIDR block attached to VPC for EKS Control Plane ENI + Nodes + Pods - secondary_cidr_blocks = var.secondary_cidr_blocks - - # 1/ EKS Data Plane secondary CIDR blocks for two subnets across two AZs for EKS Control Plane ENI + Nodes + Pods - # 2/ Two private Subnets with RFC1918 private IPv4 address range for Private NAT + NLB + Airflow + EC2 Jumphost etc. 
- private_subnets = concat(local.private_subnets, local.secondary_ip_range_private_subnets) - - # ------------------------------ - # Optional Public Subnets for NAT and IGW for PoC/Dev/Test environments - # Public Subnets can be disabled while deploying to Production and use Private NAT + TGW - public_subnets = local.public_subnets - enable_nat_gateway = true - single_nat_gateway = true - #------------------------------- - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - # Tags subnets for Karpenter auto-discovery - "karpenter.sh/discovery" = local.name - } - - tags = local.tags -} diff --git a/ai-ml/mlflow/cleanup.sh b/ai-ml/mlflow/cleanup.sh deleted file mode 100755 index 6f96c6ef5..000000000 --- a/ai-ml/mlflow/cleanup.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -set -o errexit -set -o pipefail - -targets=( - "module.eks_data_addons" - "module.eks_blueprints_addons" -) - -#------------------------------------------- -# Helpful to delete the stuck in "Terminating" namespaces -# Rerun the cleanup.sh script to detect and delete the stuck resources -#------------------------------------------- -terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name') - -# If there are no terminating namespaces, exit the script -if [[ -z $terminating_namespaces ]]; then - echo "No terminating namespaces found" -fi - -for ns in $terminating_namespaces; do - echo "Terminating namespace: $ns" - kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f - -done - -for target in "${targets[@]}" -do - terraform destroy -target="$target" -auto-approve - destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1) - if [[ $? 
-eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of $target completed successfully" - else - echo "FAILED: Terraform destroy of $target failed" - exit 1 - fi -done - -terraform destroy -auto-approve -destroy_output=$(terraform destroy -auto-approve 2>&1) -if [[ $? -eq 0 && $destroy_output == *"Destroy complete!"* ]]; then - echo "SUCCESS: Terraform destroy of all targets completed successfully" -else - echo "FAILED: Terraform destroy of all targets failed" - exit 1 -fi diff --git a/ai-ml/mlflow/install.sh b/ai-ml/mlflow/install.sh index 2832252fb..77838a56e 100755 --- a/ai-ml/mlflow/install.sh +++ b/ai-ml/mlflow/install.sh @@ -1,37 +1,6 @@ #!/bin/bash +# Copy the base infrastructure into the folder +cp -r ../infrastructure/terraform/* ./terraform -# List of Terraform modules to apply in sequence -targets=( - "module.vpc" - "module.eks" - "module.ebs_csi_driver_irsa" - "module.eks_blueprints_addons" - "module.db" -) - -# Initialize Terraform -echo "Initializing ..." -terraform init --upgrade || echo "\"terraform init\" failed" - -# Apply modules in sequence -for target in "${targets[@]}" -do - echo "Applying module $target..." - apply_output=$(terraform apply -target="$target" -auto-approve 2>&1 | tee /dev/tty) - if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of $target completed successfully" - else - echo "FAILED: Terraform apply of $target failed" - exit 1 - fi -done - -# Final apply to catch any remaining resources -echo "Applying remaining resources..." 
-apply_output=$(terraform apply -auto-approve 2>&1 | tee /dev/tty) -if [[ ${PIPESTATUS[0]} -eq 0 && $apply_output == *"Apply complete"* ]]; then - echo "SUCCESS: Terraform apply of all modules completed successfully" -else - echo "FAILED: Terraform apply of all modules failed" - exit 1 -fi +cd terraform +source ./install.sh diff --git a/ai-ml/mlflow/variables.tf b/ai-ml/mlflow/variables.tf deleted file mode 100644 index 1600e75b5..000000000 --- a/ai-ml/mlflow/variables.tf +++ /dev/null @@ -1,44 +0,0 @@ -variable "name" { - description = "Name of the VPC and EKS Cluster" - default = "mlflow-on-eks" - type = string -} - -variable "region" { - description = "Region" - type = string - default = "us-west-2" -} - -variable "eks_cluster_version" { - description = "EKS Cluster version" - default = "1.27" - type = string -} - -# VPC with 2046 IPs (10.1.0.0/21) and 2 AZs -variable "vpc_cidr" { - description = "VPC CIDR" - default = "10.1.0.0/21" - type = string -} - -# RFC6598 range 100.64.0.0/10 -# Note you can only /16 range to VPC. 
You can add multiples of /16 if required -variable "secondary_cidr_blocks" { - description = "Secondary CIDR blocks to be attached to VPC" - default = ["100.64.0.0/16"] - type = list(string) -} - -variable "enable_amazon_prometheus" { - description = "Enable AWS Managed Prometheus service" - type = bool - default = true -} - -variable "enable_mlflow_tracking" { - description = "Enable MLflow Tracking" - type = bool - default = true -} From 43f2659c5497884d60f43ab06d9f4c2edb9de1f4 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Wed, 29 Jan 2025 13:13:51 -0800 Subject: [PATCH 11/16] add missing blueprint tfvars --- ai-ml/bionemo/terraform/blueprint.tfvars | 1 + ai-ml/emr-spark-rapids/terraform/blueprint.tfvars | 3 +++ ai-ml/jark-stack/terraform/blueprint.tfvars | 8 ++++++++ ai-ml/jupyterhub/terraform/blueprint.tfvars | 6 ++++++ ai-ml/mlflow/terraform/blueprint.tfvars | 4 ++++ 5 files changed, 22 insertions(+) create mode 100644 ai-ml/bionemo/terraform/blueprint.tfvars create mode 100644 ai-ml/emr-spark-rapids/terraform/blueprint.tfvars create mode 100644 ai-ml/jark-stack/terraform/blueprint.tfvars create mode 100644 ai-ml/jupyterhub/terraform/blueprint.tfvars create mode 100644 ai-ml/mlflow/terraform/blueprint.tfvars diff --git a/ai-ml/bionemo/terraform/blueprint.tfvars b/ai-ml/bionemo/terraform/blueprint.tfvars new file mode 100644 index 000000000..a1bde59d1 --- /dev/null +++ b/ai-ml/bionemo/terraform/blueprint.tfvars @@ -0,0 +1 @@ +name = "bionemo-on-eks" diff --git a/ai-ml/emr-spark-rapids/terraform/blueprint.tfvars b/ai-ml/emr-spark-rapids/terraform/blueprint.tfvars new file mode 100644 index 000000000..df273cd86 --- /dev/null +++ b/ai-ml/emr-spark-rapids/terraform/blueprint.tfvars @@ -0,0 +1,3 @@ +name = "emr-spark-rapids" +enable_amazon_prometheus = true +enable_amazon_emr = true diff --git a/ai-ml/jark-stack/terraform/blueprint.tfvars b/ai-ml/jark-stack/terraform/blueprint.tfvars new file mode 100644 index 
000000000..125219743 --- /dev/null +++ b/ai-ml/jark-stack/terraform/blueprint.tfvars @@ -0,0 +1,8 @@ +name = "jark-stack" +enable_aws_efs_csi_driver = true +enable_aws_cloudwatch_metrics = true +enable_jupyterhub = true +enable_volcano = true +enable_kuberay_operator = true +enable_argo_workflows = true +enable_argo_events = true diff --git a/ai-ml/jupyterhub/terraform/blueprint.tfvars b/ai-ml/jupyterhub/terraform/blueprint.tfvars new file mode 100644 index 000000000..4b9562b12 --- /dev/null +++ b/ai-ml/jupyterhub/terraform/blueprint.tfvars @@ -0,0 +1,6 @@ +name = "jark-stack" +enable_aws_efs_csi_driver = true +enable_aws_cloudwatch_metrics = true +enable_jupyterhub = true +enable_volcano = true +enable_kuberay_operator = true diff --git a/ai-ml/mlflow/terraform/blueprint.tfvars b/ai-ml/mlflow/terraform/blueprint.tfvars new file mode 100644 index 000000000..41466c8a3 --- /dev/null +++ b/ai-ml/mlflow/terraform/blueprint.tfvars @@ -0,0 +1,4 @@ +name = "mlflow-on-eks" +enable_aws_cloudwatch_metrics = true +enable_amazon_prometheus = true +enable_mlflow_tracking = true From e4ed1cc1c03f01ffcd0b05f37a513422e8838a4f Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Wed, 12 Feb 2025 12:57:07 -0800 Subject: [PATCH 12/16] update fsx csi driver variable --- ai-ml/infrastructure/terraform/addons.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf index 57e63d062..a15597aaf 100644 --- a/ai-ml/infrastructure/terraform/addons.tf +++ b/ai-ml/infrastructure/terraform/addons.tf @@ -186,7 +186,7 @@ module "eks_blueprints_addons" { #--------------------------------------- # Enable FSx for Lustre CSI Driver #--------------------------------------- - enable_aws_fsx_csi_driver = var.enable_aws_efa_k8s_device_plugin + enable_aws_fsx_csi_driver = var.enable_aws_fsx_csi_driver tags = local.tags From 94b8576557f4718ea908cd5f85a5b4b02bd3b59d Mon Sep 17 
00:00:00 2001 From: Divya Gupta Date: Mon, 17 Feb 2025 13:23:26 -0500 Subject: [PATCH 13/16] missing bionemo tfvar --- ai-ml/bionemo/terraform/blueprint.tfvars | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ai-ml/bionemo/terraform/blueprint.tfvars b/ai-ml/bionemo/terraform/blueprint.tfvars index a1bde59d1..6d2b662ea 100644 --- a/ai-ml/bionemo/terraform/blueprint.tfvars +++ b/ai-ml/bionemo/terraform/blueprint.tfvars @@ -1 +1,3 @@ name = "bionemo-on-eks" +enable_aws_fsx_csi_driver= "true" +deploy_fsx_volume= "true" From 91eb0e1e499b97a7687584fbb3e12ff3e9560895 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Tue, 18 Feb 2025 07:42:38 -0800 Subject: [PATCH 14/16] add missing redis ha, torchx etcd vars --- ai-ml/infrastructure/terraform/addons.tf | 17 ++++++ .../terraform/elastic-cache-redis.tf | 57 +++++++++++++++++++ ai-ml/infrastructure/terraform/variables.tf | 11 ++++ 3 files changed, 85 insertions(+) create mode 100644 ai-ml/infrastructure/terraform/elastic-cache-redis.tf diff --git a/ai-ml/infrastructure/terraform/addons.tf b/ai-ml/infrastructure/terraform/addons.tf index a15597aaf..89f8badf6 100644 --- a/ai-ml/infrastructure/terraform/addons.tf +++ b/ai-ml/infrastructure/terraform/addons.tf @@ -603,6 +603,23 @@ module "eks_data_addons" { } } +#--------------------------------------------------------------- +# ETCD for TorchX +#--------------------------------------------------------------- +data "http" "torchx_etcd_yaml" { + url = "https://raw.githubusercontent.com/pytorch/torchx/main/resources/etcd.yaml" +} + +data "kubectl_file_documents" "torchx_etcd_yaml" { + content = data.http.torchx_etcd_yaml.response_body +} + +resource "kubectl_manifest" "torchx_etcd" { + for_each = var.enable_torchx_etcd ? 
data.kubectl_file_documents.torchx_etcd_yaml.manifests : {} + yaml_body = each.value + depends_on = [module.eks.eks_cluster_id] +} + #--------------------------------------------------------------- # Grafana Admin credentials resources # Login to AWS secrets manager with the same role as Terraform to extract the Grafana admin password with the secret name as "grafana" diff --git a/ai-ml/infrastructure/terraform/elastic-cache-redis.tf b/ai-ml/infrastructure/terraform/elastic-cache-redis.tf new file mode 100644 index 000000000..df3c3c6a8 --- /dev/null +++ b/ai-ml/infrastructure/terraform/elastic-cache-redis.tf @@ -0,0 +1,57 @@ +#------------------------------------------- +# For Rayhead High availability cluster +#------------------------------------------- +module "elasticache" { + create = var.enable_rayserve_ha_elastic_cache_redis + source = "terraform-aws-modules/elasticache/aws" + version = "1.2.0" + + cluster_id = local.name + create_cluster = true + create_replication_group = false + + engine_version = "7.1" + node_type = "cache.t4g.small" + + apply_immediately = true + + # Security Group + vpc_id = module.vpc.vpc_id + security_group_rules = { + ingress_vpc = { + # Default type is `ingress` + # Default port is based on the default engine port + description = "VPC traffic" + cidr_ipv4 = module.vpc.vpc_cidr_block + } + + ingress_from_eks_worker_node_tcp = { + description = "Ingress rule to allow TCP on port 6379 from EKS Ray Head Node" + protocol = "tcp" + from_port = 6379 + referenced_security_group_id = module.eks.node_security_group_id + to_port = 6379 + type = "ingress" + } + } + + # Subnet Group + subnet_group_name = local.name + subnet_group_description = "${title(local.name)} subnet group" + subnet_ids = module.vpc.private_subnets + + # Parameter Group + create_parameter_group = true + parameter_group_name = local.name + parameter_group_family = "redis7" + parameter_group_description = "${title(local.name)} parameter group" + parameters = [ + { + name = 
"latency-tracking" + value = "yes" + } + ] + + tags = local.tags + +} diff --git a/ai-ml/infrastructure/terraform/variables.tf b/ai-ml/infrastructure/terraform/variables.tf index d768410c7..79b891afe 100644 --- a/ai-ml/infrastructure/terraform/variables.tf +++ b/ai-ml/infrastructure/terraform/variables.tf @@ -119,6 +119,17 @@ variable "huggingface_token" { default = "DUMMY_TOKEN_REPLACE_ME" sensitive = true } +variable "enable_rayserve_ha_elastic_cache_redis" { + description = "Flag to enable Ray Head High Availability with Elastic Cache for Redis" + type = bool + default = false +} + +variable "enable_torchx_etcd" { + description = "Flag to enable etcd deployment for torchx" + type = bool + default = false +} # Jupyterhub Specific Variables From 2a352da35dc2c93d75d25c3bcfae5392579c89c9 Mon Sep 17 00:00:00 2001 From: Divya Gupta Date: Tue, 18 Feb 2025 10:47:18 -0500 Subject: [PATCH 15/16] style fix --- ai-ml/bionemo/terraform/blueprint.tfvars | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ai-ml/bionemo/terraform/blueprint.tfvars b/ai-ml/bionemo/terraform/blueprint.tfvars index 6d2b662ea..53f738698 100644 --- a/ai-ml/bionemo/terraform/blueprint.tfvars +++ b/ai-ml/bionemo/terraform/blueprint.tfvars @@ -1,3 +1,3 @@ name = "bionemo-on-eks" -enable_aws_fsx_csi_driver= "true" -deploy_fsx_volume= "true" +enable_aws_fsx_csi_driver = "true" +deploy_fsx_volume = "true" From 7898ffa50b2dcc0ffb2527a8c2d3ae253aadbb5c Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Thu, 20 Feb 2025 12:38:53 -0800 Subject: [PATCH 16/16] addressing some review comments --- ai-ml/infrastructure/terraform/eks.tf | 96 ------------------- .../jupyterhub-values-cognito.yaml | 3 - .../helm-values/jupyterhub-values-dummy.yaml | 3 - .../helm-values/jupyterhub-values-oauth.yaml | 3 - .../monitoring/neuron-monitor-daemonset.yaml | 4 + ai-ml/jupyterhub/terraform/blueprint.tfvars | 5 +- 6 files changed, 6 insertions(+), 108 deletions(-) diff 
--git a/ai-ml/infrastructure/terraform/eks.tf b/ai-ml/infrastructure/terraform/eks.tf index 169b19bac..26ac6bffa 100644 --- a/ai-ml/infrastructure/terraform/eks.tf +++ b/ai-ml/infrastructure/terraform/eks.tf @@ -118,101 +118,5 @@ module "eks" { Name = "core-node-grp" }) } - -# # GPU Nodegroup for JupyterHub Notebook and Ray Service -# gpu1 = { -# name = "gpu-node-grp" -# description = "EKS Node Group to run GPU workloads" -# # Filtering only Secondary CIDR private subnets starting with "100.". -# # Subnet IDs where the nodes/node groups will be provisioned -# subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : -# substr(cidr_block, 0, 4) == "100." ? subnet_id : null] -# ) -# -# ami_type = "AL2_x86_64_GPU" -# min_size = 0 -# max_size = 1 -# desired_size = 0 -# -# instance_types = ["g5.12xlarge"] -# -# labels = { -# WorkerType = "ON_DEMAND" -# NodeGroupType = "gpu" -# } -# -# taints = { -# gpu = { -# key = "nvidia.com/gpu" -# effect = "NO_SCHEDULE" -# operator = "EXISTS" -# } -# } -# -# tags = merge(local.tags, { -# Name = "gpu-node-grp" -# }) -# } - - # # This nodegroup can be used for P4/P5 instances with, or without, a Capacity Reservation. - # # - # gpu_p5_node_group = { - # name = "p5-gpu-node-grp" - # description = "EKS Node Group to run GPU workloads" - - # ami_type = "AL2_x86_64_GPU" - - # instance_types = ["p5.48xlarge"] - # capacity_type = "ON_DEMAND" - - # # Filtering only Secondary CIDR private subnets starting with "100.". - # # Subnet IDs where the nodes/node groups will be provisioned - # subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : - # substr(cidr_block, 0, 4) == "100." ? subnet_id : null] - # ) - - # # If you are using a Capacity Reservation, the Subnet for the instances must match AZ for the reservation. 
- # # subnet_ids = ["subnet-01234567890fds"] - # # capacity_reservation_specification = { - # # capacity_reservation_target = { - # # capacity_reservation_id = "cr-01234567890fds" - # # } - # # } - - # min_size = 1 - # max_size = 1 - # desired_size = 1 - - # # The P Series can leverage EFA devices, below we attach EFA interfaces to all of the available slots to the instance - # # we assign the host interface device_index=0, and all other interfaces device_index=1 - # # p5.48xlarge has 32 network card indexes so the range should be 31, we'll create net interfaces 0-31 - # # p4 instances have 4 network card indexes so the range should be 4, we'll create Net interfaces 0-3 - # network_interfaces = [ - # for i in range(32) : { - # associate_public_ip_address = false - # delete_on_termination = true - # device_index = i == 0 ? 0 : 1 - # network_card_index = i - # interface_type = "efa" - # } - # ] - - # # add `--local-disks raid0` to use the NVMe devices underneath the Pods, kubelet, containerd, and logs: https://github.com/awslabs/amazon-eks-ami/pull/1171 - # bootstrap_extra_args = "--local-disks raid0" - # taints = { - # gpu = { - # key = "nvidia.com/gpu" - # effect = "NO_SCHEDULE" - # operator = "EXISTS" - # } - # } - # labels = { - # WorkerType = "ON_DEMAND" - # NodeGroupType = "gpu" - # } - # tags = merge(local.tags, { - # Name = "p5-gpu-node-grp" - # }) - # } } } diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml index 4e2073836..aab0e8b9c 100755 --- a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml +++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-cognito.yaml @@ -249,9 +249,6 @@ scheduling: userPlaceholder: enabled: false replicas: 1 -# userPods: -# nodeAffinity: -# matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner prePuller: hook: diff --git 
a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml index 0d1fcdc4e..d13fa4126 100755 --- a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml +++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-dummy.yaml @@ -204,9 +204,6 @@ scheduling: userPlaceholder: enabled: false replicas: 1 -# userPods: -# nodeAffinity: -# matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner prePuller: hook: diff --git a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml index 486a750a8..bf53eca66 100755 --- a/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml +++ b/ai-ml/infrastructure/terraform/helm-values/jupyterhub-values-oauth.yaml @@ -217,9 +217,6 @@ scheduling: userPlaceholder: enabled: false replicas: 1 - userPods: - nodeAffinity: - matchNodePurpose: require # This will force single-user pods to use an specific karpenter provisioner prePuller: hook: diff --git a/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml index 2ed065546..bffd6558a 100644 --- a/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml +++ b/ai-ml/infrastructure/terraform/monitoring/neuron-monitor-daemonset.yaml @@ -38,5 +38,9 @@ spec: value: 160MiB securityContext: privileged: true + tolerations: + - key: aws.amazon.com/neuron + operator: Exists + effect: NoSchedule nodeSelector: accelerator: neuron diff --git a/ai-ml/jupyterhub/terraform/blueprint.tfvars b/ai-ml/jupyterhub/terraform/blueprint.tfvars index 4b9562b12..177a52b35 100644 --- a/ai-ml/jupyterhub/terraform/blueprint.tfvars +++ b/ai-ml/jupyterhub/terraform/blueprint.tfvars @@ -1,6 +1,5 @@ -name = "jark-stack" +name = "jupyterhub" 
enable_aws_efs_csi_driver = true enable_aws_cloudwatch_metrics = true enable_jupyterhub = true -enable_volcano = true -enable_kuberay_operator = true +