Set up GCS fault tolerance with Redis for Ray
imreddy13 committed Aug 30, 2023
1 parent 8f44477 commit 6eaa4d7
Showing 13 changed files with 466 additions and 3 deletions.
20 changes: 20 additions & 0 deletions ray-on-gke/README.md
@@ -47,6 +47,7 @@ If you need to reinstall any resources, make sure to delete this file as well.
1. `cd ../user`

2. Edit `variables.tf` with your GCP settings. The `<your user name>` that you specify will become a K8s namespace for your Ray services.
Note: To create a fault-tolerant Ray cluster, see the "Fault Tolerance" section below.

3. Run `terraform init`

@@ -136,6 +137,25 @@ prompt = (
4. This should output a generated text response.


## Fault Tolerance

The Ray cluster is not fault tolerant by default: if the Ray head node dies for any reason, the application is disrupted.
Fault tolerance via Redis can be enabled as follows:

1. `cd ray-on-gke/user`

2. Edit `variables.tf` to set `enable_fault_tolerance` to `true` (a sketch of the variable follows these steps).

3. Run `terraform init` and `terraform apply` as described above in the "User" section.
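
For reference, the flag being toggled is a plain Terraform boolean. A minimal sketch of the variable, mirroring the module variable added in this commit (the actual `variables.tf` in the repo is the source of truth):

```hcl
variable "enable_fault_tolerance" {
  type        = bool
  description = "Set to true to create a fault-tolerant Ray cluster"
  default     = false # set to true to enable GCS fault tolerance
}
```

You can also override it at apply time with `terraform apply -var="enable_fault_tolerance=true"`.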

To verify the setup, run `kubectl get pods -n <your_namespace>`; the output should include a Redis pod.
Now, if the head node dies, the application continues to run, since cluster metadata is persisted in Redis.

The Ray dashboard and job logs can also be recovered:
1. Re-run the port-forward: `kubectl port-forward -n <namespace> service/example-cluster-kuberay-head-svc 8265:8265`
2. Open the dashboard at `http://localhost:8265`
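
As a further check (assuming the Ray CLI is installed locally), `ray job list --address http://localhost:8265` should still show jobs submitted before the head restart.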


## Logging and Monitoring

This repository comes with out-of-the-box integrations with Google Cloud Logging
2 changes: 1 addition & 1 deletion ray-on-gke/platform/variables.tf
@@ -40,4 +40,4 @@ variable "enable_tpu" {
  type        = bool
  description = "Set to true to create TPU node pool"
  default     = false
}
2 changes: 2 additions & 0 deletions ray-on-gke/user/main.tf
@@ -32,6 +32,7 @@ module "kubernetes" {
  source = "./modules/kubernetes"

  namespace              = var.namespace
  enable_fault_tolerance = var.enable_fault_tolerance
}

module "service_accounts" {
@@ -49,6 +50,7 @@ module "kuberay" {
  depends_on             = [module.kubernetes]
  namespace              = var.namespace
  enable_tpu             = var.enable_tpu
  enable_fault_tolerance = var.enable_fault_tolerance
}
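
Note that the flag is threaded through both modules: `kubernetes` (which is expected to provision the Redis instance and its password secret) and `kuberay` (which selects the fault-tolerance Helm values file, as shown in `kuberay.tf` below).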

module "prometheus" {
312 changes: 312 additions & 0 deletions ray-on-gke/user/modules/kuberay/kuberay-with-fault-tolerance.yaml
@@ -0,0 +1,312 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Default values for ray-cluster.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# The KubeRay community welcomes PRs to expose additional configuration
# in this Helm chart.

image:
  # Replace this with your own image if needed.
  repository: rayproject/ray
  tag: 2.6.1-py310-gpu
  pullPolicy: IfNotPresent

nameOverride: "kuberay"
fullnameOverride: ""

imagePullSecrets: []
  # - name: an-existing-secret

annotations:
  ray.io/ft-enabled: "true" # <- add this annotation to enable GCS FT
  ray.io/external-storage-namespace: "my-raycluster-storage-namespace"

head:
  groupName: headgroup
  # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
  # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
  # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
  # enableInTreeAutoscaling: true
  # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
  # The example configuration shown below represents the DEFAULT values.
  # autoscalerOptions:
  #   upscalingMode: Default
  #   idleTimeoutSeconds: 60
  #   securityContext: {}
  #   env: []
  #   envFrom: []
  #   resources specifies optional resource request and limit overrides for the autoscaler container.
  #   For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
  #   resources:
  #     limits:
  #       cpu: "500m"
  #       memory: "512Mi"
  #     requests:
  #       cpu: "500m"
  #       memory: "512Mi"
  labels:
    cloud.google.com/gke-ray-node-type: head
  rayStartParams:
    dashboard-host: '0.0.0.0'
    block: 'true'
    redis-password: $REDIS_PASSWORD
  # containerEnv specifies environment variables for the Ray container.
  # Follows standard K8s container env schema.
  containerEnv:
    # - name: EXAMPLE_ENV
    #   value: "1"
    - name: RAY_memory_monitor_refresh_ms
      value: "0"
    - name: RAY_REDIS_ADDRESS
      value: redis:6379
    - name: REDIS_PASSWORD
      valueFrom:
        secretKeyRef:
          name: redis-password-secret
          key: password
  envFrom: []
    # - secretRef:
    #     name: my-env-secret
  # ports optionally allows specifying ports for the Ray container.
  # ports: []
  # resource requests and limits for the Ray head container.
  # Modify as needed for your application.
  # Note that the resources in this example are much too small for production;
  # we don't recommend allocating less than 8G memory for a Ray pod in production.
  # Ray pods should be sized to take up entire K8s nodes when possible.
  # Always set CPU and memory limits for Ray pods.
  # It is usually best to set requests equal to limits.
  # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
  # for further guidance.
  resources:
    limits:
      cpu: "8"
      nvidia.com/gpu: "1"
      # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head.
      memory: "20G"
      ephemeral-storage: 20Gi
    requests:
      cpu: "8"
      nvidia.com/gpu: "1"
      memory: "20G"
      ephemeral-storage: 10Gi
  annotations: {}
  nodeSelector:
    iam.gke.io/gke-metadata-server-enabled: "true"
    cloud.google.com/gke-accelerator: "nvidia-tesla-t4"
  tolerations: []
  affinity: {}
  # Ray container security context.
  securityContext: {}
  volumes:
    - name: ray-logs
      emptyDir: {}
    - name: fluentbit-config
      configMap:
        name: fluentbit-config
  # Ray writes logs to /tmp/ray/session_latest/logs
  volumeMounts:
    - mountPath: /tmp/ray
      name: ray-logs
  # sidecarContainers specifies additional containers to attach to the Ray pod.
  # Follows standard K8s container spec.
  sidecarContainers:
    - name: fluentbit
      image: fluent/fluent-bit:1.9.6
      # These resource requests for Fluent Bit should be sufficient in production.
      resources:
        requests:
          cpu: 100m
          memory: 128Mi
          ephemeral-storage: 2Gi
        limits:
          cpu: 100m
          memory: 128Mi
          ephemeral-storage: 4Gi
      volumeMounts:
        - mountPath: /tmp/ray
          name: ray-logs
        - mountPath: /fluent-bit/etc/
          name: fluentbit-config

worker:
  # If you want to disable the default workergroup,
  # uncomment the line below.
  # disabled: true
  groupName: workergroup
  replicas: 1
  type: worker
  labels:
    cloud.google.com/gke-ray-node-type: worker
  rayStartParams:
    block: 'true'
  initContainerImage: 'busybox:1.28' # Enables users to specify the image for the init container. Users can pull the busybox image from their private repositories.
  # Security context for the init container.
  initContainerSecurityContext: {}
  # containerEnv specifies environment variables for the Ray container.
  # Follows standard K8s container env schema.
  containerEnv: []
    # - name: EXAMPLE_ENV
    #   value: "1"
  envFrom: []
    # - secretRef:
    #     name: my-env-secret
  # ports optionally allows specifying ports for the Ray container.
  # ports: []
  # resource requests and limits for the Ray worker container.
  # Modify as needed for your application.
  # Note that the resources in this example are much too small for production;
  # we don't recommend allocating less than 8G memory for a Ray pod in production.
  # Ray pods should be sized to take up entire K8s nodes when possible.
  # Always set CPU and memory limits for Ray pods.
  # It is usually best to set requests equal to limits.
  # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
  # for further guidance.
  resources:
    limits:
      cpu: "1"
      nvidia.com/gpu: "1"
      memory: "20G"
      ephemeral-storage: 20Gi
    requests:
      cpu: "1"
      nvidia.com/gpu: "1"
      memory: "20G"
      ephemeral-storage: 10Gi
  annotations:
    key: value
  nodeSelector:
    iam.gke.io/gke-metadata-server-enabled: "true"
    cloud.google.com/gke-accelerator: "nvidia-tesla-t4"
  tolerations: []
  affinity: {}
  # Ray container security context.
  securityContext: {}
  volumes:
    - name: ray-logs
      emptyDir: {}
    - name: fluentbit-config
      configMap:
        name: fluentbit-config
  # Ray writes logs to /tmp/ray/session_latest/logs
  volumeMounts:
    - mountPath: /tmp/ray
      name: ray-logs
  # sidecarContainers specifies additional containers to attach to the Ray pod.
  # Follows standard K8s container spec.
  sidecarContainers:
    - name: fluentbit
      image: fluent/fluent-bit:1.9.6
      # These resource requests for Fluent Bit should be sufficient in production.
      resources:
        requests:
          cpu: 100m
          memory: 128Mi
          ephemeral-storage: 2Gi
        limits:
          cpu: 100m
          memory: 128Mi
          ephemeral-storage: 4Gi
      volumeMounts:
        - mountPath: /tmp/ray
          name: ray-logs
        - mountPath: /fluent-bit/etc/
          name: fluentbit-config

# The map's key is used as the groupName.
# For example, the key `smallGroup` in the map below
# will be used as the groupName.
additionalWorkerGroups:
  smallGroup:
    # Disabled by default
    disabled: true
    replicas: 1
    minReplicas: 1
    maxReplicas: 3
    type: worker
    labels: {}
    rayStartParams:
      block: 'true'
    initContainerImage: 'busybox:1.28' # Enables users to specify the image for the init container. Users can pull the busybox image from their private repositories.
    # Security context for the init container.
    initContainerSecurityContext: {}
    # containerEnv specifies environment variables for the Ray container.
    # Follows standard K8s container env schema.
    containerEnv: []
      # - name: EXAMPLE_ENV
      #   value: "1"
    envFrom: []
      # - secretRef:
      #     name: my-env-secret
    # ports optionally allows specifying ports for the Ray container.
    # ports: []
    # resource requests and limits for the Ray worker container.
    # Modify as needed for your application.
    # Note that the resources in this example are much too small for production;
    # we don't recommend allocating less than 8G memory for a Ray pod in production.
    # Ray pods should be sized to take up entire K8s nodes when possible.
    # Always set CPU and memory limits for Ray pods.
    # It is usually best to set requests equal to limits.
    # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
    # for further guidance.
    resources:
      limits:
        cpu: 1
        memory: "1G"
      requests:
        cpu: 1
        memory: "1G"
    annotations:
      key: value
    nodeSelector: {}
    tolerations: []
    affinity: {}
    # Ray container security context.
    securityContext: {}
    volumes:
      - name: ray-logs
        emptyDir: {}
      - name: fluentbit-config
        configMap:
          name: fluentbit-config
    # Ray writes logs to /tmp/ray/session_latest/logs
    volumeMounts:
      - mountPath: /tmp/ray
        name: ray-logs
    # sidecarContainers specifies additional containers to attach to the Ray pod.
    # Follows standard K8s container spec.
    sidecarContainers:
      - name: fluentbit
        image: fluent/fluent-bit:1.9.6
        # These resource requests for Fluent Bit should be sufficient in production.
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
            ephemeral-storage: 2Gi
          limits:
            cpu: 100m
            memory: 128Mi
            ephemeral-storage: 4Gi
        volumeMounts:
          - mountPath: /tmp/ray
            name: ray-logs
          - mountPath: /fluent-bit/etc/
            name: fluentbit-config

service:
  type: ClusterIP
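
A note on the secret referenced above: the head pod reads `REDIS_PASSWORD` from a Kubernetes Secret named `redis-password-secret`, which this commit's `kubernetes` module is expected to create. For illustration only, a minimal Terraform sketch of such a secret (the resource name and `var.redis_password` here are hypothetical):

```hcl
# Illustrative sketch only; the actual secret is provisioned by the
# commit's kubernetes module. var.redis_password is a hypothetical input.
resource "kubernetes_secret" "redis_password" {
  metadata {
    name      = "redis-password-secret" # must match the secretKeyRef above
    namespace = var.namespace
  }
  data = {
    password = var.redis_password # the provider base64-encodes `data` values
  }
}
```
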
2 changes: 1 addition & 1 deletion ray-on-gke/user/modules/kuberay/kuberay.tf
@@ -17,5 +17,5 @@ resource "helm_release" "ray-cluster" {
  repository = "https://ray-project.github.io/kuberay-helm/"
  chart      = "ray-cluster"
  namespace  = var.namespace
  values     = var.enable_tpu ? [file("${path.module}/kuberay-tpu-values.yaml")] : (var.enable_fault_tolerance ? [file("${path.module}/kuberay-with-fault-tolerance.yaml")] : [file("${path.module}/kuberay-values.yaml")])
}
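
Note that `enable_tpu` takes precedence in this ternary: if both flags are true, the TPU values file is used and the fault-tolerance values are ignored.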
8 changes: 7 additions & 1 deletion ray-on-gke/user/modules/kuberay/variables.tf
@@ -22,4 +22,10 @@ variable "enable_tpu" {
  type        = bool
  description = "Set to true to create TPU node pool"
  default     = false
}

variable "enable_fault_tolerance" {
  type        = bool
  description = "Set to true to create a fault-tolerant Ray cluster"
  default     = false
}