-
Notifications
You must be signed in to change notification settings - Fork 177
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Setup GCS fault tolerance with Redis for ray
- Loading branch information
Showing
13 changed files
with
466 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,4 +40,4 @@ variable "enable_tpu" { | |
type = bool | ||
description = "Set to true to create TPU node pool" | ||
default = false | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
312 changes: 312 additions & 0 deletions
312
ray-on-gke/user/modules/kuberay/kuberay-with-fault-tolerance.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,312 @@ | ||
# Copyright 2023 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Default values for ray-cluster. | ||
# This is a YAML-formatted file. | ||
# Declare variables to be passed into your templates. | ||
|
||
# The KubeRay community welcomes PRs to expose additional configuration | ||
# in this Helm chart. | ||
|
||
image: | ||
# Replace this with your own image if needed. | ||
repository: rayproject/ray | ||
tag: 2.6.1-py310-gpu | ||
pullPolicy: IfNotPresent | ||
|
||
nameOverride: "kuberay" | ||
fullnameOverride: "" | ||
|
||
imagePullSecrets: [] | ||
# - name: an-existing-secret | ||
|
||
annotations: | ||
ray.io/ft-enabled: "true" # <- add this annotation enable GCS FT | ||
ray.io/external-storage-namespace: "my-raycluster-storage-namespace" | ||
|
||
head: | ||
groupName: headgroup | ||
# If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. | ||
# Ray autoscaler integration is supported only for Ray versions >= 1.11.0 | ||
# Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. | ||
# enableInTreeAutoscaling: true | ||
# autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. | ||
# The example configuration shown below below represents the DEFAULT values. | ||
# autoscalerOptions: | ||
# upscalingMode: Default | ||
# idleTimeoutSeconds: 60 | ||
# securityContext: {} | ||
# env: [] | ||
# envFrom: [] | ||
# resources specifies optional resource request and limit overrides for the autoscaler container. | ||
# For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. | ||
# resources: | ||
# limits: | ||
# cpu: "500m" | ||
# memory: "512Mi" | ||
# requests: | ||
# cpu: "500m" | ||
# memory: "512Mi" | ||
labels: | ||
cloud.google.com/gke-ray-node-type: head | ||
rayStartParams: | ||
dashboard-host: '0.0.0.0' | ||
block: 'true' | ||
redis-password: $REDIS_PASSWORD | ||
# containerEnv specifies environment variables for the Ray container, | ||
# Follows standard K8s container env schema. | ||
containerEnv: | ||
# - name: EXAMPLE_ENV | ||
# value: "1" | ||
- name: RAY_memory_monitor_refresh_ms | ||
value: "0" | ||
- name: RAY_REDIS_ADDRESS | ||
value: redis:6379 | ||
- name: REDIS_PASSWORD | ||
valueFrom: | ||
secretKeyRef: | ||
name: redis-password-secret | ||
key: password | ||
envFrom: [] | ||
# - secretRef: | ||
# name: my-env-secret | ||
# ports optionally allows specifying ports for the Ray container. | ||
# ports: [] | ||
# resource requests and limits for the Ray head container. | ||
# Modify as needed for your application. | ||
# Note that the resources in this example are much too small for production; | ||
# we don't recommend allocating less than 8G memory for a Ray pod in production. | ||
# Ray pods should be sized to take up entire K8s nodes when possible. | ||
# Always set CPU and memory limits for Ray pods. | ||
# It is usually best to set requests equal to limits. | ||
# See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources | ||
# for further guidance. | ||
resources: | ||
limits: | ||
cpu: "8" | ||
nvidia.com/gpu: "1" | ||
# To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head. | ||
memory: "20G" | ||
ephemeral-storage: 20Gi | ||
requests: | ||
cpu: "8" | ||
nvidia.com/gpu: "1" | ||
memory: "20G" | ||
ephemeral-storage: 10Gi | ||
annotations: {} | ||
nodeSelector: | ||
iam.gke.io/gke-metadata-server-enabled: "true" | ||
cloud.google.com/gke-accelerator: "nvidia-tesla-t4" | ||
tolerations: [] | ||
affinity: {} | ||
# Ray container security context. | ||
securityContext: {} | ||
volumes: | ||
- name: ray-logs | ||
emptyDir: {} | ||
- name: fluentbit-config | ||
configMap: | ||
name: fluentbit-config | ||
# Ray writes logs to /tmp/ray/session_latests/logs | ||
volumeMounts: | ||
- mountPath: /tmp/ray | ||
name: ray-logs | ||
# sidecarContainers specifies additional containers to attach to the Ray pod. | ||
# Follows standard K8s container spec. | ||
sidecarContainers: | ||
- name: fluentbit | ||
image: fluent/fluent-bit:1.9.6 | ||
# These resource requests for Fluent Bit should be sufficient in production. | ||
resources: | ||
requests: | ||
cpu: 100m | ||
memory: 128Mi | ||
ephemeral-storage: 2Gi | ||
limits: | ||
cpu: 100m | ||
memory: 128Mi | ||
ephemeral-storage: 4Gi | ||
volumeMounts: | ||
- mountPath: /tmp/ray | ||
name: ray-logs | ||
- mountPath: /fluent-bit/etc/ | ||
name: fluentbit-config | ||
|
||
worker: | ||
# If you want to disable the default workergroup | ||
# uncomment the line below | ||
# disabled: true | ||
groupName: workergroup | ||
replicas: 1 | ||
type: worker | ||
labels: | ||
cloud.google.com/gke-ray-node-type: worker | ||
rayStartParams: | ||
block: 'true' | ||
initContainerImage: 'busybox:1.28' # Enable users to specify the image for init container. Users can pull the busybox image from their private repositories. | ||
# Security context for the init container. | ||
initContainerSecurityContext: {} | ||
# containerEnv specifies environment variables for the Ray container, | ||
# Follows standard K8s container env schema. | ||
containerEnv: [] | ||
# - name: EXAMPLE_ENV | ||
# value: "1" | ||
envFrom: [] | ||
# - secretRef: | ||
# name: my-env-secret | ||
# ports optionally allows specifying ports for the Ray container. | ||
# ports: [] | ||
# resource requests and limits for the Ray head container. | ||
# Modify as needed for your application. | ||
# Note that the resources in this example are much too small for production; | ||
# we don't recommend allocating less than 8G memory for a Ray pod in production. | ||
# Ray pods should be sized to take up entire K8s nodes when possible. | ||
# Always set CPU and memory limits for Ray pods. | ||
# It is usually best to set requests equal to limits. | ||
# See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources | ||
# for further guidance. | ||
resources: | ||
limits: | ||
cpu: "1" | ||
nvidia.com/gpu: "1" | ||
memory: "20G" | ||
ephemeral-storage: 20Gi | ||
requests: | ||
cpu: "1" | ||
nvidia.com/gpu: "1" | ||
memory: "20G" | ||
ephemeral-storage: 10Gi | ||
annotations: | ||
key: value | ||
nodeSelector: | ||
iam.gke.io/gke-metadata-server-enabled: "true" | ||
cloud.google.com/gke-accelerator: "nvidia-tesla-t4" | ||
tolerations: [] | ||
affinity: {} | ||
# Ray container security context. | ||
securityContext: {} | ||
volumes: | ||
- name: ray-logs | ||
emptyDir: {} | ||
- name: fluentbit-config | ||
configMap: | ||
name: fluentbit-config | ||
# Ray writes logs to /tmp/ray/session_latests/logs | ||
volumeMounts: | ||
- mountPath: /tmp/ray | ||
name: ray-logs | ||
# sidecarContainers specifies additional containers to attach to the Ray pod. | ||
# Follows standard K8s container spec. | ||
sidecarContainers: | ||
- name: fluentbit | ||
image: fluent/fluent-bit:1.9.6 | ||
# These resource requests for Fluent Bit should be sufficient in production. | ||
resources: | ||
requests: | ||
cpu: 100m | ||
memory: 128Mi | ||
ephemeral-storage: 2Gi | ||
limits: | ||
cpu: 100m | ||
memory: 128Mi | ||
ephemeral-storage: 4Gi | ||
volumeMounts: | ||
- mountPath: /tmp/ray | ||
name: ray-logs | ||
- mountPath: /fluent-bit/etc/ | ||
name: fluentbit-config | ||
|
||
# The map's key is used as the groupName. | ||
# For example, key:small-group in the map below | ||
# will be used as the groupName | ||
additionalWorkerGroups: | ||
smallGroup: | ||
# Disabled by default | ||
disabled: true | ||
replicas: 1 | ||
minReplicas: 1 | ||
maxReplicas: 3 | ||
type: worker | ||
labels: {} | ||
rayStartParams: | ||
block: 'true' | ||
initContainerImage: 'busybox:1.28' # Enable users to specify the image for init container. Users can pull the busybox image from their private repositories. | ||
# Security context for the init container. | ||
initContainerSecurityContext: {} | ||
# containerEnv specifies environment variables for the Ray container, | ||
# Follows standard K8s container env schema. | ||
containerEnv: [] | ||
# - name: EXAMPLE_ENV | ||
# value: "1" | ||
envFrom: [] | ||
# - secretRef: | ||
# name: my-env-secret | ||
# ports optionally allows specifying ports for the Ray container. | ||
# ports: [] | ||
# resource requests and limits for the Ray head container. | ||
# Modify as needed for your application. | ||
# Note that the resources in this example are much too small for production; | ||
# we don't recommend allocating less than 8G memory for a Ray pod in production. | ||
# Ray pods should be sized to take up entire K8s nodes when possible. | ||
# Always set CPU and memory limits for Ray pods. | ||
# It is usually best to set requests equal to limits. | ||
# See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources | ||
# for further guidance. | ||
resources: | ||
limits: | ||
cpu: 1 | ||
memory: "1G" | ||
requests: | ||
cpu: 1 | ||
memory: "1G" | ||
annotations: | ||
key: value | ||
nodeSelector: {} | ||
tolerations: [] | ||
affinity: {} | ||
# Ray container security context. | ||
securityContext: {} | ||
volumes: | ||
- name: ray-logs | ||
emptyDir: {} | ||
- name: fluentbit-config | ||
configMap: | ||
name: fluentbit-config | ||
# Ray writes logs to /tmp/ray/session_latests/logs | ||
volumeMounts: | ||
- mountPath: /tmp/ray | ||
name: ray-logs | ||
# sidecarContainers specifies additional containers to attach to the Ray pod. | ||
# Follows standard K8s container spec. | ||
sidecarContainers: | ||
- name: fluentbit | ||
image: fluent/fluent-bit:1.9.6 | ||
# These resource requests for Fluent Bit should be sufficient in production. | ||
resources: | ||
requests: | ||
cpu: 100m | ||
memory: 128Mi | ||
ephemeral-storage: 2Gi | ||
limits: | ||
cpu: 100m | ||
memory: 128Mi | ||
ephemeral-storage: 4Gi | ||
volumeMounts: | ||
- mountPath: /tmp/ray | ||
name: ray-logs | ||
- mountPath: /fluent-bit/etc/ | ||
name: fluentbit-config | ||
|
||
service: | ||
type: ClusterIP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.