Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use longer exec probe timeouts for Head pods #2353

Merged
merged 1 commit into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,9 +264,14 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
}

if rayContainer.LivenessProbe == nil {
probeTimeout := utils.DefaultLivenessProbeTimeoutSeconds
if rayNodeType == rayv1.HeadNode {
probeTimeout = utils.DefaultHeadLivenessProbeTimeoutSeconds
}

rayContainer.LivenessProbe = &corev1.Probe{
InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds,
TimeoutSeconds: utils.DefaultLivenessProbeTimeoutSeconds,
TimeoutSeconds: int32(probeTimeout),
PeriodSeconds: utils.DefaultLivenessProbePeriodSeconds,
SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold,
FailureThreshold: utils.DefaultLivenessProbeFailureThreshold,
Expand All @@ -275,9 +280,13 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
}

if rayContainer.ReadinessProbe == nil {
probeTimeout := utils.DefaultReadinessProbeTimeoutSeconds
if rayNodeType == rayv1.HeadNode {
probeTimeout = utils.DefaultHeadReadinessProbeTimeoutSeconds
}
rayContainer.ReadinessProbe = &corev1.Probe{
InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds,
TimeoutSeconds: utils.DefaultReadinessProbeTimeoutSeconds,
TimeoutSeconds: int32(probeTimeout),
PeriodSeconds: utils.DefaultReadinessProbePeriodSeconds,
SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold,
FailureThreshold: utils.DefaultReadinessProbeFailureThreshold,
Expand Down
18 changes: 17 additions & 1 deletion ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1128,7 +1128,7 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
assert.Nil(t, rayContainer.LivenessProbe.Exec)
assert.Nil(t, rayContainer.ReadinessProbe.Exec)

// Test 2: User does not define a custom probe. KubeRay will inject Exec probe.
// Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod.
// Here we test the case where the Ray Pod originates from RayServiceCRD,
// implying that an additional serve health check will be added to the readiness probe.
rayContainer.LivenessProbe = nil
Expand All @@ -1138,4 +1138,20 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.True(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)

// Test 3: User does not define a custom probe. KubeRay will inject Exec probe for head pod.
// Here we test the case where the Ray Pod originates from RayServiceCRD but is a head pod,
// so no additional serve health check is expected in either the liveness or the readiness probe.
rayContainer.LivenessProbe = nil
rayContainer.ReadinessProbe = nil
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD)
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
// head pod should not have Ray Serve proxy health probes
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds)
assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds)
}
18 changes: 11 additions & 7 deletions ray-operator/controllers/ray/utils/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,17 +149,21 @@ const (
// Ray FT default readiness probe values
DefaultReadinessProbeInitialDelaySeconds = 10
DefaultReadinessProbeTimeoutSeconds = 2
DefaultReadinessProbePeriodSeconds = 5
DefaultReadinessProbeSuccessThreshold = 1
DefaultReadinessProbeFailureThreshold = 10
ServeReadinessProbeFailureThreshold = 1
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
DefaultHeadReadinessProbeTimeoutSeconds = 5
DefaultReadinessProbePeriodSeconds = 5
DefaultReadinessProbeSuccessThreshold = 1
DefaultReadinessProbeFailureThreshold = 10
ServeReadinessProbeFailureThreshold = 1

// Ray FT default liveness probe values
DefaultLivenessProbeInitialDelaySeconds = 30
DefaultLivenessProbeTimeoutSeconds = 2
DefaultLivenessProbePeriodSeconds = 5
DefaultLivenessProbeSuccessThreshold = 1
DefaultLivenessProbeFailureThreshold = 120
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
DefaultHeadLivenessProbeTimeoutSeconds = 5
DefaultLivenessProbePeriodSeconds = 5
DefaultLivenessProbeSuccessThreshold = 1
DefaultLivenessProbeFailureThreshold = 120

// Ray health check related configurations
// Note: Since the Raylet process and the dashboard agent process are fate-sharing,
Expand Down
Loading