diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 70f08abcfa..d90a4baa64 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -264,9 +264,14 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r } if rayContainer.LivenessProbe == nil { + probeTimeout := utils.DefaultLivenessProbeTimeoutSeconds + if rayNodeType == rayv1.HeadNode { + probeTimeout = utils.DefaultHeadLivenessProbeTimeoutSeconds + } + rayContainer.LivenessProbe = &corev1.Probe{ InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds, - TimeoutSeconds: utils.DefaultLivenessProbeTimeoutSeconds, + TimeoutSeconds: int32(probeTimeout), PeriodSeconds: utils.DefaultLivenessProbePeriodSeconds, SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold, FailureThreshold: utils.DefaultLivenessProbeFailureThreshold, @@ -275,9 +280,13 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r } if rayContainer.ReadinessProbe == nil { + probeTimeout := utils.DefaultReadinessProbeTimeoutSeconds + if rayNodeType == rayv1.HeadNode { + probeTimeout = utils.DefaultHeadReadinessProbeTimeoutSeconds + } rayContainer.ReadinessProbe = &corev1.Probe{ InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds, - TimeoutSeconds: utils.DefaultReadinessProbeTimeoutSeconds, + TimeoutSeconds: int32(probeTimeout), PeriodSeconds: utils.DefaultReadinessProbePeriodSeconds, SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold, FailureThreshold: utils.DefaultReadinessProbeFailureThreshold, diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index 7bca65c24e..1b7e277219 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1128,7 +1128,7 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) { assert.Nil(t, 
rayContainer.LivenessProbe.Exec) assert.Nil(t, rayContainer.ReadinessProbe.Exec) - // Test 2: User does not define a custom probe. KubeRay will inject Exec probe. + // Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod. // Here we test the case where the Ray Pod originates from RayServiceCRD, // implying that an additional serve health check will be added to the readiness probe. rayContainer.LivenessProbe = nil @@ -1138,4 +1138,20 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) { assert.NotNil(t, rayContainer.ReadinessProbe.Exec) assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) assert.True(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) + assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds) + assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds) + + // Test 3: User does not define a custom probe. KubeRay will inject Exec probe for head pod. + // Here we test the case where the Ray Pod originates from RayServiceCRD. Unlike the worker pod in Test 2, + // the head pod is expected NOT to get the additional serve health check in its readiness probe. 
+ rayContainer.LivenessProbe = nil + rayContainer.ReadinessProbe = nil + initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD) + assert.NotNil(t, rayContainer.LivenessProbe.Exec) + assert.NotNil(t, rayContainer.ReadinessProbe.Exec) + // head pod should not have Ray Serve proxy health probes + assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) + assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath)) + assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds) + assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds) } diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 7fa2299e50..abbb5ca7d7 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -149,17 +149,21 @@ const ( // Ray FT default readiness probe values DefaultReadinessProbeInitialDelaySeconds = 10 DefaultReadinessProbeTimeoutSeconds = 2 - DefaultReadinessProbePeriodSeconds = 5 - DefaultReadinessProbeSuccessThreshold = 1 - DefaultReadinessProbeFailureThreshold = 10 - ServeReadinessProbeFailureThreshold = 1 + // Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz) + DefaultHeadReadinessProbeTimeoutSeconds = 5 + DefaultReadinessProbePeriodSeconds = 5 + DefaultReadinessProbeSuccessThreshold = 1 + DefaultReadinessProbeFailureThreshold = 10 + ServeReadinessProbeFailureThreshold = 1 // Ray FT default liveness probe values DefaultLivenessProbeInitialDelaySeconds = 30 DefaultLivenessProbeTimeoutSeconds = 2 - DefaultLivenessProbePeriodSeconds = 5 - DefaultLivenessProbeSuccessThreshold = 1 - DefaultLivenessProbeFailureThreshold = 120 + // Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & 
api/gcs_healthz) + DefaultHeadLivenessProbeTimeoutSeconds = 5 + DefaultLivenessProbePeriodSeconds = 5 + DefaultLivenessProbeSuccessThreshold = 1 + DefaultLivenessProbeFailureThreshold = 120 // Ray health check related configurations // Note: Since the Raylet process and the dashboard agent process are fate-sharing,