[Hotfix][Bug] Avoid unnecessary zero-downtime upgrade #1581

Merged
47 changes: 26 additions & 21 deletions ray-operator/controllers/ray/rayservice_controller.go
@@ -889,11 +889,12 @@ func (r *RayServiceReconciler) generateConfigKeyPrefix(rayServiceInstance *rayv1
}

// Return true if healthy, otherwise false.
func (r *RayServiceReconciler) updateAndCheckDashboardStatus(rayServiceClusterStatus *rayv1.RayServiceStatus, isHealthy bool, unhealthyThreshold *int32) bool {
func updateAndCheckDashboardStatus(rayServiceClusterStatus *rayv1.RayServiceStatus, isHealthy bool, unhealthyThreshold *int32) bool {
timeNow := metav1.Now()
oldIsHealthy := rayServiceClusterStatus.DashboardStatus.IsHealthy
rayServiceClusterStatus.DashboardStatus.LastUpdateTime = &timeNow
rayServiceClusterStatus.DashboardStatus.IsHealthy = isHealthy
if rayServiceClusterStatus.DashboardStatus.HealthLastUpdateTime.IsZero() || isHealthy {
if rayServiceClusterStatus.DashboardStatus.HealthLastUpdateTime.IsZero() || oldIsHealthy {

Contributor:
I don't understand the purpose of this change and how it fixes the first issue in the PR. Can you explain this more?

> KubeRay will not update the status of the RayService custom resource when the only changes are related to timestamps

It sounds like this is the real issue. Why don't we just update HealthLastUpdateTime every time? Wouldn't that fix the issue?
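
A minimal sketch of the quoted behavior, using a hypothetical helper name (this is not KubeRay's actual comparison code): the controller compares the old and new per-cluster statuses with the timestamp fields cleared and skips the status update when nothing else changed, so timestamp-only changes never reach the API server.

```go
package ray

import (
	"reflect"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// statusChangedIgnoringTimestamps is a hypothetical helper, not KubeRay's actual
// implementation. It reports whether anything other than the timestamp fields
// changed; if not, the controller can skip the status update and avoid writing
// to the Kubernetes API server on every reconcile loop.
func statusChangedIgnoringTimestamps(oldStatus, newStatus *rayv1.RayServiceStatus) bool {
	oldCopy := oldStatus.DeepCopy()
	newCopy := newStatus.DeepCopy()
	// Clear the timestamp-only fields so that a change in LastUpdateTime or
	// HealthLastUpdateTime alone does not count as a difference.
	oldCopy.DashboardStatus.LastUpdateTime = nil
	oldCopy.DashboardStatus.HealthLastUpdateTime = nil
	newCopy.DashboardStatus.LastUpdateTime = nil
	newCopy.DashboardStatus.HealthLastUpdateTime = nil
	return !reflect.DeepEqual(oldCopy, newCopy)
}
```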

Member Author:
> It sounds like this is the real issue. Why don't we just update HealthLastUpdateTime every time?

This will overload the Kubernetes API Server.

Member Author:
> I don't understand the purpose of this change and how it fixes the first issue in the PR. Can you explain this more?

By definition, healthLastUpdateTime: $t and isHealthy: true mean that the dashboard agent has been healthy from time = $t up to now. Hence, the last moment it was observed to be healthy should be the point from which we start counting it as unhealthy.

Note that healthLastUpdateTime currently does not strictly follow the definition above because of the head Pod status check.
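
To make that definition concrete, a hedged sketch of how the unhealthy duration would be measured against DeploymentUnhealthySecondThreshold, using an illustrative helper name rather than the controller's actual code: the clock starts at HealthLastUpdateTime, i.e. at the last moment the dashboard agent was known to be healthy.

```go
package ray

import (
	"time"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// dashboardWithinUnhealthyThreshold is an illustrative helper, not the actual
// controller code. The dashboard agent is treated as failed only after it has
// been unhealthy for longer than the threshold, measured from HealthLastUpdateTime.
func dashboardWithinUnhealthyThreshold(status *rayv1.RayServiceStatus, isHealthy bool, unhealthyThreshold *int32) bool {
	if isHealthy || unhealthyThreshold == nil || status.DashboardStatus.HealthLastUpdateTime == nil {
		return true
	}
	unhealthyFor := time.Since(status.DashboardStatus.HealthLastUpdateTime.Time)
	return unhealthyFor <= time.Duration(*unhealthyThreshold)*time.Second
}
```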

Contributor:
Thanks, this was explained more offline and it makes sense. The main point I was missing is that when the health state transitions, the status update actually goes through (isHealthy is a boolean field, not a timestamp, so when it changes the status really is updated). Also, to state the issue more simply: the unnecessary zero-downtime upgrade bug happens any time the dashboard stays healthy for >600s and then transitions to unhealthy; it is not specifically related to a head pod restarting.

@kevin85421 One last question. With the new approach, if it's unhealthy for a while, then the first time it becomes healthy, we don't actually update HealthLastUpdateTime. Instead, we'll update it in the next iteration. Is this your understanding as well? I think this off-by-one issue is still okay for a hotfix.
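
For reference, a small sketch (illustrative names only, and ignoring the HealthLastUpdateTime.IsZero() bootstrap case) of the four transitions under the new oldIsHealthy condition, including the off-by-one described above:

```go
package ray

// transition describes whether HealthLastUpdateTime is refreshed for a given
// pair of old/new health observations under the rule "refresh iff oldIsHealthy".
type transition struct {
	oldIsHealthy, isHealthy bool
	refreshesTimestamp      bool
}

// healthTransitions spells out the off-by-one discussed above: the
// unhealthy -> healthy transition leaves the timestamp alone until the next
// reconcile, when oldIsHealthy has already flipped to true.
var healthTransitions = []transition{
	{oldIsHealthy: true, isHealthy: true, refreshesTimestamp: true},
	{oldIsHealthy: true, isHealthy: false, refreshesTimestamp: true},   // records the last healthy moment
	{oldIsHealthy: false, isHealthy: false, refreshesTimestamp: false}, // the unhealthy timer keeps running
	{oldIsHealthy: false, isHealthy: true, refreshesTimestamp: false},  // refreshed on the next reconcile
}
```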

rayServiceClusterStatus.DashboardStatus.HealthLastUpdateTime = &timeNow
}

@@ -1045,7 +1046,7 @@ func (r *RayServiceReconciler) updateStatusForActiveCluster(ctx context.Context,
rayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus

if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DashboardAgentListenPortName); err != nil || clientURL == "" {
r.updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)
updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)
return err
}

@@ -1054,11 +1055,11 @@ func (r *RayServiceReconciler) updateStatusForActiveCluster(ctx context.Context,

var isHealthy, isReady bool
if isHealthy, isReady, err = r.getAndCheckServeStatus(ctx, rayDashboardClient, rayServiceStatus, r.determineServeConfigType(rayServiceInstance), rayServiceInstance.Spec.ServiceUnhealthySecondThreshold); err != nil {
r.updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)
updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)
return err
}

r.updateAndCheckDashboardStatus(rayServiceStatus, true, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)
updateAndCheckDashboardStatus(rayServiceStatus, true, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)

logger.Info("Check serve health", "isHealthy", isHealthy, "isReady", isReady)

@@ -1084,19 +1085,24 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
rayServiceStatus = &rayServiceInstance.Status.PendingServiceStatus
}

// Check if head pod is running and ready. If not, requeue the resource event to avoid
// redundant custom resource status updates.
//
// TODO (kevin85421): Note that the Dashboard and GCS may take a few seconds to start up
// after the head pod is running and ready. Hence, some requests to the Dashboard (e.g. `UpdateDeployments`) may fail.
// This is not an issue since `UpdateDeployments` is an idempotent operation.
if isRunningAndReady, err := r.isHeadPodRunningAndReady(ctx, rayClusterInstance); err != nil || !isRunningAndReady {
if err != nil {
logger.Error(err, "Failed to check if head pod is running and ready!")
} else {
logger.Info("Skipping the update of Serve deployments because the Ray head pod is not ready.")
// TODO (kevin85421): This `if` block is a hotfix for the case that active RayCluster's dashboard agent process crashes.
// I will refactor the health checking logic in the future.
if !isActive {
// Check if head pod is running and ready. If not, requeue the resource event to avoid
// redundant custom resource status updates.
//
// TODO (kevin85421): Note that the Dashboard and GCS may take a few seconds to start up
// after the head pod is running and ready. Hence, some requests to the Dashboard (e.g. `UpdateDeployments`) may fail.
// This is not an issue since `UpdateDeployments` is an idempotent operation.
logger.Info("Check the head Pod status of the pending RayCluster", "RayCluster name", rayClusterInstance.Name)
if isRunningAndReady, err := r.isHeadPodRunningAndReady(ctx, rayClusterInstance); err != nil || !isRunningAndReady {
if err != nil {
logger.Error(err, "Failed to check if head Pod is running and ready!")
} else {
logger.Info("Skipping the update of Serve deployments because the Ray head Pod is not ready.")
}
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, false, err
}
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, false, err
}

if clientURL, err = utils.FetchHeadServiceURL(ctx, &r.Log, r.Client, rayClusterInstance, common.DashboardAgentListenPortName); err != nil || clientURL == "" {
@@ -1106,10 +1112,9 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
rayDashboardClient.InitClient(clientURL)

shouldUpdate := r.checkIfNeedSubmitServeDeployment(rayServiceInstance, rayClusterInstance, rayServiceStatus)

if shouldUpdate {
if err = r.updateServeDeployment(ctx, rayServiceInstance, rayDashboardClient, rayClusterInstance.Name); err != nil {
if !r.updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold) {
if !updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold) {
logger.Info("Dashboard is unhealthy, restart the cluster.")
r.markRestart(rayServiceInstance)
}
Expand All @@ -1123,15 +1128,15 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns

var isHealthy, isReady bool
if isHealthy, isReady, err = r.getAndCheckServeStatus(ctx, rayDashboardClient, rayServiceStatus, r.determineServeConfigType(rayServiceInstance), rayServiceInstance.Spec.ServiceUnhealthySecondThreshold); err != nil {
if !r.updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold) {
if !updateAndCheckDashboardStatus(rayServiceStatus, false, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold) {
logger.Info("Dashboard is unhealthy, restart the cluster.")
r.markRestart(rayServiceInstance)
}
err = r.updateState(ctx, rayServiceInstance, rayv1.FailedToGetServeDeploymentStatus, err)
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, false, err
}

r.updateAndCheckDashboardStatus(rayServiceStatus, true, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)
updateAndCheckDashboardStatus(rayServiceStatus, true, rayServiceInstance.Spec.DeploymentUnhealthySecondThreshold)

logger.Info("Check serve health", "isHealthy", isHealthy, "isReady", isReady, "isActive", isActive)

37 changes: 37 additions & 0 deletions ray-operator/controllers/ray/rayservice_controller_unit_test.go
@@ -791,6 +791,43 @@ func TestReconcileRayCluster(t *testing.T) {
}
}

func TestUpdateAndCheckDashboardStatus(t *testing.T) {
now := metav1.Now()
rayServiceStatus := rayv1.RayServiceStatus{
DashboardStatus: rayv1.DashboardStatus{
IsHealthy: true,
HealthLastUpdateTime: &now,
LastUpdateTime: &now,
},
}
deploymentUnhealthySecondThreshold := int32(300)

// Test 1: The dashboard agent was healthy, and the dashboard agent is still healthy.
svcStatusCopy := rayServiceStatus.DeepCopy()
assert.True(t, updateAndCheckDashboardStatus(svcStatusCopy, true, &deploymentUnhealthySecondThreshold))
assert.NotEqual(t, *svcStatusCopy.DashboardStatus.HealthLastUpdateTime, now)

// Test 2: The dashboard agent was healthy, and the dashboard agent becomes unhealthy.
svcStatusCopy = rayServiceStatus.DeepCopy()
assert.True(t, updateAndCheckDashboardStatus(svcStatusCopy, false, &deploymentUnhealthySecondThreshold))
assert.NotEqual(t, *svcStatusCopy.DashboardStatus.HealthLastUpdateTime, now)

// Test 3: The dashboard agent was unhealthy, and the dashboard agent is still unhealthy.
svcStatusCopy = rayServiceStatus.DeepCopy()
svcStatusCopy.DashboardStatus.IsHealthy = false
assert.True(t, updateAndCheckDashboardStatus(svcStatusCopy, false, &deploymentUnhealthySecondThreshold))
// The `HealthLastUpdateTime` should not be updated.
assert.Equal(t, *svcStatusCopy.DashboardStatus.HealthLastUpdateTime, now)

// Test 4: The dashboard agent was unhealthy, and the dashboard agent lasts unhealthy for more than `deploymentUnhealthySecondThreshold` seconds.
svcStatusCopy = rayServiceStatus.DeepCopy()
svcStatusCopy.DashboardStatus.IsHealthy = false
minus301Seconds := metav1.NewTime(now.Add(-time.Second * time.Duration(deploymentUnhealthySecondThreshold+1)))
svcStatusCopy.DashboardStatus.HealthLastUpdateTime = &minus301Seconds
assert.False(t, updateAndCheckDashboardStatus(svcStatusCopy, false, &deploymentUnhealthySecondThreshold))
assert.Equal(t, *svcStatusCopy.DashboardStatus.HealthLastUpdateTime, minus301Seconds)
}

func initFakeDashboardClient(appName string, deploymentStatus string, appStatus string) utils.RayDashboardClientInterface {
fakeDashboardClient := utils.FakeRayDashboardClient{}
status := generateServeStatus(deploymentStatus, appStatus)