diff --git a/ray-operator/config/manager/manager.yaml b/ray-operator/config/manager/manager.yaml index b22780fca7..93473aeba2 100644 --- a/ray-operator/config/manager/manager.yaml +++ b/ray-operator/config/manager/manager.yaml @@ -26,7 +26,8 @@ spec: containers: - command: - /manager -# args: + args: + - --feature-gates=RayClusterStatusConditions=true # this argument can be removed for version >= v1.3 where the feature gate is enabled by default. # - --enable-leader-election image: kuberay/operator imagePullPolicy: IfNotPresent diff --git a/ray-operator/test/sampleyaml/raycluster_test.go b/ray-operator/test/sampleyaml/raycluster_test.go index 790aa7b177..85dfeb51a2 100644 --- a/ray-operator/test/sampleyaml/raycluster_test.go +++ b/ray-operator/test/sampleyaml/raycluster_test.go @@ -8,6 +8,8 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" . "github.com/ray-project/kuberay/ray-operator/test/support" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestRayCluster(t *testing.T) { @@ -76,6 +78,10 @@ func TestRayCluster(t *testing.T) { g.Expect(rayCluster).NotTo(BeNil()) test.T().Logf("Waiting for RayCluster %s/%s to be ready", namespace.Name, rayCluster.Name) + g.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(StatusCondition(rayv1.HeadPodReady), MatchCondition(metav1.ConditionTrue, rayv1.HeadPodRunningAndReady))) + g.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(StatusCondition(rayv1.RayClusterProvisioned), MatchCondition(metav1.ConditionTrue, rayv1.AllPodRunningAndReadyFirstTime))) g.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium). Should(WithTransform(RayClusterState, Equal(rayv1.Ready))) rayCluster, err = GetRayCluster(test, namespace.Name, rayCluster.Name) @@ -99,6 +105,19 @@ func TestRayCluster(t *testing.T) { // Check that all pods can submit jobs g.Eventually(SubmitJobsToAllPods(test, rayCluster), TestTimeoutShort).Should(Succeed()) + + // Delete all pods after setting quota to 0 to avoid recreating pods + KubectlApplyQuota(test, namespace.Name, "--hard=cpu=0,memory=0G,pods=0") + KubectlDeleteAllPods(test, namespace.Name) + // The HeadPodReady condition should now be False with a HeadPodNotFound reason. + g.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(StatusCondition(rayv1.HeadPodReady), MatchCondition(metav1.ConditionFalse, rayv1.HeadPodNotFound))) + // The RayClusterProvisioned condition should still be True. + g.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(StatusCondition(rayv1.RayClusterProvisioned), MatchCondition(metav1.ConditionTrue, rayv1.AllPodRunningAndReadyFirstTime))) + // The RayClusterReplicaFailure condition now be True with a FailedCreateHeadPod reason due to the quota limit. + g.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium). + Should(WithTransform(StatusCondition(rayv1.RayClusterReplicaFailure), MatchCondition(metav1.ConditionTrue, "FailedCreateHeadPod"))) }) } } diff --git a/ray-operator/test/support/ray.go b/ray-operator/test/support/ray.go index 0d63ee09f1..ea75ec5b30 100644 --- a/ray-operator/test/support/ray.go +++ b/ray-operator/test/support/ray.go @@ -3,6 +3,8 @@ package support import ( "errors" + "github.com/onsi/gomega/format" + "github.com/onsi/gomega/types" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" @@ -70,6 +72,48 @@ func RayClusterState(cluster *rayv1.RayCluster) rayv1.ClusterState { return cluster.Status.State //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288 } +func StatusCondition(condType rayv1.RayClusterConditionType) func(*rayv1.RayCluster) metav1.Condition { + return func(cluster *rayv1.RayCluster) metav1.Condition { + if cluster != nil { + for _, cond := range cluster.Status.Conditions { + if cond.Type == string(condType) { + return cond + } + } + } + return metav1.Condition{} + } +} + +type ConditionMatcher struct { + expected metav1.Condition +} + +func (c *ConditionMatcher) Match(actual interface{}) (success bool, err error) { + if actual == nil { + return false, errors.New(" should be a metav1.Condition but it is nil") + } + a, ok := actual.(metav1.Condition) + if !ok { + return false, errors.New(" should be a metav1.Condition") + } + return a.Reason == c.expected.Reason && a.Status == c.expected.Status, nil +} + +func (c *ConditionMatcher) FailureMessage(actual interface{}) (message string) { + a := actual.(metav1.Condition) + return format.Message(a, "to equal", c.expected) +} + +func (c *ConditionMatcher) NegatedFailureMessage(actual interface{}) (message string) { + a := actual.(metav1.Condition) + return format.Message(a, "not to equal", c.expected) +} + +func MatchCondition(status metav1.ConditionStatus, reason string) types.GomegaMatcher { + return &ConditionMatcher{expected: metav1.Condition{Status: status, Reason: reason}} +} + func RayClusterDesiredWorkerReplicas(cluster *rayv1.RayCluster) int32 { return cluster.Status.DesiredWorkerReplicas } diff --git a/ray-operator/test/support/yaml.go b/ray-operator/test/support/yaml.go index e46e462e7e..289b96bc50 100644 --- a/ray-operator/test/support/yaml.go +++ b/ray-operator/test/support/yaml.go @@ -4,7 +4,7 @@ import ( "os" "os/exec" - "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "k8s.io/apimachinery/pkg/runtime" @@ -28,7 +28,7 @@ func DeserializeRayClusterYAML(t Test, filename string) *rayv1.RayCluster { t.T().Helper() rayCluster := &rayv1.RayCluster{} err := deserializeYAML(filename, rayCluster) - assert.NoError(t.T(), err) + require.NoError(t.T(), err, "Fail to deserialize yaml file %s", filename) return rayCluster } @@ -36,7 +36,7 @@ func DeserializeRayJobYAML(t Test, filename string) *rayv1.RayJob { t.T().Helper() rayJob := &rayv1.RayJob{} err := deserializeYAML(filename, rayJob) - assert.NoError(t.T(), err) + require.NoError(t.T(), err, "Fail to deserialize yaml file %s", filename) return rayJob } @@ -44,7 +44,7 @@ func DeserializeRayServiceYAML(t Test, filename string) *rayv1.RayService { t.T().Helper() rayService := &rayv1.RayService{} err := deserializeYAML(filename, rayService) - assert.NoError(t.T(), err) + require.NoError(t.T(), err, "Fail to deserialize yaml file %s", filename) return rayService } @@ -52,8 +52,22 @@ func KubectlApplyYAML(t Test, filename string, namespace string) { t.T().Helper() kubectlCmd := exec.CommandContext(t.Ctx(), "kubectl", "apply", "-f", filename, "-n", namespace) err := kubectlCmd.Run() - if err != nil { - t.T().Fatalf("Failed to apply %s to namespace %s: %v", filename, namespace, err) - } + require.NoError(t.T(), err, "Failed to apply %s to namespace %s", filename, namespace) t.T().Logf("Successfully applied %s to namespace %s", filename, namespace) } + +func KubectlApplyQuota(t Test, namespace, quota string) { + t.T().Helper() + kubectlCmd := exec.CommandContext(t.Ctx(), "kubectl", "create", "quota", namespace, "-n", namespace, quota) + err := kubectlCmd.Run() + require.NoError(t.T(), err, "Failed to apply quota %s in %s", quota, namespace) + t.T().Logf("Successfully applied quota %s in %s", quota, namespace) +} + +func KubectlDeleteAllPods(t Test, namespace string) { + t.T().Helper() + kubectlCmd := exec.CommandContext(t.Ctx(), "kubectl", "delete", "--all", "pods", "-n", namespace) + err := kubectlCmd.Run() + require.NoError(t.T(), err, "Failed to delete pods in %s", namespace) + t.T().Logf("Successfully delete pods in %s", namespace) +}