From 699543397b2ab60b8f5627214ae9d335cdf5091f Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Thu, 11 Jul 2024 10:19:10 +0200 Subject: [PATCH] Generate AppWrapper name to provide unique workloads --- test/e2e/mnist_pytorch_appwrapper_test.go | 34 ++++++++++++++--------- test/e2e/mnist_rayjob_raycluster_test.go | 20 ++++++++----- 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/test/e2e/mnist_pytorch_appwrapper_test.go b/test/e2e/mnist_pytorch_appwrapper_test.go index 94239f57..ab196959 100644 --- a/test/e2e/mnist_pytorch_appwrapper_test.go +++ b/test/e2e/mnist_pytorch_appwrapper_test.go @@ -75,14 +75,20 @@ func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) { Kind: "Job", }, ObjectMeta: metav1.ObjectMeta{ - Name: "mnist", - Namespace: namespace.Name, + GenerateName: "mnist", + Namespace: namespace.Name, }, Spec: batchv1.JobSpec{ Completions: Ptr(int32(1)), Parallelism: Ptr(int32(1)), Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ + Tolerations: []corev1.Toleration{ + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpExists, + }, + }, Containers: []corev1.Container{ { Name: "job", @@ -139,9 +145,9 @@ func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) { Kind: "AppWrapper", }, ObjectMeta: metav1.ObjectMeta{ - Name: "mnist", - Namespace: namespace.Name, - Labels: map[string]string{"kueue.x-k8s.io/queue-name": localQueue.Name}, + GenerateName: "mnist", + Namespace: namespace.Name, + Labels: map[string]string{"kueue.x-k8s.io/queue-name": localQueue.Name}, }, Spec: mcadv1beta2.AppWrapperSpec{ Components: []mcadv1beta2.AppWrapperComponent{ @@ -158,16 +164,18 @@ func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) { unstruct := unstructured.Unstructured{Object: awMap} _, err = test.Client().Dynamic().Resource(appWrapperResource).Namespace(namespace.Name).Create(test.Ctx(), &unstruct, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created AppWrapper %s/%s successfully", aw.Namespace, aw.Name) + test.T().Logf("Created AppWrapper %s/%s successfully", aw.Namespace, aw.GenerateName) - test.T().Logf("Waiting for AppWrapper %s/%s to be running", aw.Namespace, aw.Name) - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). - Should(WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperRunning))) + test.T().Logf("Waiting for AppWrapper %s/%s to be running", aw.Namespace, aw.GenerateName) + test.Eventually(AppWrappers(test, namespace), TestTimeoutMedium). + Should(ContainElement(WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperRunning)))) test.T().Logf("Waiting for AppWrapper %s/%s to complete", job.Namespace, job.Name) - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutLong).Should( - Or( - WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperSucceeded)), - WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperFailed)), + test.Eventually(AppWrappers(test, namespace), TestTimeoutLong).Should( + ContainElement( + Or( + WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperSucceeded)), + WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperFailed)), + ), )) } diff --git a/test/e2e/mnist_rayjob_raycluster_test.go b/test/e2e/mnist_rayjob_raycluster_test.go index 0f2490c2..aae9e1d0 100644 --- a/test/e2e/mnist_rayjob_raycluster_test.go +++ b/test/e2e/mnist_rayjob_raycluster_test.go @@ -127,9 +127,9 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number Kind: "AppWrapper", }, ObjectMeta: metav1.ObjectMeta{ - Name: rayCluster.Name, - Namespace: namespace.Name, - Labels: map[string]string{"kueue.x-k8s.io/queue-name": localQueue.Name}, + GenerateName: rayCluster.Name, + Namespace: namespace.Name, + Labels: map[string]string{"kueue.x-k8s.io/queue-name": localQueue.Name}, }, Spec: mcadv1beta2.AppWrapperSpec{ Components: []mcadv1beta2.AppWrapperComponent{ @@ -145,11 +145,11 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number unstruct := unstructured.Unstructured{Object: awMap} _, err = test.Client().Dynamic().Resource(appWrapperResource).Namespace(namespace.Name).Create(test.Ctx(), &unstruct, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created AppWrapper %s/%s successfully", aw.Namespace, aw.Name) + test.T().Logf("Created AppWrapper %s/%s successfully", aw.Namespace, aw.GenerateName) - test.T().Logf("Waiting for AppWrapper %s/%s to be running", aw.Namespace, aw.Name) - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). - Should(WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperRunning))) + test.T().Logf("Waiting for AppWrapper %s/%s to be running", aw.Namespace, aw.GenerateName) + test.Eventually(AppWrappers(test, namespace), TestTimeoutMedium). + Should(ContainElement(WithTransform(AppWrapperPhase, Equal(mcadv1beta2.AppWrapperRunning)))) test.T().Logf("Waiting for RayCluster %s/%s to be running", rayCluster.Namespace, rayCluster.Name) test.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutMedium). @@ -266,6 +266,12 @@ func constructRayCluster(_ Test, namespace *corev1.Namespace, mnist *corev1.Conf RayStartParams: map[string]string{}, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ + Tolerations: []corev1.Toleration{ + { + Key: "nvidia.com/gpu", + Operator: corev1.TolerationOpExists, + }, + }, Containers: []corev1.Container{ { Name: "ray-worker",