diff --git a/go.mod b/go.mod index 583e490f..7a70df90 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/openshift/api v0.0.0-20230823114715-5fdd7511b790 github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c github.com/project-codeflare/appwrapper v0.25.0 - github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e + github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0 github.com/ray-project/kuberay/ray-operator v1.1.1 go.uber.org/zap v1.27.0 golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 diff --git a/go.sum b/go.sum index 8ba1d5b6..bbb7e9f0 100644 --- a/go.sum +++ b/go.sum @@ -249,8 +249,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/project-codeflare/appwrapper v0.25.1-0.20240926155059-30a8af17b8f4 h1:XYjq50WpGxagELHurCXyiirvdM9OzxTnCMcQC9gebnQ= github.com/project-codeflare/appwrapper v0.25.1-0.20240926155059-30a8af17b8f4/go.mod h1:zDALq3/gn+eiczpD7TBZWWbAVuwcCGDFuN/77oh+CDw= -github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e h1:juFd1dQyioeMxbVE6F0YD25ozm/jiqJE+MpDhu8p22k= -github.com/project-codeflare/codeflare-common v0.0.0-20240628111341-56c962a09b7e/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y= +github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0 h1:5gfJUhF2GRZIXCUK/aUYTo79Ipo4Ngg9HO8Jgj8zThM= +github.com/project-codeflare/codeflare-common v0.0.0-20240927111823-758dad4e90d0/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y= github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg= github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= diff --git a/test/e2e/mnist_pytorch_appwrapper_test.go b/test/e2e/mnist_pytorch_appwrapper_test.go index ab142658..1919c616 100644 --- a/test/e2e/mnist_pytorch_appwrapper_test.go +++ b/test/e2e/mnist_pytorch_appwrapper_test.go @@ -28,23 +28,30 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/kueue/apis/kueue/v1beta1" ) func TestMnistPyTorchAppWrapperCpu(t *testing.T) { - runMnistPyTorchAppWrapper(t, "cpu") + runMnistPyTorchAppWrapper(t, "cpu", 0) } func TestMnistPyTorchAppWrapperGpu(t *testing.T) { - runMnistPyTorchAppWrapper(t, "gpu") + runMnistPyTorchAppWrapper(t, "gpu", 1) } // Trains the MNIST dataset as a batch Job in an AppWrapper, and asserts successful completion of the training job. -func runMnistPyTorchAppWrapper(t *testing.T, accelerator string) { +func runMnistPyTorchAppWrapper(t *testing.T, accelerator string, numberOfGpus int) { test := With(t) - // Create a namespace and localqueue in that namespace + // Create a namespace namespace := test.NewTestNamespace() - localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue") + + // Create Kueue resources + resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{}) + defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{}) + clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus) + defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{}) + localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue) // Test configuration config := &corev1.ConfigMap{ diff --git a/test/e2e/mnist_rayjob_raycluster_test.go b/test/e2e/mnist_rayjob_raycluster_test.go index b1408ed4..c66fb28f 100644 --- a/test/e2e/mnist_rayjob_raycluster_test.go +++ b/test/e2e/mnist_rayjob_raycluster_test.go @@ -33,6 +33,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/kueue/apis/kueue/v1beta1" ) // Trains the MNIST dataset as a RayJob, executed by a Ray cluster @@ -49,9 +50,15 @@ func TestMnistRayJobRayClusterGpu(t *testing.T) { func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int) { test := With(t) - // Create a namespace and localqueue in that namespace + // Create a namespace namespace := test.NewTestNamespace() - localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue") + + // Create Kueue resources + resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{}) + defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{}) + clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus) + defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{}) + CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue) // Create MNIST training script mnist := constructMNISTConfigMap(test, namespace) @@ -61,7 +68,6 @@ func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int // Create RayCluster and assign it to the localqueue rayCluster := constructRayCluster(test, namespace, mnist, numberOfGpus) - AssignToLocalQueue(rayCluster, localQueue) rayCluster, err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Create(test.Ctx(), rayCluster, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name) @@ -78,8 +84,8 @@ func runMnistRayJobRayCluster(t *testing.T, accelerator string, numberOfGpus int rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name) - test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String()) - rayClient := NewRayClusterClient(rayDashboardURL) + test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL) + rayClient := GetRayClusterClient(test, rayDashboardURL, test.Config().BearerToken) test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name) test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong). @@ -111,9 +117,15 @@ func TestMnistRayJobRayClusterAppWrapperGpu(t *testing.T) { func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, numberOfGpus int) { test := With(t) - // Create a namespace and localqueue in that namespace + // Create a namespace namespace := test.NewTestNamespace() - localQueue := CreateKueueLocalQueue(test, namespace.Name, "e2e-cluster-queue") + + // Create Kueue resources + resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{}) + defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{}) + clusterQueue := createClusterQueue(test, resourceFlavor, numberOfGpus) + defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{}) + localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue) // Create MNIST training script mnist := constructMNISTConfigMap(test, namespace) @@ -167,8 +179,8 @@ func runMnistRayJobRayClusterAppWrapper(t *testing.T, accelerator string, number rayDashboardURL := getRayDashboardURL(test, rayCluster.Namespace, rayCluster.Name) - test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String()) - rayClient := NewRayClusterClient(rayDashboardURL) + test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL) + rayClient := GetRayClusterClient(test, rayDashboardURL, test.Config().BearerToken) test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name) test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong). @@ -374,7 +386,7 @@ func constructRayJob(_ Test, namespace *corev1.Namespace, rayCluster *rayv1.RayC } } -func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL { +func getRayDashboardURL(test Test, namespace, rayClusterName string) string { dashboardName := "ray-dashboard-" + rayClusterName if IsOpenShift(test) { @@ -396,10 +408,10 @@ func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL { return resp.StatusCode, nil }, TestTimeoutShort).Should(Not(Equal(503))) - return url.URL{ - Scheme: "https", - Host: hostname, - } + dashboardUrl, _ := url.Parse("https://" + hostname) + test.T().Logf("Ray-dashboard route : %s\n", dashboardUrl.String()) + + return dashboardUrl.String() } ingress := GetIngress(test, namespace, dashboardName) @@ -408,8 +420,41 @@ func getRayDashboardURL(test Test, namespace, rayClusterName string) url.URL { test.Eventually(Ingress(test, ingress.Namespace, ingress.Name), TestTimeoutShort). Should(WithTransform(LoadBalancerIngresses, HaveLen(1))) - return url.URL{ - Scheme: "http", - Host: ingress.Spec.Rules[0].Host, + hostname := ingress.Spec.Rules[0].Host + dashboardUrl, _ := url.Parse("http://" + hostname) + test.T().Logf("Ray-dashboard route : %s\n", dashboardUrl.String()) + + return dashboardUrl.String() +} + +// Create ClusterQueue +func createClusterQueue(test Test, resourceFlavor *v1beta1.ResourceFlavor, numberOfGpus int) *v1beta1.ClusterQueue { + cqSpec := v1beta1.ClusterQueueSpec{ + NamespaceSelector: &metav1.LabelSelector{}, + ResourceGroups: []v1beta1.ResourceGroup{ + { + CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")}, + Flavors: []v1beta1.FlavorQuotas{ + { + Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name), + Resources: []v1beta1.ResourceQuota{ + { + Name: corev1.ResourceCPU, + NominalQuota: resource.MustParse("8"), + }, + { + Name: corev1.ResourceMemory, + NominalQuota: resource.MustParse("12Gi"), + }, + { + Name: corev1.ResourceName("nvidia.com/gpu"), + NominalQuota: resource.MustParse(fmt.Sprint(numberOfGpus)), + }, + }, + }, + }, + }, + }, } + return CreateKueueClusterQueue(test, cqSpec) } diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh index 1ee1d90f..71b5b12a 100755 --- a/test/e2e/setup.sh +++ b/test/e2e/setup.sh @@ -69,30 +69,3 @@ done echo "" sleep 5 -echo Creating Kueue ResourceFlavor and ClusterQueue -cat <