diff --git a/test/e2e/install-codeflare-sdk.sh b/test/e2e/install-codeflare-sdk.sh deleted file mode 100755 index e90f4071..00000000 --- a/test/e2e/install-codeflare-sdk.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# go to codeflare-sdk folder and install codeflare-sdk -cd .. - -# Install Poetry and configure virtualenvs -pip install poetry -poetry config virtualenvs.create false - -cd codeflare-sdk -# Clone the CodeFlare SDK repository -git clone --branch main https://github.com/project-codeflare/codeflare-sdk.git - -cd codeflare-sdk - -# Lock dependencies and install them -poetry lock --no-update -poetry install --with test,docs - -# Return to the workdir -cd ../.. -cd workdir diff --git a/test/e2e/mnist_raycluster_sdk.py b/test/e2e/mnist_raycluster_sdk.py deleted file mode 100644 index d171c9b0..00000000 --- a/test/e2e/mnist_raycluster_sdk.py +++ /dev/null @@ -1,86 +0,0 @@ -import sys -import os - -from time import sleep - -from torchx.specs.api import AppState, is_terminal - -from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration -from codeflare_sdk.job.jobs import DDPJobDefinition - -namespace = sys.argv[1] -ray_image = os.getenv('RAY_IMAGE') -host = os.getenv('CLUSTER_HOSTNAME') - -ingress_options = {} -if host is not None: - ingress_options = { - "ingresses": [ - { - "ingressName": "ray-dashboard", - "port": 8265, - "pathType": "Prefix", - "path": "/", - "host": host, - }, - ] - } - - -cluster = Cluster(ClusterConfiguration( - name='mnist', - namespace=namespace, - num_workers=1, - head_cpus='500m', - head_memory=2, - min_cpus='500m', - max_cpus=1, - min_memory=0.5, - max_memory=2, - num_gpus=0, - instascale=False, - image=ray_image, - ingress_options=ingress_options, -)) - -cluster.up() - -cluster.status() - -cluster.wait_ready() - -cluster.status() - -cluster.details() - -jobdef = DDPJobDefinition( - name="mnist", - script="mnist.py", - scheduler_args={"requirements": "requirements.txt"}, -) -job = jobdef.submit(cluster) - -done = False -time = 0 -timeout = 900 -while not done: - status = job.status() - if is_terminal(status.state): - break - if not done: - print(status) - if timeout and time >= timeout: - raise TimeoutError(f"job has timed out after waiting {timeout}s") - sleep(5) - time += 5 - -print(f"Job has completed: {status.state}") - -print(job.logs()) - -cluster.down() - -if not status.state == AppState.SUCCEEDED: - exit(1) -else: - exit(0) diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go deleted file mode 100644 index be6933f6..00000000 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ /dev/null @@ -1,215 +0,0 @@ -/* -Copyright 2023. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package e2e - -import ( - "testing" - - . "github.com/onsi/gomega" - . "github.com/project-codeflare/codeflare-common/support" - mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// Creates a Ray cluster, and trains the MNIST dataset using the CodeFlare SDK. -// Asserts successful completion of the training job. -// -// This covers the installation of the CodeFlare SDK, as well as the RBAC required -// for the SDK to successfully perform requests to the cluster, on behalf of the -// impersonated user. -func TestMNISTRayClusterSDK(t *testing.T) { - test := With(t) - test.T().Parallel() - - // Create a namespace - namespace := test.NewTestNamespace() - - // Test configuration - config := CreateConfigMap(test, namespace.Name, map[string][]byte{ - // SDK script - "mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"), - // pip requirements - "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), - // MNIST training script - "mnist.py": ReadFile(test, "mnist.py"), - // codeflare-sdk installation script - "install-codeflare-sdk.sh": ReadFile(test, "install-codeflare-sdk.sh"), - }) - - // Create RBAC, retrieve token for user with limited rights - policyRules := []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, - APIGroups: []string{mcadv1beta1.GroupName}, - Resources: []string{"appwrappers"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{rayv1.GroupVersion.Group}, - Resources: []string{"rayclusters", "rayclusters/status"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"route.openshift.io"}, - Resources: []string{"routes"}, - }, - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"networking.k8s.io"}, - Resources: []string{"ingresses"}, - }, - } - - // Create cluster wide RBAC, required for SDK OpenShift check - // TODO reevaluate once SDK change OpenShift detection logic - clusterPolicyRules := []rbacv1.PolicyRule{ - { - Verbs: []string{"get", "list"}, - APIGroups: []string{"config.openshift.io"}, - Resources: []string{"ingresses"}, - ResourceNames: []string{"cluster"}, - }, - } - - sa := CreateServiceAccount(test, namespace.Name) - role := CreateRole(test, namespace.Name, policyRules) - CreateRoleBinding(test, namespace.Name, sa, role) - clusterRole := CreateClusterRole(test, clusterPolicyRules) - CreateClusterRoleBinding(test, sa, clusterRole) - - job := &batchv1.Job{ - TypeMeta: metav1.TypeMeta{ - APIVersion: batchv1.SchemeGroupVersion.String(), - Kind: "Job", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "sdk", - Namespace: namespace.Name, - }, - Spec: batchv1.JobSpec{ - Completions: Ptr(int32(1)), - Parallelism: Ptr(int32(1)), - BackoffLimit: Ptr(int32(0)), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "test", - // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed - // See https://github.com/project-codeflare/codeflare-sdk/pull/146 - Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", - Env: []corev1.EnvVar{ - {Name: "PYTHONUSERBASE", Value: "/workdir"}, - {Name: "RAY_IMAGE", Value: GetRayImage()}, - }, - Command: []string{"/bin/sh", "-c", "cp /test/* . && chmod +x install-codeflare-sdk.sh && ./install-codeflare-sdk.sh && python mnist_raycluster_sdk.py" + " " + namespace.Name}, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "test", - MountPath: "/test", - }, - { - Name: "codeflare-sdk", - MountPath: "/codeflare-sdk", - }, - { - Name: "workdir", - MountPath: "/workdir", - }, - }, - WorkingDir: "/workdir", - SecurityContext: &corev1.SecurityContext{ - AllowPrivilegeEscalation: Ptr(false), - SeccompProfile: &corev1.SeccompProfile{ - Type: "RuntimeDefault", - }, - Capabilities: &corev1.Capabilities{ - Drop: []corev1.Capability{"ALL"}, - }, - RunAsNonRoot: Ptr(true), - }, - }, - }, - Volumes: []corev1.Volume{ - { - Name: "test", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: config.Name, - }, - }, - }, - }, - { - Name: "codeflare-sdk", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - { - Name: "workdir", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - }, - RestartPolicy: corev1.RestartPolicyNever, - ServiceAccountName: sa.Name, - }, - }, - }, - } - if GetClusterType(test) == KindCluster { - // Take first KinD node and redirect pod hostname requests there - node := GetNodes(test)[0] - hostname := GetClusterHostname(test) - IP := GetNodeInternalIP(test, node) - - test.T().Logf("Setting KinD cluster hostname '%s' to node IP '%s' for SDK pod", hostname, IP) - job.Spec.Template.Spec.HostAliases = []corev1.HostAlias{ - { - IP: IP, - Hostnames: []string{hostname}, - }, - } - - // Propagate hostname into Python code as env variable - hostnameEnvVar := corev1.EnvVar{Name: "CLUSTER_HOSTNAME", Value: hostname} - job.Spec.Template.Spec.Containers[0].Env = append(job.Spec.Template.Spec.Containers[0].Env, hostnameEnvVar) - } - - job, err := test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) - - test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( - Or( - WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), - WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), - )) - - // Assert the job has completed successfully - test.Expect(GetJob(test, job.Namespace, job.Name)). - To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) -}