From 126995f7a6b743ffc21fa97ad8ddeb24113eb5d3 Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Mon, 4 Sep 2023 15:16:36 +0100 Subject: [PATCH 01/15] Adding instascale e2e test --- test/e2e/instascale/instascale_test.go | 133 +++++++++++++++++++++++++ test/support/clusterpools.go | 92 +++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 test/e2e/instascale/instascale_test.go create mode 100644 test/support/clusterpools.go diff --git a/test/e2e/instascale/instascale_test.go b/test/e2e/instascale/instascale_test.go new file mode 100644 index 00000000..a4439130 --- /dev/null +++ b/test/e2e/instascale/instascale_test.go @@ -0,0 +1,133 @@ +package e2e + +import ( + "fmt" + . "github.com/onsi/gomega" + . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "testing" +) + +var ( + ocmToken string + machinePoolsExist bool + numInitialNodePools int + numInitialMachineSets int +) + +func TestInstascale(t *testing.T) { + + test := With(t) + test.T().Parallel() + + namespace := test.NewTestNamespace() + + connection, err := CreateOCMConnection() + if err != nil { + test.T().Errorf("Unable to create ocm connection - Error : %v", err) + } + defer connection.Close() + + machinePoolsExist = true + // check existing machine pools + numInitialMachinePools, err := MachinePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count machine pools - Error : %v", err) + } + + if numInitialMachinePools == 0 { + machinePoolsExist = false + numInitialNodePools, err = NodePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count node pools - Error : %v", err) + } + if numInitialNodePools == 0 { + numInitialMachineSets, err = MachineSetsCount(connection) + if err != nil { + test.T().Errorf("Unable to count machine sets - Error : %v", err) + } + } + } + + // create an appwrapper + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-instascale", + Namespace: namespace.Name, + Labels: map[string]string{ + "orderedinstance": "m5.xlarge_g4dn.xlarge", + }, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + }, + }, + }, + }, + } + + aw, err = test.Client().MCAD().McadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) + + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + if !machinePoolsExist { + numNodePools, err := NodePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count node pools - Error : %v", err) + } + fmt.Println(numNodePools) + test.Expect(numNodePools).To(BeNumerically(">", numInitialNodePools)) + test.T().Logf("number of machine pools increased") + + } else if machinePoolsExist { + numMachinePools, err := MachinePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count machine pools - Error : %v", err) + } + fmt.Println(numMachinePools) + test.Expect(numMachinePools).To(BeNumerically(">", numInitialMachinePools)) + test.T().Logf("number of machine pools increased") + } else { + numMachineSets, err := MachineSetsCount(connection) + if err != nil { + test.T().Errorf("Unable to count machine sets - Error : %v", err) + } + fmt.Println(numMachineSets) + test.Expect(numMachineSets).To(BeNumerically(">", numInitialMachineSets)) + test.T().Logf("number of machine sets increased") + } + + // TODO submit and check that the job has completed and that resources are released/scaled down +} diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go new file mode 100644 index 00000000..7462e7a8 --- /dev/null +++ b/test/support/clusterpools.go @@ -0,0 +1,92 @@ +package support + +import ( + "context" + "fmt" + "os" + + ocmsdk "github.com/openshift-online/ocm-sdk-go" + mapiclientset "github.com/openshift/client-go/machine/clientset/versioned" + "github.com/openshift/client-go/machine/listers/machine/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var ( + ocmToken string = os.Getenv("OCMTOKEN") + ClusterID string = os.Getenv("CLUSTERID") + machinePoolsExist bool + machineClient mapiclientset.Interface + msLister v1beta1.MachineSetLister +) + +const ( + namespaceToList = "openshift-machine-api" +) + +func CreateOCMConnection() (*ocmsdk.Connection, error) { + + logger, err := ocmsdk.NewGoLoggerBuilder(). + Debug(false). + Build() + if err != nil { + fmt.Fprintf(os.Stderr, "Can't build logger: %v\n", err) + os.Exit(1) + } + + connection, err := ocmsdk.NewConnectionBuilder(). + Logger(logger). + Tokens(ocmToken). + Build() + fmt.Println("connection", connection, err) + if err != nil || connection == nil { + fmt.Println("something went wrong", connection, err) + fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err) + os.Exit(1) + } + + return connection, nil +} + + +func MachinePoolsCount(connection *ocmsdk.Connection) (numMachinePools int, err error) { + fmt.Println("clusterID %v", ClusterID) + machinePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List() + fmt.Println("machine pools connection %v", machinePoolsConnection) + + machinePoolsListResponse, err := machinePoolsConnection.Send() + if err != nil { + fmt.Println("machine pools list response, %v error, %v", machinePoolsListResponse, err) + return 0, fmt.Errorf("unable to send request, error: %v", err) + } + machinePoolsList := machinePoolsListResponse.Items() + fmt.Println("machine pool list %v", machinePoolsList) + //check the current number of machine pools + // TODO to be more precise could we check the machineTypes? + numMachinePools = machinePoolsList.Len() + fmt.Println(numMachinePools) + + return numMachinePools, nil +} + +func NodePoolsCount(connection *ocmsdk.Connection) (numNodePools int, err error) { + nodePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).NodePools().List() + nodePoolsListResponse, err := nodePoolsConnection.SendContext(context.Background()) + if err != nil { + return 0, fmt.Errorf("unable to send request, error: %v", err) + } + nodePoolsList := nodePoolsListResponse.Items() + numNodePools = nodePoolsList.Len() + fmt.Println(numNodePools) + + return numNodePools, nil +} + +func MachineSetsCount(connection *ocmsdk.Connection) (numMachineSets int, err error) { + machineSets, err := machineClient.MachineV1beta1().MachineSets(namespaceToList).List(context.Background(), metav1.ListOptions{}) + if err != nil { + return 0, fmt.Errorf("error while listing machine sets, error: %v", err) + } + machineSetsSize := machineSets.ListMeta.Size() + + return machineSetsSize, nil +} From d1ebfb14031efa5b4e0d97bdfa2a06a1ffb5f9ee Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Tue, 12 Sep 2023 10:08:34 +0100 Subject: [PATCH 02/15] update to test --- test/e2e/instascale/instascale_test.go | 133 ------------- test/e2e/instascale_test.go | 264 +++++++++++++++++++++++++ test/support/clusterpools.go | 22 +-- 3 files changed, 267 insertions(+), 152 deletions(-) delete mode 100644 test/e2e/instascale/instascale_test.go create mode 100644 test/e2e/instascale_test.go diff --git a/test/e2e/instascale/instascale_test.go b/test/e2e/instascale/instascale_test.go deleted file mode 100644 index a4439130..00000000 --- a/test/e2e/instascale/instascale_test.go +++ /dev/null @@ -1,133 +0,0 @@ -package e2e - -import ( - "fmt" - . "github.com/onsi/gomega" - . "github.com/project-codeflare/codeflare-operator/test/support" - mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "testing" -) - -var ( - ocmToken string - machinePoolsExist bool - numInitialNodePools int - numInitialMachineSets int -) - -func TestInstascale(t *testing.T) { - - test := With(t) - test.T().Parallel() - - namespace := test.NewTestNamespace() - - connection, err := CreateOCMConnection() - if err != nil { - test.T().Errorf("Unable to create ocm connection - Error : %v", err) - } - defer connection.Close() - - machinePoolsExist = true - // check existing machine pools - numInitialMachinePools, err := MachinePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count machine pools - Error : %v", err) - } - - if numInitialMachinePools == 0 { - machinePoolsExist = false - numInitialNodePools, err = NodePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count node pools - Error : %v", err) - } - if numInitialNodePools == 0 { - numInitialMachineSets, err = MachineSetsCount(connection) - if err != nil { - test.T().Errorf("Unable to count machine sets - Error : %v", err) - } - } - } - - // create an appwrapper - aw := &mcadv1beta1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-instascale", - Namespace: namespace.Name, - Labels: map[string]string{ - "orderedinstance": "m5.xlarge_g4dn.xlarge", - }, - }, - Spec: mcadv1beta1.AppWrapperSpec{ - AggrResources: mcadv1beta1.AppWrapperResourceList{ - GenericItems: []mcadv1beta1.AppWrapperGenericResource{ - { - CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ - { - Replicas: 1, - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("250m"), - corev1.ResourceMemory: resource.MustParse("512Mi"), - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1G"), - }, - }, - { - Replicas: 1, - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("250m"), - corev1.ResourceMemory: resource.MustParse("512Mi"), - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1G"), - }, - }, - }, - }, - }, - }, - }, - } - - aw, err = test.Client().MCAD().McadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) - - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). - Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) - - if !machinePoolsExist { - numNodePools, err := NodePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count node pools - Error : %v", err) - } - fmt.Println(numNodePools) - test.Expect(numNodePools).To(BeNumerically(">", numInitialNodePools)) - test.T().Logf("number of machine pools increased") - - } else if machinePoolsExist { - numMachinePools, err := MachinePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count machine pools - Error : %v", err) - } - fmt.Println(numMachinePools) - test.Expect(numMachinePools).To(BeNumerically(">", numInitialMachinePools)) - test.T().Logf("number of machine pools increased") - } else { - numMachineSets, err := MachineSetsCount(connection) - if err != nil { - test.T().Errorf("Unable to count machine sets - Error : %v", err) - } - fmt.Println(numMachineSets) - test.Expect(numMachineSets).To(BeNumerically(">", numInitialMachineSets)) - test.T().Logf("number of machine sets increased") - } - - // TODO submit and check that the job has completed and that resources are released/scaled down -} diff --git a/test/e2e/instascale_test.go b/test/e2e/instascale_test.go new file mode 100644 index 00000000..f2f26b19 --- /dev/null +++ b/test/e2e/instascale_test.go @@ -0,0 +1,264 @@ +package e2e + +import ( + "sync" + "testing" + "time" + + . "github.com/onsi/gomega" + . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var ( + machinePoolsExist bool + numInitialNodePools int + numInitialMachineSets int + wg = &sync.WaitGroup{} +) + +func TestInstascale(t *testing.T) { + + test := With(t) + test.T().Parallel() + + namespace := test.NewTestNamespace() + + // Test configuration + config := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist-mcad", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + }, + Immutable: Ptr(true), + } + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) + + // create OCM connection + instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets("default").Get(test.Ctx(), "instascale-ocm-secret", metav1.GetOptions{}) + if err != nil { + test.T().Errorf("unable to retrieve instascale-ocm-secret - Error : %v", err) + } + test.Expect(err).NotTo(HaveOccurred()) + ocmToken := string(instascaleOCMSecret.Data["token"]) + test.T().Logf("Retrieved Secret %s successfully", instascaleOCMSecret.Name) + + connection, err := CreateOCMConnection(ocmToken) + if err != nil { + test.T().Errorf("Unable to create ocm connection - Error : %v", err) + } + defer connection.Close() + + machinePoolsExist = true + // check existing cluster resources + numInitialMachinePools, err := MachinePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count machine pools - Error : %v", err) + } + + if numInitialMachinePools == 0 { + machinePoolsExist = false + numInitialNodePools, err = NodePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count node pools - Error : %v", err) + } + if numInitialNodePools == 0 { + numInitialMachineSets, err = MachineSetsCount() + if err != nil { + test.T().Errorf("Unable to count machine sets - Error : %v", err) + } + } + } + + // Batch Job + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "job", + Image: GetPyTorchImage(), + Env: []corev1.EnvVar{ + corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/test2"}, + }, + Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, + Args: []string{"$PYTHONUSERBASE"}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + { + Name: "test2", + MountPath: "/test2", + }, + }, + WorkingDir: "/test2", + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + { + Name: "test2", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + } + + // create an appwrapper + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-instascale", + Namespace: namespace.Name, + Labels: map[string]string{ + "orderedinstance": "m5.xlarge_g4dn.xlarge", + }, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + GenericTemplate: Raw(test, job), + }, + }, + }, + }, + } + + _, err = test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) + + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + // wait for required resources to be created before checking them again + time.Sleep(TestTimeoutShort) + if !machinePoolsExist { + numNodePools, err := NodePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count node pools - Error : %v", err) + } + test.Expect(numNodePools).To(BeNumerically(">", numInitialNodePools)) + test.T().Logf("number of node pools increased from %d to %d", numInitialNodePools, numNodePools) + + } else if machinePoolsExist { + numMachinePools, err := MachinePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count machine pools - Error : %v", err) + } + test.Expect(numMachinePools).To(BeNumerically(">", numInitialMachinePools)) + test.T().Logf("number of machine pools increased from %d to %d", numInitialMachinePools, numMachinePools) + } else { + numMachineSets, err := MachineSetsCount() + if err != nil { + test.T().Errorf("Unable to count machine sets - Error : %v", err) + } + test.Expect(numMachineSets).To(BeNumerically(">", numInitialMachineSets)) + test.T().Logf("number of machine sets increased from %d to %d", numInitialMachineSets, numMachineSets) + } + + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + + // AppWrapper not being updated to complete once job is finished + + time.Sleep(TestTimeoutMedium) + if !machinePoolsExist { + numNodePoolsFinal, err := NodePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count node pools - Error : %v", err) + } + test.Expect(numNodePoolsFinal).To(BeNumerically("==", numInitialNodePools)) + test.T().Logf("number of machine pools decreased") + + } else if machinePoolsExist { + numMachinePoolsFinal, err := MachinePoolsCount(connection) + if err != nil { + test.T().Errorf("Unable to count machine pools - Error : %v", err) + } + test.Expect(numMachinePoolsFinal).To(BeNumerically("==", numInitialMachinePools)) + test.T().Logf("number of machine pools decreased") + } else { + numMachineSetsFinal, err := MachineSetsCount() + if err != nil { + test.T().Errorf("Unable to count machine sets - Error : %v", err) + } + test.Expect(numMachineSetsFinal).To(BeNumerically("==", numInitialMachineSets)) + test.T().Logf("number of machine sets decreased") + } +} diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index 7462e7a8..43d5b5bb 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -7,24 +7,19 @@ import ( ocmsdk "github.com/openshift-online/ocm-sdk-go" mapiclientset "github.com/openshift/client-go/machine/clientset/versioned" - "github.com/openshift/client-go/machine/listers/machine/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var ( - ocmToken string = os.Getenv("OCMTOKEN") ClusterID string = os.Getenv("CLUSTERID") - machinePoolsExist bool machineClient mapiclientset.Interface - msLister v1beta1.MachineSetLister ) const ( namespaceToList = "openshift-machine-api" ) -func CreateOCMConnection() (*ocmsdk.Connection, error) { - +func CreateOCMConnection(secret string) (*ocmsdk.Connection, error) { logger, err := ocmsdk.NewGoLoggerBuilder(). Debug(false). Build() @@ -32,14 +27,11 @@ func CreateOCMConnection() (*ocmsdk.Connection, error) { fmt.Fprintf(os.Stderr, "Can't build logger: %v\n", err) os.Exit(1) } - connection, err := ocmsdk.NewConnectionBuilder(). Logger(logger). - Tokens(ocmToken). + Tokens(string(secret)). Build() - fmt.Println("connection", connection, err) if err != nil || connection == nil { - fmt.Println("something went wrong", connection, err) fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err) os.Exit(1) } @@ -49,21 +41,13 @@ func CreateOCMConnection() (*ocmsdk.Connection, error) { func MachinePoolsCount(connection *ocmsdk.Connection) (numMachinePools int, err error) { - fmt.Println("clusterID %v", ClusterID) machinePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List() - fmt.Println("machine pools connection %v", machinePoolsConnection) - machinePoolsListResponse, err := machinePoolsConnection.Send() if err != nil { - fmt.Println("machine pools list response, %v error, %v", machinePoolsListResponse, err) return 0, fmt.Errorf("unable to send request, error: %v", err) } machinePoolsList := machinePoolsListResponse.Items() - fmt.Println("machine pool list %v", machinePoolsList) - //check the current number of machine pools - // TODO to be more precise could we check the machineTypes? numMachinePools = machinePoolsList.Len() - fmt.Println(numMachinePools) return numMachinePools, nil } @@ -81,7 +65,7 @@ func NodePoolsCount(connection *ocmsdk.Connection) (numNodePools int, err error) return numNodePools, nil } -func MachineSetsCount(connection *ocmsdk.Connection) (numMachineSets int, err error) { +func MachineSetsCount() (numMachineSets int, err error) { machineSets, err := machineClient.MachineV1beta1().MachineSets(namespaceToList).List(context.Background(), metav1.ListOptions{}) if err != nil { return 0, fmt.Errorf("error while listing machine sets, error: %v", err) From b4d4e4ec6117beac4b7af36a466a815ad0e516a2 Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Fri, 22 Sep 2023 16:01:34 +0100 Subject: [PATCH 03/15] refactored to check for specific machine names --- test/e2e/instascale_test.go | 137 +++++++++++++++-------------------- test/support/clusterpools.go | 44 ++++++++--- test/support/support.go | 1 + 3 files changed, 93 insertions(+), 89 deletions(-) diff --git a/test/e2e/instascale_test.go b/test/e2e/instascale_test.go index f2f26b19..f9b6fd58 100644 --- a/test/e2e/instascale_test.go +++ b/test/e2e/instascale_test.go @@ -1,7 +1,6 @@ package e2e import ( - "sync" "testing" "time" @@ -14,13 +13,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -var ( - machinePoolsExist bool - numInitialNodePools int - numInitialMachineSets int - wg = &sync.WaitGroup{} -) - func TestInstascale(t *testing.T) { test := With(t) @@ -65,25 +57,25 @@ func TestInstascale(t *testing.T) { } defer connection.Close() - machinePoolsExist = true // check existing cluster resources - numInitialMachinePools, err := MachinePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count machine pools - Error : %v", err) - } + machinePoolsExist, err := MachinePoolsExist(connection) + test.Expect(err).NotTo(HaveOccurred()) + nodePoolsExist, err := NodePoolsExist(connection) + test.Expect(err).NotTo(HaveOccurred()) - if numInitialMachinePools == 0 { - machinePoolsExist = false - numInitialNodePools, err = NodePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count node pools - Error : %v", err) - } - if numInitialNodePools == 0 { - numInitialMachineSets, err = MachineSetsCount() - if err != nil { - test.T().Errorf("Unable to count machine sets - Error : %v", err) - } - } + if machinePoolsExist { + // look for machine pool with aw name - expect not to find it + foundMachinePool, err := CheckMachinePools(connection, "test-instascale") + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachinePool).To(BeFalse()) + } else if nodePoolsExist { + // look for node pool with aw name - expect not to find it + foundNodePool, err := CheckNodePools(connection, "test-instascale") + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundNodePool).To(BeFalse()) + } else { + // TODO update to foundMachineSet + } // Batch Job @@ -103,13 +95,13 @@ func TestInstascale(t *testing.T) { Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "job", - Image: GetPyTorchImage(), + Name: "job", + Image: GetPyTorchImage(), Env: []corev1.EnvVar{ corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/test2"}, }, Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, - Args: []string{"$PYTHONUSERBASE"}, + Args: []string{"$PYTHONUSERBASE"}, VolumeMounts: []corev1.VolumeMount{ { Name: "test", @@ -184,46 +176,41 @@ func TestInstascale(t *testing.T) { }, }, }, - GenericTemplate: Raw(test, job), + GenericTemplate: Raw(test, job), + CompletionStatus: "Complete", }, }, }, }, } - _, err = test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) + _, err = test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) - // wait for required resources to be created before checking them again - time.Sleep(TestTimeoutShort) - if !machinePoolsExist { - numNodePools, err := NodePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count node pools - Error : %v", err) - } - test.Expect(numNodePools).To(BeNumerically(">", numInitialNodePools)) - test.T().Logf("number of node pools increased from %d to %d", numInitialNodePools, numNodePools) - - } else if machinePoolsExist { - numMachinePools, err := MachinePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count machine pools - Error : %v", err) - } - test.Expect(numMachinePools).To(BeNumerically(">", numInitialMachinePools)) - test.T().Logf("number of machine pools increased from %d to %d", numInitialMachinePools, numMachinePools) + // time.Sleep is used twice throughout the test, each for 30 seconds. Can look into using sync package waitGroup instead if that makes more sense + // wait for required resources to scale up before checking them again + time.Sleep(TestTimeoutThirtySeconds) + + if machinePoolsExist { + // look for machine pool with aw name - expect to find it + foundMachinePool, err := CheckMachinePools(connection, "test-instascale") + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachinePool).To(BeTrue()) + } else if nodePoolsExist { + // look for node pool with aw name - expect to find it + foundNodePool, err := CheckNodePools(connection, "test-instascale") + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundNodePool).To(BeTrue()) } else { - numMachineSets, err := MachineSetsCount() - if err != nil { - test.T().Errorf("Unable to count machine sets - Error : %v", err) - } - test.Expect(numMachineSets).To(BeNumerically(">", numInitialMachineSets)) - test.T().Logf("number of machine sets increased from %d to %d", numInitialMachineSets, numMachineSets) + // TODO update to foundMachineSet + } - + + // Assert that the job has completed test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( Or( @@ -235,30 +222,24 @@ func TestInstascale(t *testing.T) { test.Expect(GetJob(test, job.Namespace, job.Name)). To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) - // AppWrapper not being updated to complete once job is finished - - time.Sleep(TestTimeoutMedium) - if !machinePoolsExist { - numNodePoolsFinal, err := NodePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count node pools - Error : %v", err) - } - test.Expect(numNodePoolsFinal).To(BeNumerically("==", numInitialNodePools)) - test.T().Logf("number of machine pools decreased") - - } else if machinePoolsExist { - numMachinePoolsFinal, err := MachinePoolsCount(connection) - if err != nil { - test.T().Errorf("Unable to count machine pools - Error : %v", err) - } - test.Expect(numMachinePoolsFinal).To(BeNumerically("==", numInitialMachinePools)) - test.T().Logf("number of machine pools decreased") + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted))) + + // allow time for the resources to scale down before checking them again + time.Sleep(TestTimeoutThirtySeconds) + + if machinePoolsExist { + // look for machine pool with aw name - expect to find it + foundMachinePool, err := CheckMachinePools(connection, "test-instascale") + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachinePool).To(BeFalse()) + } else if nodePoolsExist { + // look for node pool with aw name - expect to find it + foundNodePool, err := CheckNodePools(connection, "test-instascale") + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundNodePool).To(BeFalse()) } else { - numMachineSetsFinal, err := MachineSetsCount() - if err != nil { - test.T().Errorf("Unable to count machine sets - Error : %v", err) - } - test.Expect(numMachineSetsFinal).To(BeNumerically("==", numInitialMachineSets)) - test.T().Logf("number of machine sets decreased") + // TODO update to foundMachineSet + } } diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index 43d5b5bb..c0f45dee 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -4,15 +4,17 @@ import ( "context" "fmt" "os" + "strings" ocmsdk "github.com/openshift-online/ocm-sdk-go" + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" mapiclientset "github.com/openshift/client-go/machine/clientset/versioned" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var ( - ClusterID string = os.Getenv("CLUSTERID") - machineClient mapiclientset.Interface + ClusterID string = os.Getenv("CLUSTERID") + machineClient mapiclientset.Interface ) const ( @@ -39,30 +41,50 @@ func CreateOCMConnection(secret string) (*ocmsdk.Connection, error) { return connection, nil } +func MachinePoolsExist(connection *ocmsdk.Connection) (bool, error) { + machinePools := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools() + return machinePools != nil, nil +} + +func NodePoolsExist(connection *ocmsdk.Connection) (bool, error) { + nodePools := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).NodePools() + return nodePools != nil, nil +} -func MachinePoolsCount(connection *ocmsdk.Connection) (numMachinePools int, err error) { +func CheckMachinePools(connection *ocmsdk.Connection, awName string) (foundMachinePool bool, err error) { machinePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List() machinePoolsListResponse, err := machinePoolsConnection.Send() if err != nil { - return 0, fmt.Errorf("unable to send request, error: %v", err) + return false, fmt.Errorf("unable to send request, error: %v", err) } machinePoolsList := machinePoolsListResponse.Items() - numMachinePools = machinePoolsList.Len() + machinePoolsList.Range(func(index int, item *cmv1.MachinePool) bool { + instanceName, _ := item.GetID() + if strings.Contains(instanceName, awName) { + foundMachinePool = true + } + return true + }) - return numMachinePools, nil + return foundMachinePool, err } -func NodePoolsCount(connection *ocmsdk.Connection) (numNodePools int, err error) { +func CheckNodePools(connection *ocmsdk.Connection, awName string) (foundNodePool bool, err error) { nodePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).NodePools().List() nodePoolsListResponse, err := nodePoolsConnection.SendContext(context.Background()) if err != nil { - return 0, fmt.Errorf("unable to send request, error: %v", err) + return false, fmt.Errorf("unable to send request, error: %v", err) } nodePoolsList := nodePoolsListResponse.Items() - numNodePools = nodePoolsList.Len() - fmt.Println(numNodePools) + nodePoolsList.Range(func(index int, item *cmv1.NodePool) bool { + instanceName, _ := item.GetID() + if strings.Contains(instanceName, awName) { + foundNodePool = true + } + return true + }) - return numNodePools, nil + return foundNodePool, err } func MachineSetsCount() (numMachineSets int, err error) { diff --git a/test/support/support.go b/test/support/support.go index 1255baa8..8e0573a0 100644 --- a/test/support/support.go +++ b/test/support/support.go @@ -31,6 +31,7 @@ var ( ApplyOptions = metav1.ApplyOptions{FieldManager: "codeflare-test", Force: true} TestTimeoutShort = 1 * time.Minute + TestTimeoutThirtySeconds = 30 * time.Second TestTimeoutMedium = 2 * time.Minute TestTimeoutLong = 5 * time.Minute ) From 72e7f067ef772748da867aa81e6aaa68113620ab Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Mon, 25 Sep 2023 14:48:29 +0100 Subject: [PATCH 04/15] updating machine sets functionality --- test/e2e/instascale_test.go | 27 +++++++++++++++------------ test/support/clusterpools.go | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/test/e2e/instascale_test.go b/test/e2e/instascale_test.go index f9b6fd58..e8e77044 100644 --- a/test/e2e/instascale_test.go +++ b/test/e2e/instascale_test.go @@ -65,17 +65,18 @@ func TestInstascale(t *testing.T) { if machinePoolsExist { // look for machine pool with aw name - expect not to find it - foundMachinePool, err := CheckMachinePools(connection, "test-instascale") + foundMachinePool, err := CheckMachinePools(connection, TestName) test.Expect(err).NotTo(HaveOccurred()) test.Expect(foundMachinePool).To(BeFalse()) } else if nodePoolsExist { // look for node pool with aw name - expect not to find it - foundNodePool, err := CheckNodePools(connection, "test-instascale") + foundNodePool, err := CheckNodePools(connection, TestName) test.Expect(err).NotTo(HaveOccurred()) test.Expect(foundNodePool).To(BeFalse()) } else { - // TODO update to foundMachineSet - + foundMachineSet, err := CheckMachineSets(TestName) + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachineSet).To(BeFalse()) } // Batch Job @@ -197,17 +198,18 @@ func TestInstascale(t *testing.T) { if machinePoolsExist { // look for machine pool with aw name - expect to find it - foundMachinePool, err := CheckMachinePools(connection, "test-instascale") + foundMachinePool, err := CheckMachinePools(connection, TestName) test.Expect(err).NotTo(HaveOccurred()) test.Expect(foundMachinePool).To(BeTrue()) } else if nodePoolsExist { // look for node pool with aw name - expect to find it - foundNodePool, err := CheckNodePools(connection, "test-instascale") + foundNodePool, err := CheckNodePools(connection, TestName) test.Expect(err).NotTo(HaveOccurred()) test.Expect(foundNodePool).To(BeTrue()) } else { - // TODO update to foundMachineSet - + foundMachineSet, err := CheckMachineSets(TestName) + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachineSet).To(BeTrue()) } // Assert that the job has completed @@ -230,16 +232,17 @@ func TestInstascale(t *testing.T) { if machinePoolsExist { // look for machine pool with aw name - expect to find it - foundMachinePool, err := CheckMachinePools(connection, "test-instascale") + foundMachinePool, err := CheckMachinePools(connection, TestName) test.Expect(err).NotTo(HaveOccurred()) test.Expect(foundMachinePool).To(BeFalse()) } else if nodePoolsExist { // look for node pool with aw name - expect to find it - foundNodePool, err := CheckNodePools(connection, "test-instascale") + foundNodePool, err := CheckNodePools(connection, TestName) test.Expect(err).NotTo(HaveOccurred()) test.Expect(foundNodePool).To(BeFalse()) } else { - // TODO update to foundMachineSet - + foundMachineSet, err := CheckMachineSets(TestName) + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachineSet).To(BeFalse()) } } diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index c0f45dee..dc89765e 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -9,12 +9,16 @@ import ( ocmsdk "github.com/openshift-online/ocm-sdk-go" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" mapiclientset "github.com/openshift/client-go/machine/clientset/versioned" + "github.com/openshift/client-go/machine/listers/machine/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" ) var ( ClusterID string = os.Getenv("CLUSTERID") machineClient mapiclientset.Interface + msLister v1beta1.MachineSetLister + TestName string = "test-instascale" ) const ( @@ -96,3 +100,16 @@ func MachineSetsCount() (numMachineSets int, err error) { return machineSetsSize, nil } + +func CheckMachineSets(awName string) (foundMachineSet bool, err error) { + machineSets, err := msLister.MachineSets("").List(labels.Everything()) + if err != nil { + return false, fmt.Errorf("error listing machine sets, error: %v", err) + } + for _, machineSet := range machineSets { + if strings.Contains(machineSet.Name, awName) { + foundMachineSet = true + } + } + return foundMachineSet, err +} From 5d193c7a65f49dd47ff02e904b4faa5ad5138cfc Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Tue, 26 Sep 2023 10:16:08 +0100 Subject: [PATCH 05/15] update vendor files --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index c1caf36c..f8994c99 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.19 require ( github.com/onsi/gomega v1.27.10 + github.com/openshift-online/ocm-sdk-go v0.1.368 github.com/openshift/api v0.0.0-20230213134911-7ba313770556 github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c github.com/project-codeflare/instascale v0.0.9 @@ -69,7 +70,6 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/openshift-online/ocm-sdk-go v0.1.327 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.14.0 // indirect github.com/prometheus/client_model v0.3.0 // indirect diff --git a/go.sum b/go.sum index 2c06b07d..e6b42178 100644 --- a/go.sum +++ b/go.sum @@ -377,8 +377,8 @@ github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro= github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI= github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= -github.com/openshift-online/ocm-sdk-go v0.1.327 h1:WR822bGdQoMuZ2+dFdhZz3fpD2NlJhGr+F3FJPXvqFU= -github.com/openshift-online/ocm-sdk-go v0.1.327/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw= +github.com/openshift-online/ocm-sdk-go v0.1.368 h1:qP+gkChV8WDwwpkUw1xUyjTXKdvrwyd70Gff2GMUSeU= +github.com/openshift-online/ocm-sdk-go v0.1.368/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw= github.com/openshift/api v0.0.0-20230213134911-7ba313770556 h1:7W2fOhJicyEff24VaF7ASNzPtYvr+iSCVft4SIBAzaE= github.com/openshift/api v0.0.0-20230213134911-7ba313770556/go.mod h1:aQ6LDasvHMvHZXqLHnX2GRmnfTWCF/iIwz8EMTTIE9A= github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c h1:CV76yFOTXmq9VciBR3Bve5ZWzSxdft7gaMVB3kS0rwg= From 1e4a3f232f94355daaef3614ebc00aa21175fba3 Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Wed, 27 Sep 2023 14:17:36 +0100 Subject: [PATCH 06/15] refactored test just for machine pools --- ...test.go => instascale_machinepool_test.go} | 82 ++++++------------- 1 file changed, 25 insertions(+), 57 deletions(-) rename test/e2e/{instascale_test.go => instascale_machinepool_test.go} (74%) diff --git a/test/e2e/instascale_test.go b/test/e2e/instascale_machinepool_test.go similarity index 74% rename from test/e2e/instascale_test.go rename to test/e2e/instascale_machinepool_test.go index e8e77044..44c07c81 100644 --- a/test/e2e/instascale_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -5,15 +5,17 @@ import ( "time" . "github.com/onsi/gomega" - . "github.com/project-codeflare/codeflare-operator/test/support" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/support" ) -func TestInstascale(t *testing.T) { +func TestInstascaleMachinePool(t *testing.T) { test := With(t) test.T().Parallel() @@ -38,11 +40,12 @@ func TestInstascale(t *testing.T) { }, Immutable: Ptr(true), } + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) - // create OCM connection + //create OCM connection instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets("default").Get(test.Ctx(), "instascale-ocm-secret", metav1.GetOptions{}) if err != nil { test.T().Errorf("unable to retrieve instascale-ocm-secret - Error : %v", err) @@ -57,27 +60,11 @@ func TestInstascale(t *testing.T) { } defer connection.Close() - // check existing cluster resources - machinePoolsExist, err := MachinePoolsExist(connection) - test.Expect(err).NotTo(HaveOccurred()) - nodePoolsExist, err := NodePoolsExist(connection) + // check existing cluster machine pool resources + // look for machine pool with aw name - expect not to find it + foundMachinePool, err := CheckMachinePools(connection, TestName) test.Expect(err).NotTo(HaveOccurred()) - - if machinePoolsExist { - // look for machine pool with aw name - expect not to find it - foundMachinePool, err := CheckMachinePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachinePool).To(BeFalse()) - } else if nodePoolsExist { - // look for node pool with aw name - expect not to find it - foundNodePool, err := CheckNodePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundNodePool).To(BeFalse()) - } else { - foundMachineSet, err := CheckMachineSets(TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachineSet).To(BeFalse()) - } + test.Expect(foundMachinePool).To(BeFalse()) // Batch Job job := &batchv1.Job{ @@ -159,10 +146,12 @@ func TestInstascale(t *testing.T) { Requests: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("250m"), corev1.ResourceMemory: resource.MustParse("512Mi"), + "nvidia.com/gpu": resource.MustParse("1"), }, Limits: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("500m"), corev1.ResourceMemory: resource.MustParse("1G"), + "nvidia.com/gpu": resource.MustParse("1"), }, }, { @@ -194,23 +183,12 @@ func TestInstascale(t *testing.T) { // time.Sleep is used twice throughout the test, each for 30 seconds. Can look into using sync package waitGroup instead if that makes more sense // wait for required resources to scale up before checking them again - time.Sleep(TestTimeoutThirtySeconds) - - if machinePoolsExist { - // look for machine pool with aw name - expect to find it - foundMachinePool, err := CheckMachinePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachinePool).To(BeTrue()) - } else if nodePoolsExist { - // look for node pool with aw name - expect to find it - foundNodePool, err := CheckNodePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundNodePool).To(BeTrue()) - } else { - foundMachineSet, err := CheckMachineSets(TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachineSet).To(BeTrue()) - } + time.Sleep(TestTimeoutMedium) + + // look for machine pool with aw name - expect to find it + foundMachinePool, err = CheckMachinePools(connection, TestName) + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachinePool).To(BeTrue()) // Assert that the job has completed test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) @@ -228,21 +206,11 @@ func TestInstascale(t *testing.T) { Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted))) // allow time for the resources to scale down before checking them again - time.Sleep(TestTimeoutThirtySeconds) - - if machinePoolsExist { - // look for machine pool with aw name - expect to find it - foundMachinePool, err := CheckMachinePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachinePool).To(BeFalse()) - } else if nodePoolsExist { - // look for node pool with aw name - expect to find it - foundNodePool, err := CheckNodePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundNodePool).To(BeFalse()) - } else { - foundMachineSet, err := CheckMachineSets(TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachineSet).To(BeFalse()) - } + time.Sleep(TestTimeoutMedium) + + // look for machine pool with aw name - expect not to find it + foundMachinePool, err = CheckMachinePools(connection, TestName) + test.Expect(err).NotTo(HaveOccurred()) + test.Expect(foundMachinePool).To(BeFalse()) + } From 88d1cebe49d1eae3cc3a13b467916ad6d63fb6ca Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Thu, 28 Sep 2023 10:12:15 +0100 Subject: [PATCH 07/15] moving common instascale test functions to separate file --- test/e2e/instascale.go | 174 ++++++++++++++++++++++++ test/e2e/instascale_machinepool_test.go | 157 +-------------------- 2 files changed, 181 insertions(+), 150 deletions(-) create mode 100644 test/e2e/instascale.go diff --git a/test/e2e/instascale.go b/test/e2e/instascale.go new file mode 100644 index 00000000..aa143f6b --- /dev/null +++ b/test/e2e/instascale.go @@ -0,0 +1,174 @@ +package e2e + +import ( + . "github.com/onsi/gomega" + ocmsdk "github.com/openshift-online/ocm-sdk-go" + . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestConfig(test Test, namespace string) (*corev1.ConfigMap, error) { + // Test configuration + configMap := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist-mcad", + Namespace: namespace, + }, + BinaryData: map[string][]byte{ + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + }, + Immutable: Ptr(true), + } + + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace).Create(test.Ctx(), configMap, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) + + return configMap, err +} + +func CreateConnection(test Test) (*ocmsdk.Connection, error) { + instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets("default").Get(test.Ctx(), "instascale-ocm-secret", metav1.GetOptions{}) + if err != nil { + test.T().Errorf("unable to retrieve instascale-ocm-secret - Error : %v", err) + } + test.Expect(err).NotTo(HaveOccurred()) + ocmToken := string(instascaleOCMSecret.Data["token"]) + test.T().Logf("Retrieved Secret %s successfully", instascaleOCMSecret.Name) + + connection, err := CreateOCMConnection(ocmToken) + if err != nil { + test.T().Errorf("Unable to create ocm connection - Error : %v", err) + } + return connection, err +} + +func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.ConfigMap) (*batchv1.Job, *mcadv1beta1.AppWrapper, error) { + // Batch Job + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "job", + Image: GetPyTorchImage(), + Env: []corev1.EnvVar{ + corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/test2"}, + }, + Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, + Args: []string{"$PYTHONUSERBASE"}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + { + Name: "test2", + MountPath: "/test2", + }, + }, + WorkingDir: "/test2", + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + { + Name: "test2", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + } + + // create an appwrapper + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-instascale", + Namespace: namespace.Name, + Labels: map[string]string{ + "orderedinstance": "m5.xlarge_g4dn.xlarge", + }, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + "nvidia.com/gpu": resource.MustParse("1"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + "nvidia.com/gpu": resource.MustParse("1"), + }, + }, + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + GenericTemplate: Raw(test, job), + CompletionStatus: "Complete", + }, + }, + }, + }, + } + + _, err := test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) + + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + return job, aw, err +} diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go index 44c07c81..4e5ece24 100644 --- a/test/e2e/instascale_machinepool_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -6,12 +6,8 @@ import ( . "github.com/onsi/gomega" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - . "github.com/project-codeflare/codeflare-operator/test/support" ) @@ -23,41 +19,13 @@ func TestInstascaleMachinePool(t *testing.T) { namespace := test.NewTestNamespace() // Test configuration - config := &corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "mnist-mcad", - Namespace: namespace.Name, - }, - BinaryData: map[string][]byte{ - // pip requirements - "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), - // MNIST training script - "mnist.py": ReadFile(test, "mnist.py"), - }, - Immutable: Ptr(true), - } - - config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) + config, err := TestConfig(test, namespace.Name) + test.Expect(err).To(BeNil()) //create OCM connection - instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets("default").Get(test.Ctx(), "instascale-ocm-secret", metav1.GetOptions{}) - if err != nil { - test.T().Errorf("unable to retrieve instascale-ocm-secret - Error : %v", err) - } - test.Expect(err).NotTo(HaveOccurred()) - ocmToken := string(instascaleOCMSecret.Data["token"]) - test.T().Logf("Retrieved Secret %s successfully", instascaleOCMSecret.Name) + connection, err := CreateConnection(test) + test.Expect(err).To(BeNil()) - connection, err := CreateOCMConnection(ocmToken) - if err != nil { - test.T().Errorf("Unable to create ocm connection - Error : %v", err) - } defer connection.Close() // check existing cluster machine pool resources @@ -66,120 +34,9 @@ func TestInstascaleMachinePool(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.Expect(foundMachinePool).To(BeFalse()) - // Batch Job - job := &batchv1.Job{ - TypeMeta: metav1.TypeMeta{ - APIVersion: batchv1.SchemeGroupVersion.String(), - Kind: "Job", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "mnist", - Namespace: namespace.Name, - }, - Spec: batchv1.JobSpec{ - Completions: Ptr(int32(1)), - Parallelism: Ptr(int32(1)), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "job", - Image: GetPyTorchImage(), - Env: []corev1.EnvVar{ - corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/test2"}, - }, - Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, - Args: []string{"$PYTHONUSERBASE"}, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "test", - MountPath: "/test", - }, - { - Name: "test2", - MountPath: "/test2", - }, - }, - WorkingDir: "/test2", - }, - }, - Volumes: []corev1.Volume{ - { - Name: "test", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: config.Name, - }, - }, - }, - }, - { - Name: "test2", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - }, - RestartPolicy: corev1.RestartPolicyNever, - }, - }, - }, - } - - // create an appwrapper - aw := &mcadv1beta1.AppWrapper{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-instascale", - Namespace: namespace.Name, - Labels: map[string]string{ - "orderedinstance": "m5.xlarge_g4dn.xlarge", - }, - }, - Spec: mcadv1beta1.AppWrapperSpec{ - AggrResources: mcadv1beta1.AppWrapperResourceList{ - GenericItems: []mcadv1beta1.AppWrapperGenericResource{ - { - CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ - { - Replicas: 1, - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("250m"), - corev1.ResourceMemory: resource.MustParse("512Mi"), - "nvidia.com/gpu": resource.MustParse("1"), - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1G"), - "nvidia.com/gpu": resource.MustParse("1"), - }, - }, - { - Replicas: 1, - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("250m"), - corev1.ResourceMemory: resource.MustParse("512Mi"), - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("500m"), - corev1.ResourceMemory: resource.MustParse("1G"), - }, - }, - }, - GenericTemplate: Raw(test, job), - CompletionStatus: "Complete", - }, - }, - }, - }, - } - - _, err = test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) - - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). - Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + // Setup batch job and AppWrapper + job, aw, err := JobAppwrapperSetup(test, namespace, config) + test.Expect(err).To(BeNil()) // time.Sleep is used twice throughout the test, each for 30 seconds. Can look into using sync package waitGroup instead if that makes more sense // wait for required resources to scale up before checking them again From c133dbab0ccb5e34c733860fdb08ebcfb3446d47 Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Mon, 2 Oct 2023 10:21:27 +0100 Subject: [PATCH 08/15] addressing feedback --- test/e2e/instascale.go | 8 ++--- test/e2e/instascale_machinepool_test.go | 6 ++-- test/support/clusterpools.go | 39 ++----------------------- test/support/support.go | 1 - 4 files changed, 9 insertions(+), 45 deletions(-) diff --git a/test/e2e/instascale.go b/test/e2e/instascale.go index aa143f6b..5c70576e 100644 --- a/test/e2e/instascale.go +++ b/test/e2e/instascale.go @@ -85,11 +85,11 @@ func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.C MountPath: "/test", }, { - Name: "test2", - MountPath: "/test2", + Name: "workdir", + MountPath: "/workdir", }, }, - WorkingDir: "/test2", + WorkingDir: "workdir", }, }, Volumes: []corev1.Volume{ @@ -104,7 +104,7 @@ func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.C }, }, { - Name: "test2", + Name: "workdir", VolumeSource: corev1.VolumeSource{ EmptyDir: &corev1.EmptyDirVolumeSource{}, }, diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go index 4e5ece24..b60d583e 100644 --- a/test/e2e/instascale_machinepool_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -20,11 +20,11 @@ func TestInstascaleMachinePool(t *testing.T) { // Test configuration config, err := TestConfig(test, namespace.Name) - test.Expect(err).To(BeNil()) + test.Expect(err).NotTo(HaveOccurred()) //create OCM connection connection, err := CreateConnection(test) - test.Expect(err).To(BeNil()) + test.Expect(err).NotTo(HaveOccurred()) defer connection.Close() @@ -36,7 +36,7 @@ func TestInstascaleMachinePool(t *testing.T) { // Setup batch job and AppWrapper job, aw, err := JobAppwrapperSetup(test, namespace, config) - test.Expect(err).To(BeNil()) + test.Expect(err).NotTo(HaveOccurred()) // time.Sleep is used twice throughout the test, each for 30 seconds. Can look into using sync package waitGroup instead if that makes more sense // wait for required resources to scale up before checking them again diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index dc89765e..56db165a 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -10,8 +10,6 @@ import ( cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" mapiclientset "github.com/openshift/client-go/machine/clientset/versioned" "github.com/openshift/client-go/machine/listers/machine/v1beta1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" ) var ( @@ -31,7 +29,7 @@ func CreateOCMConnection(secret string) (*ocmsdk.Connection, error) { Build() if err != nil { fmt.Fprintf(os.Stderr, "Can't build logger: %v\n", err) - os.Exit(1) + return nil, err } connection, err := ocmsdk.NewConnectionBuilder(). Logger(logger). @@ -39,22 +37,12 @@ func CreateOCMConnection(secret string) (*ocmsdk.Connection, error) { Build() if err != nil || connection == nil { fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err) - os.Exit(1) + return nil, err } return connection, nil } -func MachinePoolsExist(connection *ocmsdk.Connection) (bool, error) { - machinePools := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools() - return machinePools != nil, nil -} - -func NodePoolsExist(connection *ocmsdk.Connection) (bool, error) { - nodePools := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).NodePools() - return nodePools != nil, nil -} - func CheckMachinePools(connection *ocmsdk.Connection, awName string) (foundMachinePool bool, err error) { machinePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List() machinePoolsListResponse, err := machinePoolsConnection.Send() @@ -90,26 +78,3 @@ func CheckNodePools(connection *ocmsdk.Connection, awName string) (foundNodePool return foundNodePool, err } - -func MachineSetsCount() (numMachineSets int, err error) { - machineSets, err := machineClient.MachineV1beta1().MachineSets(namespaceToList).List(context.Background(), metav1.ListOptions{}) - if err != nil { - return 0, fmt.Errorf("error while listing machine sets, error: %v", err) - } - machineSetsSize := machineSets.ListMeta.Size() - - return machineSetsSize, nil -} - -func CheckMachineSets(awName string) (foundMachineSet bool, err error) { - machineSets, err := msLister.MachineSets("").List(labels.Everything()) - if err != nil { - return false, fmt.Errorf("error listing machine sets, error: %v", err) - } - for _, machineSet := range machineSets { - if strings.Contains(machineSet.Name, awName) { - foundMachineSet = true - } - } - return foundMachineSet, err -} diff --git a/test/support/support.go b/test/support/support.go index 8e0573a0..1255baa8 100644 --- a/test/support/support.go +++ b/test/support/support.go @@ -31,7 +31,6 @@ var ( ApplyOptions = metav1.ApplyOptions{FieldManager: "codeflare-test", Force: true} TestTimeoutShort = 1 * time.Minute - TestTimeoutThirtySeconds = 30 * time.Second TestTimeoutMedium = 2 * time.Minute TestTimeoutLong = 5 * time.Minute ) From 348bd0046e72cced2fcd76eaadacfb4de6522c97 Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Mon, 2 Oct 2023 10:29:06 +0100 Subject: [PATCH 09/15] fixing PR checks --- test/e2e/instascale.go | 7 +++++-- test/e2e/instascale_machinepool_test.go | 2 ++ test/support/clusterpools.go | 12 ++---------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/test/e2e/instascale.go b/test/e2e/instascale.go index 5c70576e..3fccf0d5 100644 --- a/test/e2e/instascale.go +++ b/test/e2e/instascale.go @@ -2,13 +2,16 @@ package e2e import ( . "github.com/onsi/gomega" - ocmsdk "github.com/openshift-online/ocm-sdk-go" - . "github.com/project-codeflare/codeflare-operator/test/support" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + ocmsdk "github.com/openshift-online/ocm-sdk-go" + + . "github.com/project-codeflare/codeflare-operator/test/support" ) func TestConfig(test Test, namespace string) (*corev1.ConfigMap, error) { diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go index b60d583e..3a756dd7 100644 --- a/test/e2e/instascale_machinepool_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -6,8 +6,10 @@ import ( . "github.com/onsi/gomega" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + . "github.com/project-codeflare/codeflare-operator/test/support" ) diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index 56db165a..9a389aea 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -8,19 +8,11 @@ import ( ocmsdk "github.com/openshift-online/ocm-sdk-go" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" - mapiclientset "github.com/openshift/client-go/machine/clientset/versioned" - "github.com/openshift/client-go/machine/listers/machine/v1beta1" ) var ( - ClusterID string = os.Getenv("CLUSTERID") - machineClient mapiclientset.Interface - msLister v1beta1.MachineSetLister - TestName string = "test-instascale" -) - -const ( - namespaceToList = "openshift-machine-api" + ClusterID string = os.Getenv("CLUSTERID") + TestName string = "test-instascale" ) func CreateOCMConnection(secret string) (*ocmsdk.Connection, error) { From e7a93b6b5f25c67597be21ec61db33347e549982 Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Mon, 2 Oct 2023 13:40:02 +0200 Subject: [PATCH 10/15] Refactor Machine pool functions --- ...nstascale.go => instascale_app_wrapper.go} | 55 ++----------------- test/e2e/instascale_machinepool_test.go | 37 +++++-------- test/support/clusterpools.go | 44 ++++----------- test/support/codeflare.go | 14 ++++- test/support/config_map.go | 44 +++++++++++++++ test/support/ocm.go | 50 +++++++++++++++++ test/support/support.go | 7 ++- 7 files changed, 143 insertions(+), 108 deletions(-) rename test/e2e/{instascale.go => instascale_app_wrapper.go} (66%) create mode 100644 test/support/config_map.go create mode 100644 test/support/ocm.go diff --git a/test/e2e/instascale.go b/test/e2e/instascale_app_wrapper.go similarity index 66% rename from test/e2e/instascale.go rename to test/e2e/instascale_app_wrapper.go index 3fccf0d5..d5f14efe 100644 --- a/test/e2e/instascale.go +++ b/test/e2e/instascale_app_wrapper.go @@ -9,55 +9,10 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - ocmsdk "github.com/openshift-online/ocm-sdk-go" - . "github.com/project-codeflare/codeflare-operator/test/support" ) -func TestConfig(test Test, namespace string) (*corev1.ConfigMap, error) { - // Test configuration - configMap := &corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "mnist-mcad", - Namespace: namespace, - }, - BinaryData: map[string][]byte{ - // pip requirements - "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), - // MNIST training script - "mnist.py": ReadFile(test, "mnist.py"), - }, - Immutable: Ptr(true), - } - - config, err := test.Client().Core().CoreV1().ConfigMaps(namespace).Create(test.Ctx(), configMap, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) - - return configMap, err -} - -func CreateConnection(test Test) (*ocmsdk.Connection, error) { - instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets("default").Get(test.Ctx(), "instascale-ocm-secret", metav1.GetOptions{}) - if err != nil { - test.T().Errorf("unable to retrieve instascale-ocm-secret - Error : %v", err) - } - test.Expect(err).NotTo(HaveOccurred()) - ocmToken := string(instascaleOCMSecret.Data["token"]) - test.T().Logf("Retrieved Secret %s successfully", instascaleOCMSecret.Name) - - connection, err := CreateOCMConnection(ocmToken) - if err != nil { - test.T().Errorf("Unable to create ocm connection - Error : %v", err) - } - return connection, err -} - -func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.ConfigMap) (*batchv1.Job, *mcadv1beta1.AppWrapper, error) { +func createInstaScaleJobAppWrapper(test Test, namespace *corev1.Namespace, config *corev1.ConfigMap) (*batchv1.Job, *mcadv1beta1.AppWrapper, error) { // Batch Job job := &batchv1.Job{ TypeMeta: metav1.TypeMeta{ @@ -78,7 +33,7 @@ func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.C Name: "job", Image: GetPyTorchImage(), Env: []corev1.EnvVar{ - corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/test2"}, + {Name: "PYTHONUSERBASE", Value: "/workdir"}, }, Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, Args: []string{"$PYTHONUSERBASE"}, @@ -92,7 +47,7 @@ func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.C MountPath: "/workdir", }, }, - WorkingDir: "workdir", + WorkingDir: "/workdir", }, }, Volumes: []corev1.Volume{ @@ -125,7 +80,7 @@ func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.C Name: "test-instascale", Namespace: namespace.Name, Labels: map[string]string{ - "orderedinstance": "m5.xlarge_g4dn.xlarge", + "orderedinstance": "g4dn.xlarge", }, }, Spec: mcadv1beta1.AppWrapperSpec{ @@ -170,7 +125,7 @@ func JobAppwrapperSetup(test Test, namespace *corev1.Namespace, config *corev1.C test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutGpuProvisioning). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) return job, aw, err diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go index 3a756dd7..ad4143cf 100644 --- a/test/e2e/instascale_machinepool_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -2,7 +2,6 @@ package e2e import ( "testing" - "time" . "github.com/onsi/gomega" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" @@ -21,33 +20,31 @@ func TestInstascaleMachinePool(t *testing.T) { namespace := test.NewTestNamespace() // Test configuration - config, err := TestConfig(test, namespace.Name) - test.Expect(err).NotTo(HaveOccurred()) + testConfigData := map[string][]byte{ + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + } + cm := CreateConfigMap(test, namespace.Name, testConfigData) //create OCM connection - connection, err := CreateConnection(test) - test.Expect(err).NotTo(HaveOccurred()) + connection := CreateOCMConnection(test) defer connection.Close() // check existing cluster machine pool resources // look for machine pool with aw name - expect not to find it - foundMachinePool, err := CheckMachinePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachinePool).To(BeFalse()) + test.Expect(GetMachinePools(test, connection)). + ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) // Setup batch job and AppWrapper - job, aw, err := JobAppwrapperSetup(test, namespace, config) + job, aw, err := createInstaScaleJobAppWrapper(test, namespace, cm) test.Expect(err).NotTo(HaveOccurred()) - // time.Sleep is used twice throughout the test, each for 30 seconds. Can look into using sync package waitGroup instead if that makes more sense - // wait for required resources to scale up before checking them again - time.Sleep(TestTimeoutMedium) - // look for machine pool with aw name - expect to find it - foundMachinePool, err = CheckMachinePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachinePool).To(BeTrue()) + test.Eventually(MachinePools(test, connection), TestTimeoutLong). + Should(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) // Assert that the job has completed test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) @@ -64,12 +61,8 @@ func TestInstascaleMachinePool(t *testing.T) { test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted))) - // allow time for the resources to scale down before checking them again - time.Sleep(TestTimeoutMedium) - // look for machine pool with aw name - expect not to find it - foundMachinePool, err = CheckMachinePools(connection, TestName) - test.Expect(err).NotTo(HaveOccurred()) - test.Expect(foundMachinePool).To(BeFalse()) + test.Eventually(MachinePools(test, connection), TestTimeoutLong). + ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) } diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index 9a389aea..1bd1036f 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -6,6 +6,7 @@ import ( "os" "strings" + "github.com/onsi/gomega" ocmsdk "github.com/openshift-online/ocm-sdk-go" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" ) @@ -15,42 +16,21 @@ var ( TestName string = "test-instascale" ) -func CreateOCMConnection(secret string) (*ocmsdk.Connection, error) { - logger, err := ocmsdk.NewGoLoggerBuilder(). - Debug(false). - Build() - if err != nil { - fmt.Fprintf(os.Stderr, "Can't build logger: %v\n", err) - return nil, err - } - connection, err := ocmsdk.NewConnectionBuilder(). - Logger(logger). - Tokens(string(secret)). - Build() - if err != nil || connection == nil { - fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err) - return nil, err +func MachinePools(t Test, connection *ocmsdk.Connection) func(g gomega.Gomega) []*cmv1.MachinePool { + return func(g gomega.Gomega) []*cmv1.MachinePool { + machinePoolsListResponse, err := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List().Send() + g.Expect(err).NotTo(gomega.HaveOccurred()) + return machinePoolsListResponse.Items().Slice() } - - return connection, nil } -func CheckMachinePools(connection *ocmsdk.Connection, awName string) (foundMachinePool bool, err error) { - machinePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List() - machinePoolsListResponse, err := machinePoolsConnection.Send() - if err != nil { - return false, fmt.Errorf("unable to send request, error: %v", err) - } - machinePoolsList := machinePoolsListResponse.Items() - machinePoolsList.Range(func(index int, item *cmv1.MachinePool) bool { - instanceName, _ := item.GetID() - if strings.Contains(instanceName, awName) { - foundMachinePool = true - } - return true - }) +func GetMachinePools(t Test, connection *ocmsdk.Connection) []*cmv1.MachinePool { + t.T().Helper() + return MachinePools(t, connection)(t) +} - return foundMachinePool, err +func MachinePoolId(machinePool *cmv1.MachinePool) string { + return machinePool.ID() } func CheckNodePools(connection *ocmsdk.Connection, awName string) (foundNodePool bool, err error) { diff --git a/test/support/codeflare.go b/test/support/codeflare.go index 04b1f3e9..c378c4e6 100644 --- a/test/support/codeflare.go +++ b/test/support/codeflare.go @@ -30,8 +30,12 @@ const ( CodeFlareTestPyTorchImage = "CODEFLARE_TEST_PYTORCH_IMAGE" // The testing output directory, to write output files into. - CodeFlareTestOutputDir = "CODEFLARE_TEST_OUTPUT_DIR" + + // The name of a secret containing InstaScale OCM token. + InstaScaleOcmSecretName = "INSTASCALE_OCM_SECRET_NAME" + // The namespace where a secret containing InstaScale OCM token is stored. + InstaScaleOcmSecretNamespace = "INSTASCALE_OCM_SECRET_NAMESPACE" ) func GetCodeFlareSDKVersion() string { @@ -50,6 +54,14 @@ func GetPyTorchImage() string { return lookupEnvOrDefault(CodeFlareTestPyTorchImage, "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime") } +func GetInstaScaleOcmSecretName() string { + return lookupEnvOrDefault(InstaScaleOcmSecretName, "instascale-ocm-secret") +} + +func GetInstaScaleOcmSecretNamespace() string { + return lookupEnvOrDefault(InstaScaleOcmSecretNamespace, "default") +} + func lookupEnvOrDefault(key, value string) string { if v, ok := os.LookupEnv(key); ok { return v diff --git a/test/support/config_map.go b/test/support/config_map.go new file mode 100644 index 00000000..8846bec2 --- /dev/null +++ b/test/support/config_map.go @@ -0,0 +1,44 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package support + +import ( + "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func CreateConfigMap(t Test, namespace string, content map[string][]byte) *corev1.ConfigMap { + configMap := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "config-", + Namespace: namespace, + }, + BinaryData: content, + Immutable: Ptr(true), + } + + configMap, err := t.Client().Core().CoreV1().ConfigMaps(namespace).Create(t.Ctx(), configMap, metav1.CreateOptions{}) + t.Expect(err).NotTo(gomega.HaveOccurred()) + t.T().Logf("Created ConfigMap %s/%s successfully", configMap.Namespace, configMap.Name) + + return configMap +} diff --git a/test/support/ocm.go b/test/support/ocm.go new file mode 100644 index 00000000..235143ba --- /dev/null +++ b/test/support/ocm.go @@ -0,0 +1,50 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package support + +import ( + "fmt" + "os" + + "github.com/onsi/gomega" + ocmsdk "github.com/openshift-online/ocm-sdk-go" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func CreateOCMConnection(test Test) *ocmsdk.Connection { + instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets(GetInstaScaleOcmSecretNamespace()).Get(test.Ctx(), GetInstaScaleOcmSecretName(), metav1.GetOptions{}) + test.Expect(err).NotTo(gomega.HaveOccurred()) + + ocmToken := string(instascaleOCMSecret.Data["token"]) + test.T().Logf("Retrieved Secret %s/%s successfully", instascaleOCMSecret.Namespace, instascaleOCMSecret.Name) + + connection, err := buildOCMConnection(ocmToken) + test.Expect(err).NotTo(gomega.HaveOccurred()) + return connection +} + +func buildOCMConnection(secret string) (*ocmsdk.Connection, error) { + connection, err := ocmsdk.NewConnectionBuilder(). + Tokens(secret). + Build() + if err != nil || connection == nil { + fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err) + return nil, err + } + + return connection, nil +} diff --git a/test/support/support.go b/test/support/support.go index 1255baa8..0091b3bf 100644 --- a/test/support/support.go +++ b/test/support/support.go @@ -30,9 +30,10 @@ import ( var ( ApplyOptions = metav1.ApplyOptions{FieldManager: "codeflare-test", Force: true} - TestTimeoutShort = 1 * time.Minute - TestTimeoutMedium = 2 * time.Minute - TestTimeoutLong = 5 * time.Minute + TestTimeoutShort = 1 * time.Minute + TestTimeoutMedium = 2 * time.Minute + TestTimeoutLong = 5 * time.Minute + TestTimeoutGpuProvisioning = 30 * time.Minute ) func init() { From ced9b72055f65ea3da2e95e7bfb79b2d796034d1 Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Tue, 3 Oct 2023 10:08:05 +0200 Subject: [PATCH 11/15] Move OSD cluster ID into codeflare.go --- test/support/clusterpools.go | 34 +++++----------------------------- test/support/codeflare.go | 6 ++++++ test/support/config_map.go | 1 + test/support/ocm.go | 4 +++- 4 files changed, 15 insertions(+), 30 deletions(-) diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index 1bd1036f..9b1f7848 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -1,24 +1,18 @@ package support import ( - "context" - "fmt" - "os" - "strings" - "github.com/onsi/gomega" + ocmsdk "github.com/openshift-online/ocm-sdk-go" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" ) -var ( - ClusterID string = os.Getenv("CLUSTERID") - TestName string = "test-instascale" -) - func MachinePools(t Test, connection *ocmsdk.Connection) func(g gomega.Gomega) []*cmv1.MachinePool { + osdClusterId, found := GetOsdClusterId() + t.Expect(found).To(gomega.BeTrue(), "OSD cluster id not found, please configure environment properly") + return func(g gomega.Gomega) []*cmv1.MachinePool { - machinePoolsListResponse, err := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).MachinePools().List().Send() + machinePoolsListResponse, err := connection.ClustersMgmt().V1().Clusters().Cluster(osdClusterId).MachinePools().List().Send() g.Expect(err).NotTo(gomega.HaveOccurred()) return machinePoolsListResponse.Items().Slice() } @@ -32,21 +26,3 @@ func GetMachinePools(t Test, connection *ocmsdk.Connection) []*cmv1.MachinePool func MachinePoolId(machinePool *cmv1.MachinePool) string { return machinePool.ID() } - -func CheckNodePools(connection *ocmsdk.Connection, awName string) (foundNodePool bool, err error) { - nodePoolsConnection := connection.ClustersMgmt().V1().Clusters().Cluster(ClusterID).NodePools().List() - nodePoolsListResponse, err := nodePoolsConnection.SendContext(context.Background()) - if err != nil { - return false, fmt.Errorf("unable to send request, error: %v", err) - } - nodePoolsList := nodePoolsListResponse.Items() - nodePoolsList.Range(func(index int, item *cmv1.NodePool) bool { - instanceName, _ := item.GetID() - if strings.Contains(instanceName, awName) { - foundNodePool = true - } - return true - }) - - return foundNodePool, err -} diff --git a/test/support/codeflare.go b/test/support/codeflare.go index c378c4e6..3772ce44 100644 --- a/test/support/codeflare.go +++ b/test/support/codeflare.go @@ -36,6 +36,8 @@ const ( InstaScaleOcmSecretName = "INSTASCALE_OCM_SECRET_NAME" // The namespace where a secret containing InstaScale OCM token is stored. InstaScaleOcmSecretNamespace = "INSTASCALE_OCM_SECRET_NAMESPACE" + // Cluster ID for OSD cluster used in tests, used for testing InstaScale + OsdClusterID = "CLUSTERID" ) func GetCodeFlareSDKVersion() string { @@ -62,6 +64,10 @@ func GetInstaScaleOcmSecretNamespace() string { return lookupEnvOrDefault(InstaScaleOcmSecretNamespace, "default") } +func GetOsdClusterId() (string, bool) { + return os.LookupEnv(OsdClusterID) +} + func lookupEnvOrDefault(key, value string) string { if v, ok := os.LookupEnv(key); ok { return v diff --git a/test/support/config_map.go b/test/support/config_map.go index 8846bec2..392da42b 100644 --- a/test/support/config_map.go +++ b/test/support/config_map.go @@ -18,6 +18,7 @@ package support import ( "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) diff --git a/test/support/ocm.go b/test/support/ocm.go index 235143ba..ff4d0da4 100644 --- a/test/support/ocm.go +++ b/test/support/ocm.go @@ -21,8 +21,10 @@ import ( "os" "github.com/onsi/gomega" - ocmsdk "github.com/openshift-online/ocm-sdk-go" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + ocmsdk "github.com/openshift-online/ocm-sdk-go" ) func CreateOCMConnection(test Test) *ocmsdk.Connection { From 868ff121948072961a5e19ab33919a37e57d330d Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Wed, 4 Oct 2023 16:45:55 +0100 Subject: [PATCH 12/15] addressing feedback --- .github/workflows/e2e_tests.yaml | 1 + test/support/ocm.go | 2 +- test/support/support.go | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 42ee0b73..759932e9 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -84,6 +84,7 @@ jobs: export CODEFLARE_TEST_TIMEOUT_SHORT=1m export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m export CODEFLARE_TEST_TIMEOUT_LONG=10m + export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV diff --git a/test/support/ocm.go b/test/support/ocm.go index ff4d0da4..91738fbc 100644 --- a/test/support/ocm.go +++ b/test/support/ocm.go @@ -43,7 +43,7 @@ func buildOCMConnection(secret string) (*ocmsdk.Connection, error) { connection, err := ocmsdk.NewConnectionBuilder(). Tokens(secret). Build() - if err != nil || connection == nil { + if err != nil { fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err) return nil, err } diff --git a/test/support/support.go b/test/support/support.go index 0091b3bf..36c8c9be 100644 --- a/test/support/support.go +++ b/test/support/support.go @@ -58,6 +58,13 @@ func init() { fmt.Printf("Error parsing CODEFLARE_TEST_TIMEOUT_LONG. Using default value: %s", TestTimeoutLong) } } + if value, ok := os.LookupEnv("CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING"); ok { + if duration, err := time.ParseDuration(value); err == nil { + TestTimeoutGpuProvisioning = duration + } else { + fmt.Printf("Error parsing CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING. Using default value: %s", TestTimeoutGpuProvisioning) + } + } // Gomega settings gomega.SetDefaultEventuallyTimeout(TestTimeoutShort) From 3bdd35c00d7bfe0b3165ac96e4ac0741c16f04f0 Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Thu, 5 Oct 2023 12:16:29 +0200 Subject: [PATCH 13/15] Add missing licence headers to InstaScale test files --- test/e2e/instascale_app_wrapper.go | 16 ++++++++++++++++ test/e2e/instascale_machinepool_test.go | 16 ++++++++++++++++ test/support/clusterpools.go | 16 ++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/test/e2e/instascale_app_wrapper.go b/test/e2e/instascale_app_wrapper.go index d5f14efe..7586d659 100644 --- a/test/e2e/instascale_app_wrapper.go +++ b/test/e2e/instascale_app_wrapper.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package e2e import ( diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go index ad4143cf..12349d9d 100644 --- a/test/e2e/instascale_machinepool_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package e2e import ( diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go index 9b1f7848..b321aeb0 100644 --- a/test/support/clusterpools.go +++ b/test/support/clusterpools.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support import ( From a34ea1928d7b1737488267b717f7f04888b54fdf Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Thu, 5 Oct 2023 12:30:46 +0100 Subject: [PATCH 14/15] skip test if not OSD --- test/e2e/instascale_machinepool_test.go | 4 ++++ test/support/codeflare.go | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go index 12349d9d..1f62de0d 100644 --- a/test/e2e/instascale_machinepool_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -33,6 +33,10 @@ func TestInstascaleMachinePool(t *testing.T) { test := With(t) test.T().Parallel() + if !IsOsd(test) { + test.T().Skip("Skipping test as not running on an OSD cluster") + } + namespace := test.NewTestNamespace() // Test configuration diff --git a/test/support/codeflare.go b/test/support/codeflare.go index 3772ce44..bf336775 100644 --- a/test/support/codeflare.go +++ b/test/support/codeflare.go @@ -18,6 +18,9 @@ package support import ( "os" + "strconv" + + "github.com/onsi/gomega" ) const ( @@ -38,6 +41,8 @@ const ( InstaScaleOcmSecretNamespace = "INSTASCALE_OCM_SECRET_NAMESPACE" // Cluster ID for OSD cluster used in tests, used for testing InstaScale OsdClusterID = "CLUSTERID" + // Determine if test is being run on an OSD cluster, used for testing InstaScale. + IsOSD = "IS_OSD" ) func GetCodeFlareSDKVersion() string { @@ -68,6 +73,18 @@ func GetOsdClusterId() (string, bool) { return os.LookupEnv(OsdClusterID) } +func IsOsd(test Test) bool { + test.T().Helper() + env := lookupEnvOrDefault(IsOSD, "false") + osd, err := strconv.ParseBool(env) + if err != nil { + test.T().Logf("error parsing IS_OSD environment variable, using default 'false' value, error: %v ", err) + return false + } + test.Expect(err).NotTo(gomega.HaveOccurred()) + return osd +} + func lookupEnvOrDefault(key, value string) string { if v, ok := os.LookupEnv(key); ok { return v From ff040e3ced5212fb844aa1d491b4cde4b9a21732 Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Fri, 6 Oct 2023 15:26:54 +0100 Subject: [PATCH 15/15] addressing feedback --- test/e2e/instascale_app_wrapper.go | 6 --- test/e2e/instascale_machinepool_test.go | 31 ++++--------- test/support/clusterpools.go | 44 ------------------ test/support/config_map.go | 45 ------------------- test/support/core.go | 21 +++++++++ test/support/{codeflare.go => environment.go} | 36 +++++---------- test/support/ocm.go | 24 +++++++++- 7 files changed, 65 insertions(+), 142 deletions(-) delete mode 100644 test/support/clusterpools.go delete mode 100644 test/support/config_map.go rename test/support/{codeflare.go => environment.go} (67%) diff --git a/test/e2e/instascale_app_wrapper.go b/test/e2e/instascale_app_wrapper.go index 7586d659..f9ec3bd0 100644 --- a/test/e2e/instascale_app_wrapper.go +++ b/test/e2e/instascale_app_wrapper.go @@ -17,7 +17,6 @@ limitations under the License. package e2e import ( - . "github.com/onsi/gomega" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" batchv1 "k8s.io/api/batch/v1" @@ -138,11 +137,6 @@ func createInstaScaleJobAppWrapper(test Test, namespace *corev1.Namespace, confi } _, err := test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) - - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutGpuProvisioning). - Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) return job, aw, err } diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go index 1f62de0d..b6ad0782 100644 --- a/test/e2e/instascale_machinepool_test.go +++ b/test/e2e/instascale_machinepool_test.go @@ -22,35 +22,29 @@ import ( . "github.com/onsi/gomega" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" - . "github.com/project-codeflare/codeflare-operator/test/support" ) func TestInstascaleMachinePool(t *testing.T) { - test := With(t) test.T().Parallel() - if !IsOsd(test) { + if !IsOsd() { test.T().Skip("Skipping test as not running on an OSD cluster") } namespace := test.NewTestNamespace() // Test configuration - testConfigData := map[string][]byte{ + cm := CreateConfigMap(test, namespace.Name, map[string][]byte{ // pip requirements "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), // MNIST training script "mnist.py": ReadFile(test, "mnist.py"), - } - cm := CreateConfigMap(test, namespace.Name, testConfigData) + }) //create OCM connection connection := CreateOCMConnection(test) - defer connection.Close() // check existing cluster machine pool resources @@ -59,25 +53,18 @@ func TestInstascaleMachinePool(t *testing.T) { ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) // Setup batch job and AppWrapper - job, aw, err := createInstaScaleJobAppWrapper(test, namespace, cm) + _, aw, err := createInstaScaleJobAppWrapper(test, namespace, cm) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) + + // assert that AppWrapper goes to "Running" state + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutGpuProvisioning). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) // look for machine pool with aw name - expect to find it test.Eventually(MachinePools(test, connection), TestTimeoutLong). Should(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) - // Assert that the job has completed - test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( - Or( - WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), - WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), - )) - - // Assert the job has completed successfully - test.Expect(GetJob(test, job.Namespace, job.Name)). - To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) - test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted))) diff --git a/test/support/clusterpools.go b/test/support/clusterpools.go deleted file mode 100644 index b321aeb0..00000000 --- a/test/support/clusterpools.go +++ /dev/null @@ -1,44 +0,0 @@ -/* -Copyright 2023. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package support - -import ( - "github.com/onsi/gomega" - - ocmsdk "github.com/openshift-online/ocm-sdk-go" - cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" -) - -func MachinePools(t Test, connection *ocmsdk.Connection) func(g gomega.Gomega) []*cmv1.MachinePool { - osdClusterId, found := GetOsdClusterId() - t.Expect(found).To(gomega.BeTrue(), "OSD cluster id not found, please configure environment properly") - - return func(g gomega.Gomega) []*cmv1.MachinePool { - machinePoolsListResponse, err := connection.ClustersMgmt().V1().Clusters().Cluster(osdClusterId).MachinePools().List().Send() - g.Expect(err).NotTo(gomega.HaveOccurred()) - return machinePoolsListResponse.Items().Slice() - } -} - -func GetMachinePools(t Test, connection *ocmsdk.Connection) []*cmv1.MachinePool { - t.T().Helper() - return MachinePools(t, connection)(t) -} - -func MachinePoolId(machinePool *cmv1.MachinePool) string { - return machinePool.ID() -} diff --git a/test/support/config_map.go b/test/support/config_map.go deleted file mode 100644 index 392da42b..00000000 --- a/test/support/config_map.go +++ /dev/null @@ -1,45 +0,0 @@ -/* -Copyright 2023. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package support - -import ( - "github.com/onsi/gomega" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func CreateConfigMap(t Test, namespace string, content map[string][]byte) *corev1.ConfigMap { - configMap := &corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - GenerateName: "config-", - Namespace: namespace, - }, - BinaryData: content, - Immutable: Ptr(true), - } - - configMap, err := t.Client().Core().CoreV1().ConfigMaps(namespace).Create(t.Ctx(), configMap, metav1.CreateOptions{}) - t.Expect(err).NotTo(gomega.HaveOccurred()) - t.T().Logf("Created ConfigMap %s/%s successfully", configMap.Namespace, configMap.Name) - - return configMap -} diff --git a/test/support/core.go b/test/support/core.go index ee012c82..70c48c20 100644 --- a/test/support/core.go +++ b/test/support/core.go @@ -27,6 +27,27 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) +func CreateConfigMap(t Test, namespace string, content map[string][]byte) *corev1.ConfigMap { + configMap := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "config-", + Namespace: namespace, + }, + BinaryData: content, + Immutable: Ptr(true), + } + + configMap, err := t.Client().Core().CoreV1().ConfigMaps(namespace).Create(t.Ctx(), configMap, metav1.CreateOptions{}) + t.Expect(err).NotTo(gomega.HaveOccurred()) + t.T().Logf("Created ConfigMap %s/%s successfully", configMap.Namespace, configMap.Name) + + return configMap +} + func Raw(t Test, obj runtime.Object) runtime.RawExtension { t.T().Helper() data, err := json.Marshal(obj) diff --git a/test/support/codeflare.go b/test/support/environment.go similarity index 67% rename from test/support/codeflare.go rename to test/support/environment.go index bf336775..bf3b2af7 100644 --- a/test/support/codeflare.go +++ b/test/support/environment.go @@ -18,9 +18,7 @@ package support import ( "os" - "strconv" - - "github.com/onsi/gomega" + "strings" ) const ( @@ -35,14 +33,11 @@ const ( // The testing output directory, to write output files into. CodeFlareTestOutputDir = "CODEFLARE_TEST_OUTPUT_DIR" - // The name of a secret containing InstaScale OCM token. - InstaScaleOcmSecretName = "INSTASCALE_OCM_SECRET_NAME" - // The namespace where a secret containing InstaScale OCM token is stored. - InstaScaleOcmSecretNamespace = "INSTASCALE_OCM_SECRET_NAMESPACE" + // The namespace where a secret containing InstaScale OCM token is stored and the secret name. + InstaScaleOcmSecret = "INSTASCALE_OCM_SECRET" + // Cluster ID for OSD cluster used in tests, used for testing InstaScale OsdClusterID = "CLUSTERID" - // Determine if test is being run on an OSD cluster, used for testing InstaScale. - IsOSD = "IS_OSD" ) func GetCodeFlareSDKVersion() string { @@ -61,28 +56,21 @@ func GetPyTorchImage() string { return lookupEnvOrDefault(CodeFlareTestPyTorchImage, "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime") } -func GetInstaScaleOcmSecretName() string { - return lookupEnvOrDefault(InstaScaleOcmSecretName, "instascale-ocm-secret") -} - -func GetInstaScaleOcmSecretNamespace() string { - return lookupEnvOrDefault(InstaScaleOcmSecretNamespace, "default") +func GetInstascaleOcmSecret() (string, string) { + res := strings.SplitN(lookupEnvOrDefault(InstaScaleOcmSecret, "default/instascale-com-secret"), "/", 2) + return res[0], res[1] } func GetOsdClusterId() (string, bool) { return os.LookupEnv(OsdClusterID) } -func IsOsd(test Test) bool { - test.T().Helper() - env := lookupEnvOrDefault(IsOSD, "false") - osd, err := strconv.ParseBool(env) - if err != nil { - test.T().Logf("error parsing IS_OSD environment variable, using default 'false' value, error: %v ", err) - return false +func IsOsd() bool { + osdClusterId, found := GetOsdClusterId() + if found && osdClusterId != "" { + return true } - test.Expect(err).NotTo(gomega.HaveOccurred()) - return osd + return false } func lookupEnvOrDefault(key, value string) string { diff --git a/test/support/ocm.go b/test/support/ocm.go index 91738fbc..3d6bd7f1 100644 --- a/test/support/ocm.go +++ b/test/support/ocm.go @@ -25,10 +25,12 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ocmsdk "github.com/openshift-online/ocm-sdk-go" + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" ) func CreateOCMConnection(test Test) *ocmsdk.Connection { - instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets(GetInstaScaleOcmSecretNamespace()).Get(test.Ctx(), GetInstaScaleOcmSecretName(), metav1.GetOptions{}) + secretNamespace, secretName := GetInstascaleOcmSecret() + instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets(secretNamespace).Get(test.Ctx(), secretName, metav1.GetOptions{}) test.Expect(err).NotTo(gomega.HaveOccurred()) ocmToken := string(instascaleOCMSecret.Data["token"]) @@ -50,3 +52,23 @@ func buildOCMConnection(secret string) (*ocmsdk.Connection, error) { return connection, nil } + +func MachinePools(t Test, connection *ocmsdk.Connection) func(g gomega.Gomega) []*cmv1.MachinePool { + osdClusterId, found := GetOsdClusterId() + t.Expect(found).To(gomega.BeTrue(), "OSD cluster id not found, please configure environment properly") + + return func(g gomega.Gomega) []*cmv1.MachinePool { + machinePoolsListResponse, err := connection.ClustersMgmt().V1().Clusters().Cluster(osdClusterId).MachinePools().List().Send() + g.Expect(err).NotTo(gomega.HaveOccurred()) + return machinePoolsListResponse.Items().Slice() + } +} + +func GetMachinePools(t Test, connection *ocmsdk.Connection) []*cmv1.MachinePool { + t.T().Helper() + return MachinePools(t, connection)(t) +} + +func MachinePoolId(machinePool *cmv1.MachinePool) string { + return machinePool.ID() +}