Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding instascale e2e test #271

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/e2e_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ jobs:
export CODEFLARE_TEST_TIMEOUT_SHORT=1m
export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m
export CODEFLARE_TEST_TIMEOUT_LONG=10m
export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m

export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.19

require (
github.com/onsi/gomega v1.27.10
github.com/openshift-online/ocm-sdk-go v0.1.368
github.com/openshift/api v0.0.0-20230213134911-7ba313770556
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
github.com/project-codeflare/instascale v0.0.9
Expand Down Expand Up @@ -69,7 +70,6 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/openshift-online/ocm-sdk-go v0.1.327 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -377,8 +377,8 @@ github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl
github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro=
github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M=
github.com/openshift-online/ocm-sdk-go v0.1.327 h1:WR822bGdQoMuZ2+dFdhZz3fpD2NlJhGr+F3FJPXvqFU=
github.com/openshift-online/ocm-sdk-go v0.1.327/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw=
github.com/openshift-online/ocm-sdk-go v0.1.368 h1:qP+gkChV8WDwwpkUw1xUyjTXKdvrwyd70Gff2GMUSeU=
github.com/openshift-online/ocm-sdk-go v0.1.368/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw=
github.com/openshift/api v0.0.0-20230213134911-7ba313770556 h1:7W2fOhJicyEff24VaF7ASNzPtYvr+iSCVft4SIBAzaE=
github.com/openshift/api v0.0.0-20230213134911-7ba313770556/go.mod h1:aQ6LDasvHMvHZXqLHnX2GRmnfTWCF/iIwz8EMTTIE9A=
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c h1:CV76yFOTXmq9VciBR3Bve5ZWzSxdft7gaMVB3kS0rwg=
Expand Down
142 changes: 142 additions & 0 deletions test/e2e/instascale_app_wrapper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
Copyright 2023.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "github.com/project-codeflare/codeflare-operator/test/support"
)

func createInstaScaleJobAppWrapper(test Test, namespace *corev1.Namespace, config *corev1.ConfigMap) (*batchv1.Job, *mcadv1beta1.AppWrapper, error) {
// Batch Job
job := &batchv1.Job{
TypeMeta: metav1.TypeMeta{
APIVersion: batchv1.SchemeGroupVersion.String(),
Kind: "Job",
},
ObjectMeta: metav1.ObjectMeta{
Name: "mnist",
Namespace: namespace.Name,
},
Spec: batchv1.JobSpec{
Completions: Ptr(int32(1)),
Parallelism: Ptr(int32(1)),
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "job",
Image: GetPyTorchImage(),
Env: []corev1.EnvVar{
{Name: "PYTHONUSERBASE", Value: "/workdir"},
},
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
Fiona-Waters marked this conversation as resolved.
Show resolved Hide resolved
Args: []string{"$PYTHONUSERBASE"},
VolumeMounts: []corev1.VolumeMount{
{
Name: "test",
MountPath: "/test",
},
{
Name: "workdir",
MountPath: "/workdir",
},
},
WorkingDir: "/workdir",
},
},
Volumes: []corev1.Volume{
{
Name: "test",
VolumeSource: corev1.VolumeSource{
ConfigMap: &corev1.ConfigMapVolumeSource{
LocalObjectReference: corev1.LocalObjectReference{
Name: config.Name,
},
},
},
},
{
Name: "workdir",
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
},
},
RestartPolicy: corev1.RestartPolicyNever,
},
},
},
}

// create an appwrapper
aw := &mcadv1beta1.AppWrapper{
ObjectMeta: metav1.ObjectMeta{
Name: "test-instascale",
Namespace: namespace.Name,
Labels: map[string]string{
"orderedinstance": "g4dn.xlarge",
},
},
Spec: mcadv1beta1.AppWrapperSpec{
AggrResources: mcadv1beta1.AppWrapperResourceList{
GenericItems: []mcadv1beta1.AppWrapperGenericResource{
{
CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{
{
Replicas: 1,
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("250m"),
corev1.ResourceMemory: resource.MustParse("512Mi"),
"nvidia.com/gpu": resource.MustParse("1"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("1G"),
"nvidia.com/gpu": resource.MustParse("1"),
},
},
{
Replicas: 1,
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("250m"),
corev1.ResourceMemory: resource.MustParse("512Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("500m"),
corev1.ResourceMemory: resource.MustParse("1G"),
},
},
},
GenericTemplate: Raw(test, job),
CompletionStatus: "Complete",
},
},
},
},
}

_, err := test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{})

return job, aw, err
}
75 changes: 75 additions & 0 deletions test/e2e/instascale_machinepool_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
Copyright 2023.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
"testing"

. "github.com/onsi/gomega"
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"

. "github.com/project-codeflare/codeflare-operator/test/support"
)

// TestInstascaleMachinePool submits the InstaScale AppWrapper and checks that a
// machine pool with the expected ID is absent beforehand, present while the
// AppWrapper is active, and absent again once the AppWrapper has completed.
// The test is skipped when IsOsd reports that no OSD cluster is configured.
func TestInstascaleMachinePool(t *testing.T) {
	test := With(t)
	test.T().Parallel()

	if !IsOsd() {
		test.T().Skip("Skipping test as not running on an OSD cluster")
	}

	// ID of the machine pool this test watches for.
	const machinePoolID = "test-instascale-g4dn-xlarge"

	namespace := test.NewTestNamespace()

	// Files made available to the job: pip requirements and the MNIST
	// training script.
	config := CreateConfigMap(test, namespace.Name, map[string][]byte{
		"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
		"mnist.py":         ReadFile(test, "mnist.py"),
	})

	// Open the OCM connection used for every machine pool query below.
	ocm := CreateOCMConnection(test)
	defer ocm.Close()

	// Precondition: the machine pool must not exist yet.
	test.Expect(GetMachinePools(test, ocm)).
		ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal(machinePoolID))))

	// Submit the batch job wrapped in an AppWrapper.
	_, aw, err := createInstaScaleJobAppWrapper(test, namespace, config)
	test.Expect(err).NotTo(HaveOccurred())
	test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name)

	// The AppWrapper has to become active within the GPU provisioning timeout.
	test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutGpuProvisioning).
		Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive)))

	// The machine pool is now expected to show up.
	test.Eventually(MachinePools(test, ocm), TestTimeoutLong).
		Should(ContainElement(WithTransform(MachinePoolId, Equal(machinePoolID))))

	// The AppWrapper then has to reach the completed state.
	test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort).
		Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted)))

	// Finally the machine pool is expected to disappear again.
	test.Eventually(MachinePools(test, ocm), TestTimeoutLong).
		ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal(machinePoolID))))
}
21 changes: 21 additions & 0 deletions test/support/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,27 @@ import (
"k8s.io/apimachinery/pkg/runtime"
)

// CreateConfigMap creates an immutable ConfigMap in the given namespace whose
// BinaryData is content, and returns the object returned by the API server.
//
// The name is generated by the server from the "config-" prefix. A creation
// error is reported through t.Expect; the created object's namespace and name
// are logged afterwards.
func CreateConfigMap(t Test, namespace string, content map[string][]byte) *corev1.ConfigMap {
	// Mark as a test helper, as Raw does, so failures and the log line below
	// are attributed to the calling test rather than to this function.
	t.T().Helper()

	configMap := &corev1.ConfigMap{
		TypeMeta: metav1.TypeMeta{
			APIVersion: corev1.SchemeGroupVersion.String(),
			Kind:       "ConfigMap",
		},
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "config-",
			Namespace:    namespace,
		},
		BinaryData: content,
		Immutable:  Ptr(true),
	}

	configMap, err := t.Client().Core().CoreV1().ConfigMaps(namespace).Create(t.Ctx(), configMap, metav1.CreateOptions{})
	t.Expect(err).NotTo(gomega.HaveOccurred())
	t.T().Logf("Created ConfigMap %s/%s successfully", configMap.Namespace, configMap.Name)

	return configMap
}

func Raw(t Test, obj runtime.Object) runtime.RawExtension {
t.T().Helper()
data, err := json.Marshal(obj)
Expand Down
25 changes: 24 additions & 1 deletion test/support/codeflare.go → test/support/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package support

import (
"os"
"strings"
)

const (
Expand All @@ -30,8 +31,13 @@ const (
CodeFlareTestPyTorchImage = "CODEFLARE_TEST_PYTORCH_IMAGE"

// The testing output directory, to write output files into.

CodeFlareTestOutputDir = "CODEFLARE_TEST_OUTPUT_DIR"

// Namespace and name, in "namespace/name" form, of the secret containing the InstaScale OCM token.
InstaScaleOcmSecret = "INSTASCALE_OCM_SECRET"

// Cluster ID for OSD cluster used in tests, used for testing InstaScale
OsdClusterID = "CLUSTERID"
)

func GetCodeFlareSDKVersion() string {
Expand All @@ -50,6 +56,23 @@ func GetPyTorchImage() string {
return lookupEnvOrDefault(CodeFlareTestPyTorchImage, "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime")
}

// GetInstascaleOcmSecret returns the namespace and the name of the secret
// holding the InstaScale OCM token.
//
// The value is read from the environment variable named by InstaScaleOcmSecret
// and is expected in "namespace/name" form; everything after the first "/" is
// the secret name. A value without "/" is treated as a secret name in the
// "default" namespace, and an unset or empty value falls back to the built-in
// default, so the function never panics on malformed input.
func GetInstascaleOcmSecret() (string, string) {
	// NOTE(review): "instascale-com-secret" looks like a transposition of
	// "ocm"; kept unchanged here — confirm the intended default secret name.
	const fallback = "default/instascale-com-secret"

	value := lookupEnvOrDefault(InstaScaleOcmSecret, fallback)
	if value == "" {
		// The variable is set but empty: behave as if it were unset.
		value = fallback
	}

	namespace, name, found := strings.Cut(value, "/")
	if !found {
		// No namespace part was given: interpret the value as a bare secret name.
		return "default", value
	}
	return namespace, name
}

// GetOsdClusterId reads the environment variable named by OsdClusterID.
// It returns the variable's value and whether the variable is set at all;
// a variable that is set but empty yields ("", true).
func GetOsdClusterId() (string, bool) {
	clusterID, isSet := os.LookupEnv(OsdClusterID)
	return clusterID, isSet
}

// IsOsd reports whether an OSD cluster ID is configured, i.e. whether
// GetOsdClusterId finds the variable set to a non-empty value.
func IsOsd() bool {
	clusterID, isSet := GetOsdClusterId()
	return isSet && clusterID != ""
}

func lookupEnvOrDefault(key, value string) string {
if v, ok := os.LookupEnv(key); ok {
return v
Expand Down
Loading
Loading