From 7cd925b389a06a76d1a1cd5863a6145628634b36 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 13 Jun 2023 12:01:42 +0200 Subject: [PATCH 01/34] Initial e2e tests --- .github/workflows/e2e_tests.yaml | 180 +++++++++++++++++++++++++++ .pre-commit-config.yaml | 1 - Makefile | 17 ++- go.mod | 2 + go.sum | 4 + test/e2e/kind.yaml | 32 +++++ test/e2e/mcad_test.go | 203 +++++++++++++++++++++++++++++++ test/support/client.go | 87 +++++++++++++ test/support/codeflare.go | 22 ++++ test/support/core.go | 17 +++ test/support/gomega.go | 28 +++++ test/support/mcad.go | 22 ++++ test/support/namespace.go | 56 +++++++++ test/support/support.go | 17 +++ test/support/test.go | 88 ++++++++++++++ test/support/utils.go | 5 + 16 files changed, 777 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/e2e_tests.yaml create mode 100644 test/e2e/kind.yaml create mode 100644 test/e2e/mcad_test.go create mode 100644 test/support/client.go create mode 100644 test/support/codeflare.go create mode 100644 test/support/core.go create mode 100644 test/support/gomega.go create mode 100644 test/support/mcad.go create mode 100644 test/support/namespace.go create mode 100644 test/support/support.go create mode 100644 test/support/test.go create mode 100644 test/support/utils.go diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml new file mode 100644 index 00000000..43e1c2ef --- /dev/null +++ b/.github/workflows/e2e_tests.yaml @@ -0,0 +1,180 @@ +name: e2e + +on: + pull_request: + branches: + - main + - 'release-*' + paths-ignore: + - 'docs/**' + - '**.adoc' + - '**.md' + - 'LICENSE' + push: + branches: + - main + - 'release-*' + paths-ignore: + - 'docs/**' + - '**.adoc' + - '**.md' + - 'LICENSE' + +concurrency: + group: ${{ github.head_ref }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + kubernetes: + + runs-on: ubuntu-20.04 + + steps: + - name: Cleanup + run: | + ls -lart + echo "Initial status:" + df -h + + echo "Cleaning up resources:" + sudo swapoff -a + sudo rm -f /swapfile + sudo apt clean + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + docker rmi $(docker image ls -aq) + + echo "Final status:" + df -h + + - name: Checkout code + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Init directories + run: | + TEMP_DIR="$(pwd)/tmp" + mkdir -p "${TEMP_DIR}" + echo "TEMP_DIR=${TEMP_DIR}" >> $GITHUB_ENV + + mkdir -p "$(pwd)/bin" + echo "$(pwd)/bin" >> $GITHUB_PATH + + - name: Set Go + uses: actions/setup-go@v3 + with: + go-version: v1.18 + + - name: Container image registry + run: | + podman run -d -p 5000:5000 --name registry registry:2.8.1 + + export REGISTRY_ADDRESS=$(hostname -i):5000 + echo "REGISTRY_ADDRESS=${REGISTRY_ADDRESS}" >> $GITHUB_ENV + echo "Container image registry started at ${REGISTRY_ADDRESS}" + + KIND_CONFIG_FILE=${{ env.TEMP_DIR }}/kind.yaml + echo "KIND_CONFIG_FILE=${KIND_CONFIG_FILE}" >> $GITHUB_ENV + envsubst < ./test/e2e/kind.yaml > ${KIND_CONFIG_FILE} + + sudo --preserve-env=REGISTRY_ADDRESS sh -c 'cat > /etc/containers/registries.conf.d/local.conf < Date: Mon, 19 Jun 2023 17:05:36 +0200 Subject: [PATCH 02/34] test: Submit MNIST RayJob --- go.mod | 1 + go.sum | 2 + test/e2e/mcad_test.go | 80 +++++++++++++++++++++- test/e2e/mnist.py | 148 +++++++++++++++++++++++++++++++++++++++++ test/e2e/support.go | 6 ++ test/support/client.go | 13 ++++ test/support/ray.go | 24 +++++++ 7 files changed, 271 insertions(+), 3 deletions(-) create mode 100644 test/e2e/mnist.py create mode 100644 test/e2e/support.go create mode 100644 test/support/ray.go diff --git a/go.mod b/go.mod index 621853fb..fff6657e 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/onsi/gomega v1.27.6 github.com/project-codeflare/multi-cluster-app-dispatcher v1.31.0 github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9 + github.com/rs/xid v1.5.0 go.uber.org/zap v1.24.0 k8s.io/api v0.26.3 k8s.io/apimachinery v0.26.3 diff --git a/go.sum b/go.sum index 8e877d1f..c475743c 100644 --- a/go.sum +++ b/go.sum @@ -465,6 +465,8 @@ github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9 h github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9/go.mod h1:2auArgwD9dXXJz1oc7SqQ4U/rHdpwnrBwG98kr8OWXA= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc= +github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= diff --git a/test/e2e/mcad_test.go b/test/e2e/mcad_test.go index 5db8bf7c..3c86f51a 100644 --- a/test/e2e/mcad_test.go +++ b/test/e2e/mcad_test.go @@ -18,6 +18,7 @@ limitations under the License. package e2e import ( + "encoding/base64" "testing" . "github.com/onsi/gomega" @@ -26,10 +27,11 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" - rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" + "github.com/rs/xid" . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" ) func TestJobSubmissionInRayCluster(t *testing.T) { @@ -39,7 +41,29 @@ func TestJobSubmissionInRayCluster(t *testing.T) { // Create a namespace namespace := test.NewTestNamespace() + // Job script + mnist, err := scripts.ReadFile("mnist.py") + test.Expect(err).NotTo(HaveOccurred()) + + configMap := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.GroupName, + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + "mnist.py": mnist, + }, + Immutable: Ptr(true), + } + configMap, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), configMap, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + // RayCluster + clusterID := xid.New() rayCluster := &rayv1alpha1.RayCluster{ TypeMeta: metav1.TypeMeta{ APIVersion: rayv1alpha1.GroupVersion.String(), @@ -48,6 +72,9 @@ func TestJobSubmissionInRayCluster(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "raycluster-autoscaler", Namespace: namespace.Name, + Labels: map[string]string{ + RayJobDefaultClusterSelectorKey: clusterID.String(), + }, }, Spec: rayv1alpha1.RayClusterSpec{ RayVersion: "2.0.0", @@ -110,6 +137,24 @@ func TestJobSubmissionInRayCluster(t *testing.T) { corev1.ResourceMemory: resource.MustParse("1G"), }, }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "mnist", + MountPath: "/home/ray/jobs", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "mnist", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: configMap.Name, + }, + }, + }, }, }, }, @@ -195,9 +240,38 @@ func TestJobSubmissionInRayCluster(t *testing.T) { }, } - _, err := test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) + _, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) test.Expect(err).NotTo(HaveOccurred()) test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + rayJob := &rayv1alpha1.RayJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rayv1alpha1.GroupVersion.String(), + Kind: "RayJob", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: rayv1alpha1.RayJobSpec{ + Entrypoint: "python /home/ray/jobs/mnist.py", + RuntimeEnv: base64.StdEncoding.EncodeToString([]byte(` +pytorch_lightning==1.5.10 +ray_lightning +torchmetrics==0.9.1 +torchvision==0.12.0 +`)), + ClusterSelector: map[string]string{ + RayJobDefaultClusterSelectorKey: clusterID.String(), + }, + ShutdownAfterJobFinishes: false, + }, + } + _, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + test.Eventually(RayJob(test, namespace, rayJob.Name), TestTimeoutLong). + Should(WithTransform(RayJobStatus, Equal(rayv1alpha1.JobStatusSucceeded))) } diff --git a/test/e2e/mnist.py b/test/e2e/mnist.py new file mode 100644 index 00000000..e60ec7c6 --- /dev/null +++ b/test/e2e/mnist.py @@ -0,0 +1,148 @@ +# In[] +import os + +import torch +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks.progress import TQDMProgressBar +from pytorch_lightning.loggers import CSVLogger +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchmetrics import Accuracy +from torchvision import transforms +from torchvision.datasets import MNIST + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 +# %% + +print("prior to running the trainer") +print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) +print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) + + +class LitMNIST(LightningModule): + def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): + + super().__init__() + + # Set our init args as class attributes + self.data_dir = data_dir + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + # Hardcode some dataset specific attributes + self.num_classes = 10 + self.dims = (1, 28, 28) + channels, width, height = self.dims + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + # Define PyTorch model + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, self.num_classes), + ) + + self.val_accuracy = Accuracy() + self.test_accuracy = Accuracy() + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.val_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", self.val_accuracy, prog_bar=True) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.test_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("test_loss", loss, prog_bar=True) + self.log("test_acc", self.test_accuracy, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + return optimizer + + #################### + # DATA RELATED HOOKS + #################### + + def prepare_data(self): + # download + print("Downloading MNIST dataset...") + MNIST(self.data_dir, train=True, download=True) + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST( + self.data_dir, train=False, transform=self.transform + ) + + def train_dataloader(self): + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) + + +# Init DataLoader from MNIST Dataset + +model = LitMNIST() + +print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) +print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) + +# Initialize a trainer +trainer = Trainer( + accelerator="auto", + # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + max_epochs=5, + callbacks=[TQDMProgressBar(refresh_rate=20)], + num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), + devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), + strategy="ddp", +) + +# Train the model ⚡ +trainer.fit(model) diff --git a/test/e2e/support.go b/test/e2e/support.go new file mode 100644 index 00000000..27e9e54b --- /dev/null +++ b/test/e2e/support.go @@ -0,0 +1,6 @@ +package e2e + +import "embed" + +//go:embed *.py +var scripts embed.FS diff --git a/test/support/client.go b/test/support/client.go index 0026b293..634df4f1 100644 --- a/test/support/client.go +++ b/test/support/client.go @@ -27,18 +27,21 @@ import ( codeflareclient "github.com/project-codeflare/codeflare-operator/client/clientset/versioned" mcadclient "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned" + rayclient "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned" ) type Client interface { Core() kubernetes.Interface CodeFlare() codeflareclient.Interface MCAD() mcadclient.Interface + Ray() rayclient.Interface } type testClient struct { core kubernetes.Interface codeflare codeflareclient.Interface mcad mcadclient.Interface + ray rayclient.Interface } var _ Client = (*testClient)(nil) @@ -55,6 +58,10 @@ func (t *testClient) MCAD() mcadclient.Interface { return t.mcad } +func (t *testClient) Ray() rayclient.Interface { + return t.ray +} + func newTestClient() (Client, error) { cfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( clientcmd.NewDefaultClientConfigLoadingRules(), @@ -79,9 +86,15 @@ func newTestClient() (Client, error) { return nil, err } + rayClient, err := rayclient.NewForConfig(cfg) + if err != nil { + return nil, err + } + return &testClient{ core: kubeClient, codeflare: codeFlareClient, mcad: mcadClient, + ray: rayClient, }, nil } diff --git a/test/support/ray.go b/test/support/ray.go new file mode 100644 index 00000000..dfc3b2bb --- /dev/null +++ b/test/support/ray.go @@ -0,0 +1,24 @@ +package support + +import ( + "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" +) + +const RayJobDefaultClusterSelectorKey = "ray.io/cluster" + +func RayJob(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gomega) *rayv1alpha1.RayJob { + return func(g gomega.Gomega) *rayv1alpha1.RayJob { + job, err := t.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Get(t.Ctx(), name, metav1.GetOptions{}) + g.Expect(err).NotTo(gomega.HaveOccurred()) + return job + } +} + +func RayJobStatus(job *rayv1alpha1.RayJob) rayv1alpha1.JobStatus { + return job.Status.JobStatus +} From 3dc70d360d4b5e721c24775176fd9e569d035c6d Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 19 Jun 2023 17:25:31 +0200 Subject: [PATCH 03/34] test: Fix RayCluster labels selector --- go.mod | 1 - go.sum | 2 -- test/e2e/mcad_test.go | 8 +------- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/go.mod b/go.mod index fff6657e..621853fb 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,6 @@ require ( github.com/onsi/gomega v1.27.6 github.com/project-codeflare/multi-cluster-app-dispatcher v1.31.0 github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9 - github.com/rs/xid v1.5.0 go.uber.org/zap v1.24.0 k8s.io/api v0.26.3 k8s.io/apimachinery v0.26.3 diff --git a/go.sum b/go.sum index c475743c..8e877d1f 100644 --- a/go.sum +++ b/go.sum @@ -465,8 +465,6 @@ github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9 h github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9/go.mod h1:2auArgwD9dXXJz1oc7SqQ4U/rHdpwnrBwG98kr8OWXA= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= -github.com/rs/xid v1.5.0 h1:mKX4bl4iPYJtEIxp6CYiUuLQ/8DYMoz0PUdtGgMFRVc= -github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= diff --git a/test/e2e/mcad_test.go b/test/e2e/mcad_test.go index 3c86f51a..4557484d 100644 --- a/test/e2e/mcad_test.go +++ b/test/e2e/mcad_test.go @@ -27,8 +27,6 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/rs/xid" - . "github.com/project-codeflare/codeflare-operator/test/support" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" @@ -63,7 +61,6 @@ func TestJobSubmissionInRayCluster(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) // RayCluster - clusterID := xid.New() rayCluster := &rayv1alpha1.RayCluster{ TypeMeta: metav1.TypeMeta{ APIVersion: rayv1alpha1.GroupVersion.String(), @@ -72,9 +69,6 @@ func TestJobSubmissionInRayCluster(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "raycluster-autoscaler", Namespace: namespace.Name, - Labels: map[string]string{ - RayJobDefaultClusterSelectorKey: clusterID.String(), - }, }, Spec: rayv1alpha1.RayClusterSpec{ RayVersion: "2.0.0", @@ -264,7 +258,7 @@ torchmetrics==0.9.1 torchvision==0.12.0 `)), ClusterSelector: map[string]string{ - RayJobDefaultClusterSelectorKey: clusterID.String(), + RayJobDefaultClusterSelectorKey: rayCluster.Name, }, ShutdownAfterJobFinishes: false, }, From b824ef1512f30e6db0513643241cf692b5bc9955 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 19 Jun 2023 17:28:10 +0200 Subject: [PATCH 04/34] e2e: Print KubeRay operator logs --- .github/workflows/e2e_tests.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 43e1c2ef..1f1a80e1 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -178,3 +178,9 @@ jobs: run: | echo "Printing MCAD controller logs" kubectl logs -n codeflare-system --tail -1 -l component=multi-cluster-application-dispatcher + + - name: Print KubeRay operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing KubeRay operator logs" + kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay From a6011cf4d05ffe1e28e873199eb87c560b69da3c Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 19 Jun 2023 17:44:58 +0200 Subject: [PATCH 05/34] test: Fix RayJob runtime environment --- test/e2e/mcad_test.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test/e2e/mcad_test.go b/test/e2e/mcad_test.go index 4557484d..930222a6 100644 --- a/test/e2e/mcad_test.go +++ b/test/e2e/mcad_test.go @@ -252,10 +252,16 @@ func TestJobSubmissionInRayCluster(t *testing.T) { Spec: rayv1alpha1.RayJobSpec{ Entrypoint: "python /home/ray/jobs/mnist.py", RuntimeEnv: base64.StdEncoding.EncodeToString([]byte(` -pytorch_lightning==1.5.10 -ray_lightning -torchmetrics==0.9.1 -torchvision==0.12.0 +{ + "pip": [ + "pytorch_lightning==1.5.10", + "ray_lightning", + "torchmetrics==0.9.1", + "torchvision==0.12.0" + ], + "env_vars": { + } +} `)), ClusterSelector: map[string]string{ RayJobDefaultClusterSelectorKey: rayCluster.Name, From aed9d2655ff74de28bf4e64f33223b2e922b8fda Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 20 Jun 2023 10:33:00 +0200 Subject: [PATCH 06/34] test: Print RayJob logs --- test/e2e/mcad_test.go | 7 ++++++- test/support/ray.go | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/test/e2e/mcad_test.go b/test/e2e/mcad_test.go index 930222a6..263bbcb4 100644 --- a/test/e2e/mcad_test.go +++ b/test/e2e/mcad_test.go @@ -269,9 +269,14 @@ func TestJobSubmissionInRayCluster(t *testing.T) { ShutdownAfterJobFinishes: false, }, } - _, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) + rayJob, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) test.Eventually(RayJob(test, namespace, rayJob.Name), TestTimeoutLong). Should(WithTransform(RayJobStatus, Equal(rayv1alpha1.JobStatusSucceeded))) + + rayJob, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Get(test.Ctx(), rayJob.Name, metav1.GetOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + test.T().Log(GetRayJobLogs(test, rayJob)) } diff --git a/test/support/ray.go b/test/support/ray.go index dfc3b2bb..3e1389ed 100644 --- a/test/support/ray.go +++ b/test/support/ray.go @@ -1,6 +1,8 @@ package support import ( + "encoding/json" + "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -22,3 +24,18 @@ func RayJob(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gome func RayJobStatus(job *rayv1alpha1.RayJob) rayv1alpha1.JobStatus { return job.Status.JobStatus } + +func GetRayJobLogs(t Test, job *rayv1alpha1.RayJob) string { + t.T().Helper() + response := t.Client().Core().CoreV1().RESTClient(). + Get(). + AbsPath("/api/v1/namespaces", job.Namespace, "services", "http:"+job.Status.RayClusterName+"-head-svc:dashboard", "proxy", "api", "jobs", job.Status.JobId, "logs").Do(t.Ctx()) + t.Expect(response.Error()).NotTo(gomega.HaveOccurred()) + + body := map[string]string{} + bytes, _ := response.Raw() + t.Expect(json.Unmarshal(bytes, &body)).To(gomega.Succeed()) + t.Expect(body).To(gomega.HaveKey("logs")) + + return body["logs"] +} From d98ff93fbac4888bd9ef11ba4d244a31bb86841a Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Wed, 21 Jun 2023 14:44:26 +0200 Subject: [PATCH 07/34] test: Document how to run e2e tests locally --- README.md | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index afc9dafa..31f21640 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # codeflare-operator + Operator for installation and lifecycle management of CodeFlare distributed workload stack, starting with MCAD and InstaScale @@ -14,7 +15,36 @@ CodeFlare Stack Compatibility Matrix | KubeRay | v0.5.0 | -## Release process +## Development + +### Testing + +The e2e tests can be executed locally by running the following commands: + +1. Setup the test cluster: + + ```bash + # Create a KinD cluster + $ kind create cluster --image kindest/node:v1.25.8 + # Install the CRDs + $ make install + ``` + +2. Start the operator locally: + + ```bash + $ make run + ``` + +3. In a separate terminal, run the e2e suite: + + ```bash + $ make test-e2e + ``` + + Alternatively, You can run the e2e test(s) from your IDE / debugger. + +## Release Prerequisite: - Build and release [MCAD](https://github.com/project-codeflare/multi-cluster-app-dispatcher) From 0a38102f167c384f717ab44732dca32206f26d96 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Wed, 21 Jun 2023 15:24:11 +0200 Subject: [PATCH 08/34] test: Polish MNIST RayJob test --- test/e2e/kind.yaml | 23 +++++++++---------- ...o => mnist_rayjob_mcad_raycluster_test.go} | 15 ++++++------ test/e2e/support.go | 16 +++++++++++++ test/support/client.go | 15 ++++++------ test/support/codeflare.go | 22 ------------------ test/support/core.go | 16 +++++++++++++ test/support/gomega.go | 16 +++++++++++++ test/support/mcad.go | 16 +++++++++++++ test/support/namespace.go | 15 ++++++------ test/support/ray.go | 16 +++++++++++++ test/support/support.go | 16 +++++++++++++ test/support/test.go | 16 +++++++++++++ test/support/utils.go | 16 +++++++++++++ 13 files changed, 160 insertions(+), 58 deletions(-) rename test/e2e/{mcad_test.go => mnist_rayjob_mcad_raycluster_test.go} (94%) delete mode 100644 test/support/codeflare.go diff --git a/test/e2e/kind.yaml b/test/e2e/kind.yaml index ebb1f731..4546589b 100644 --- a/test/e2e/kind.yaml +++ b/test/e2e/kind.yaml @@ -1,18 +1,17 @@ # --------------------------------------------------------------------------- -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at +# Copyright 2023. # -# http://www.apache.org/licenses/LICENSE-2.0 +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # --------------------------------------------------------------------------- kind: Cluster diff --git a/test/e2e/mcad_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go similarity index 94% rename from test/e2e/mcad_test.go rename to test/e2e/mnist_rayjob_mcad_raycluster_test.go index 263bbcb4..0fe82043 100644 --- a/test/e2e/mcad_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -1,12 +1,11 @@ /* -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. See the NOTICE file distributed with -this work for additional information regarding copyright ownership. -The ASF licenses this file to You under the Apache License, Version 2.0 -(the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at +Copyright 2023. - http://www.apache.org/licenses/LICENSE-2.0 +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -32,7 +31,7 @@ import ( rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" ) -func TestJobSubmissionInRayCluster(t *testing.T) { +func TestMNISTRayJobMCADRayCluster(t *testing.T) { test := With(t) test.T().Parallel() diff --git a/test/e2e/support.go b/test/e2e/support.go index 27e9e54b..82f980eb 100644 --- a/test/e2e/support.go +++ b/test/e2e/support.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package e2e import "embed" diff --git a/test/support/client.go b/test/support/client.go index 634df4f1..a8efa76d 100644 --- a/test/support/client.go +++ b/test/support/client.go @@ -1,12 +1,11 @@ /* -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. See the NOTICE file distributed with -this work for additional information regarding copyright ownership. -The ASF licenses this file to You under the Apache License, Version 2.0 -(the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/test/support/codeflare.go b/test/support/codeflare.go deleted file mode 100644 index 3c26caa2..00000000 --- a/test/support/codeflare.go +++ /dev/null @@ -1,22 +0,0 @@ -package support - -import ( - "github.com/onsi/gomega" - - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - codeflarev1alpha1 "github.com/project-codeflare/codeflare-operator/api/codeflare/v1alpha1" -) - -func MCAD(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gomega) *codeflarev1alpha1.MCAD { - return func(g gomega.Gomega) *codeflarev1alpha1.MCAD { - mcad, err := t.Client().CodeFlare().CodeflareV1alpha1().MCADs(namespace.Name).Get(t.Ctx(), name, metav1.GetOptions{}) - g.Expect(err).NotTo(gomega.HaveOccurred()) - return mcad - } -} - -func ReadyStatus(mcad *codeflarev1alpha1.MCAD) bool { - return mcad.Status.Ready -} diff --git a/test/support/core.go b/test/support/core.go index 21c3537e..10fbd4b2 100644 --- a/test/support/core.go +++ b/test/support/core.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support import ( diff --git a/test/support/gomega.go b/test/support/gomega.go index 27bb7223..5631044a 100644 --- a/test/support/gomega.go +++ b/test/support/gomega.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support import ( diff --git a/test/support/mcad.go b/test/support/mcad.go index 0930fc38..4f268985 100644 --- a/test/support/mcad.go +++ b/test/support/mcad.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support import ( diff --git a/test/support/namespace.go b/test/support/namespace.go index e9f57399..145acbb4 100644 --- a/test/support/namespace.go +++ b/test/support/namespace.go @@ -1,12 +1,11 @@ /* -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. See the NOTICE file distributed with -this work for additional information regarding copyright ownership. -The ASF licenses this file to You under the Apache License, Version 2.0 -(the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/test/support/ray.go b/test/support/ray.go index 3e1389ed..369db80b 100644 --- a/test/support/ray.go +++ b/test/support/ray.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support import ( diff --git a/test/support/support.go b/test/support/support.go index 43a4951a..184b0186 100644 --- a/test/support/support.go +++ b/test/support/support.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support import ( diff --git a/test/support/test.go b/test/support/test.go index 5fcb6f6f..5b3b271b 100644 --- a/test/support/test.go +++ b/test/support/test.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support import ( diff --git a/test/support/utils.go b/test/support/utils.go index 9ddf287d..ed40309f 100644 --- a/test/support/utils.go +++ b/test/support/utils.go @@ -1,3 +1,19 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package support func Ptr[T any](v T) *T { From 48a60849615de649312906a7594b7c9c5b51daca Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 22 Jun 2023 09:50:47 +0200 Subject: [PATCH 09/34] test: Add MNIST training with MCAD Job --- test/e2e/mnist_pytorch_mcad_job_test.go | 182 ++++++++++++++++++ test/e2e/mnist_rayjob_mcad_raycluster_test.go | 2 +- test/support/batch.go | 33 ++++ test/support/conditions.go | 53 +++++ 4 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 test/e2e/mnist_pytorch_mcad_job_test.go create mode 100644 test/support/batch.go create mode 100644 test/support/conditions.go diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go new file mode 100644 index 00000000..242bef05 --- /dev/null +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -0,0 +1,182 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "testing" + + . "github.com/onsi/gomega" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" +) + +func TestMNISTPyTorchMCAD(t *testing.T) { + test := With(t) + test.T().Parallel() + + // Create a namespace + namespace := test.NewTestNamespace() + + // MNIST training script + mnist, err := scripts.ReadFile("mnist.py") + test.Expect(err).NotTo(HaveOccurred()) + + mnistScript := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + "mnist.py": mnist, + }, + Immutable: Ptr(true), + } + mnistScript, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnistScript, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + // pip requirements + requirements := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "requirements", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + "requirements.txt": []byte(` +pytorch_lightning==1.5.10 +torchmetrics==0.9.1 +torchvision==0.12.0 +`), + }, + Immutable: Ptr(true), + } + requirements, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), requirements, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + // Batch Job + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "job", + Image: "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime", + Command: []string{"/bin/sh", "-c", "pip install -r /test/runtime/requirements.txt && torchrun /test/job/mnist.py"}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "mnist", + MountPath: "/test/job", + }, + { + Name: "requirements", + MountPath: "/test/runtime", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "mnist", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: mnistScript.Name, + }, + }, + }, + }, + { + Name: "requirements", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: requirements.Name, + }, + }, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + } + + // Create an AppWrapper resource + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + DesiredAvailable: 1, + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + GenericTemplate: Raw(test, job), + }, + }, + }, + }, + } + + _, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) + test.Expect(err).NotTo(HaveOccurred()) + + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + test.Eventually(Job(test, namespace, job.Name), TestTimeoutLong). + Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) +} diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 0fe82043..0af97f5c 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -44,7 +44,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { configMap := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.GroupName, + APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", }, ObjectMeta: metav1.ObjectMeta{ diff --git a/test/support/batch.go b/test/support/batch.go new file mode 100644 index 00000000..4aea06fd --- /dev/null +++ b/test/support/batch.go @@ -0,0 +1,33 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package support + +import ( + "github.com/onsi/gomega" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func Job(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gomega) *batchv1.Job { + return func(g gomega.Gomega) *batchv1.Job { + job, err := t.Client().Core().BatchV1().Jobs(namespace.Name).Get(t.Ctx(), name, metav1.GetOptions{}) + g.Expect(err).NotTo(gomega.HaveOccurred()) + return job + } +} diff --git a/test/support/conditions.go b/test/support/conditions.go new file mode 100644 index 00000000..e7c5097a --- /dev/null +++ b/test/support/conditions.go @@ -0,0 +1,53 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package support + +import ( + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" +) + +type conditionType interface { + ~string +} + +func ConditionStatus[T conditionType](conditionType T) func(any) corev1.ConditionStatus { + return func(object any) corev1.ConditionStatus { + switch o := object.(type) { + + case *batchv1.Job: + if c := getJobCondition(o.Status.Conditions, batchv1.JobConditionType(conditionType)); c != nil { + return c.Status + } + + } + + return corev1.ConditionUnknown + } +} + +// TODO: to be replaced with a generic version once common struct fields of a type set can be used. +// See https://github.com/golang/go/issues/48522 +func getJobCondition(conditions []batchv1.JobCondition, conditionType batchv1.JobConditionType) *batchv1.JobCondition { + for _, c := range conditions { + if c.Type == conditionType { + return &c + } + } + return nil +} From 849a4158949b22954ebb53a4693db4309071548d Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 22 Jun 2023 10:54:53 +0200 Subject: [PATCH 10/34] test: Print MNIST batch job logs --- test/e2e/mnist_pytorch_mcad_job_test.go | 13 +++++++++++++ test/support/batch.go | 5 +++++ test/support/core.go | 26 +++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index 242bef05..d305d9a6 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -25,6 +25,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" . "github.com/project-codeflare/codeflare-operator/test/support" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" @@ -179,4 +180,16 @@ torchvision==0.12.0 test.Eventually(Job(test, namespace, job.Name), TestTimeoutLong). Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + + // Refresh the job to get the generated pod selector + job = GetJob(test, namespace, job.Name) + + // Get the job Pod + pods := GetPods(test, namespace, metav1.ListOptions{ + LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, + ) + test.Expect(pods).To(HaveLen(1)) + + // Print the job logs + test.T().Log(GetPodLogs(test, &pods[0], corev1.PodLogOptions{})) } diff --git a/test/support/batch.go b/test/support/batch.go index 4aea06fd..c868d738 100644 --- a/test/support/batch.go +++ b/test/support/batch.go @@ -31,3 +31,8 @@ func Job(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gomega) return job } } + +func GetJob(t Test, namespace *corev1.Namespace, name string) *batchv1.Job { + t.T().Helper() + return Job(t, namespace, name)(t) +} diff --git a/test/support/core.go b/test/support/core.go index 10fbd4b2..9da6d0b3 100644 --- a/test/support/core.go +++ b/test/support/core.go @@ -18,8 +18,12 @@ package support import ( "encoding/json" + "io" "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) @@ -31,3 +35,25 @@ func Raw(t Test, obj runtime.Object) runtime.RawExtension { Raw: data, } } + +func GetPods(t Test, namespace *corev1.Namespace, options metav1.ListOptions) []corev1.Pod { + t.T().Helper() + pods, err := t.Client().Core().CoreV1().Pods(namespace.Name).List(t.Ctx(), options) + t.Expect(err).NotTo(gomega.HaveOccurred()) + return pods.Items +} + +func GetPodLogs(t Test, pod *corev1.Pod, options corev1.PodLogOptions) string { + t.T().Helper() + stream, err := t.Client().Core().CoreV1().Pods(pod.GetNamespace()).GetLogs(pod.GetName(), &options).Stream(t.Ctx()) + t.Expect(err).NotTo(gomega.HaveOccurred()) + + defer func() { + t.Expect(stream.Close()).To(gomega.Succeed()) + }() + + bytes, err := io.ReadAll(stream) + t.Expect(err).NotTo(gomega.HaveOccurred()) + + return string(bytes) +} From e356b4b04df243fc810ac80dec0473524d008ff8 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 22 Jun 2023 11:07:21 +0200 Subject: [PATCH 11/34] test: Use RayCluster 'complete' configuration --- test/e2e/mnist_rayjob_mcad_raycluster_test.go | 45 +++++-------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 0af97f5c..8e8f97f8 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -38,7 +38,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { // Create a namespace namespace := test.NewTestNamespace() - // Job script + // MNIST training script mnist, err := scripts.ReadFile("mnist.py") test.Expect(err).NotTo(HaveOccurred()) @@ -66,39 +66,21 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { Kind: "RayCluster", }, ObjectMeta: metav1.ObjectMeta{ - Name: "raycluster-autoscaler", + Name: "raycluster", Namespace: namespace.Name, }, Spec: rayv1alpha1.RayClusterSpec{ - RayVersion: "2.0.0", - EnableInTreeAutoscaling: Ptr(true), - AutoscalerOptions: &rayv1alpha1.AutoscalerOptions{ - UpscalingMode: Ptr[rayv1alpha1.UpscalingMode]("Default"), - IdleTimeoutSeconds: Ptr(int32(60)), - ImagePullPolicy: Ptr(corev1.PullAlways), - Resources: &corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("250m"), - corev1.ResourceMemory: resource.MustParse("512Mi"), - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("250m"), - corev1.ResourceMemory: resource.MustParse("512Mi"), - }, - }, - }, + RayVersion: "2.0.0", HeadGroupSpec: rayv1alpha1.HeadGroupSpec{ RayStartParams: map[string]string{ "dashboard-host": "0.0.0.0", - "block": "true", }, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "ray-head", - Image: "rayproject/ray:2.0.0", - ImagePullPolicy: corev1.PullAlways, + Name: "ray-head", + Image: "rayproject/ray:2.0.0", Ports: []corev1.ContainerPort{ { ContainerPort: 6379, @@ -155,13 +137,11 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { }, WorkerGroupSpecs: []rayv1alpha1.WorkerGroupSpec{ { - Replicas: Ptr(int32(1)), - MinReplicas: Ptr(int32(1)), - MaxReplicas: Ptr(int32(3)), - GroupName: "small-group", - RayStartParams: map[string]string{ - "block": "true", - }, + Replicas: Ptr(int32(1)), + MinReplicas: Ptr(int32(1)), + MaxReplicas: Ptr(int32(2)), + GroupName: "small-group", + RayStartParams: map[string]string{}, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ InitContainers: []corev1.Container{ @@ -173,9 +153,8 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { }, Containers: []corev1.Container{ { - Name: "machine-learning", - Image: "rayproject/ray:2.0.0", - ImagePullPolicy: corev1.PullAlways, + Name: "ray-worker", + Image: "rayproject/ray:2.0.0", Lifecycle: &corev1.Lifecycle{ PreStop: &corev1.LifecycleHandler{ Exec: &corev1.ExecAction{ From 9d1ad868a82c67a89e671ddd7a63b8c7747674bb Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 22 Jun 2023 11:17:25 +0200 Subject: [PATCH 12/34] test: Add step log statements --- test/e2e/mnist_pytorch_mcad_job_test.go | 6 ++++++ test/e2e/mnist_rayjob_mcad_raycluster_test.go | 14 ++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index d305d9a6..fcf5da43 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -58,6 +58,7 @@ func TestMNISTPyTorchMCAD(t *testing.T) { } mnistScript, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnistScript, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", mnistScript.Namespace, mnistScript.Name) // pip requirements requirements := &corev1.ConfigMap{ @@ -80,6 +81,7 @@ torchvision==0.12.0 } requirements, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), requirements, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", requirements.Namespace, requirements.Name) // Batch Job job := &batchv1.Job{ @@ -174,10 +176,13 @@ torchvision==0.12.0 _, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created MCAD %s/%s successfully", aw.Namespace, aw.Name) + test.T().Logf("Waiting for MCAD %s/%s to be running", aw.Namespace, aw.Name) test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + test.T().Logf("Waiting for Job %s/%s to complete successfully", job.Namespace, job.Name) test.Eventually(Job(test, namespace, job.Name), TestTimeoutLong). Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) @@ -191,5 +196,6 @@ torchvision==0.12.0 test.Expect(pods).To(HaveLen(1)) // Print the job logs + test.T().Logf("Printing Job %s/%s logs", job.Namespace, job.Name) test.T().Log(GetPodLogs(test, &pods[0], corev1.PodLogOptions{})) } diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 8e8f97f8..6bdba998 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -42,7 +42,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { mnist, err := scripts.ReadFile("mnist.py") test.Expect(err).NotTo(HaveOccurred()) - configMap := &corev1.ConfigMap{ + mnistScript := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", @@ -56,8 +56,9 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { }, Immutable: Ptr(true), } - configMap, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), configMap, metav1.CreateOptions{}) + mnistScript, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnistScript, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", mnistScript.Namespace, mnistScript.Name) // RayCluster rayCluster := &rayv1alpha1.RayCluster{ @@ -126,7 +127,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ LocalObjectReference: corev1.LocalObjectReference{ - Name: configMap.Name, + Name: mnistScript.Name, }, }, }, @@ -212,9 +213,11 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { }, } - _, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) + aw, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created MCAD %s/%s successfully", aw.Namespace, aw.Name) + test.T().Logf("Waiting for MCAD %s/%s to be running", aw.Namespace, aw.Name) test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) @@ -249,12 +252,15 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { } rayJob, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + test.T().Logf("Waiting for RayJob %s/%s to complete successfully", rayJob.Namespace, rayJob.Name) test.Eventually(RayJob(test, namespace, rayJob.Name), TestTimeoutLong). Should(WithTransform(RayJobStatus, Equal(rayv1alpha1.JobStatusSucceeded))) rayJob, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Get(test.Ctx(), rayJob.Name, metav1.GetOptions{}) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Printing RayJob %s/%s logs", rayJob.Namespace, rayJob.Name) test.T().Log(GetRayJobLogs(test, rayJob)) } From b143732408aeb75d7a0d26f9a304900fb0e9bfa9 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 22 Jun 2023 11:57:34 +0200 Subject: [PATCH 13/34] test: Add defered troubleshooting logs --- test/e2e/mnist_pytorch_mcad_job_test.go | 30 ++++++++++++++++++++++--- test/support/batch.go | 7 +++--- test/support/core.go | 4 ++-- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index fcf5da43..c4d6caf9 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -182,15 +182,17 @@ torchvision==0.12.0 test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + defer troubleshooting(test, job) + test.T().Logf("Waiting for Job %s/%s to complete successfully", job.Namespace, job.Name) - test.Eventually(Job(test, namespace, job.Name), TestTimeoutLong). + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong). Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) // Refresh the job to get the generated pod selector - job = GetJob(test, namespace, job.Name) + job = GetJob(test, job.Namespace, job.Name) // Get the job Pod - pods := GetPods(test, namespace, metav1.ListOptions{ + pods := GetPods(test, job.Namespace, metav1.ListOptions{ LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, ) test.Expect(pods).To(HaveLen(1)) @@ -199,3 +201,25 @@ torchvision==0.12.0 test.T().Logf("Printing Job %s/%s logs", job.Namespace, job.Name) test.T().Log(GetPodLogs(test, &pods[0], corev1.PodLogOptions{})) } + +func troubleshooting(test Test, job *batchv1.Job) { + if !test.T().Failed() { + return + } + job = GetJob(test, job.Namespace, job.Name) + + test.T().Errorf("Job %s/%s hasn't completed in time: %s", job.Namespace, job.Name, job) + + pods := GetPods(test, job.Namespace, metav1.ListOptions{ + LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, + ) + + if len(pods) == 0 { + test.T().Errorf("Job %s/%s has no pods scheduled", job.Namespace, job.Name) + } else { + for i, pod := range pods { + test.T().Logf("Printing Pod %s/%s logs", pod.Namespace, pod.Name) + test.T().Log(GetPodLogs(test, &pods[i], corev1.PodLogOptions{})) + } + } +} diff --git a/test/support/batch.go b/test/support/batch.go index c868d738..2cb2d543 100644 --- a/test/support/batch.go +++ b/test/support/batch.go @@ -20,19 +20,18 @@ import ( "github.com/onsi/gomega" batchv1 "k8s.io/api/batch/v1" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func Job(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gomega) *batchv1.Job { +func Job(t Test, namespace, name string) func(g gomega.Gomega) *batchv1.Job { return func(g gomega.Gomega) *batchv1.Job { - job, err := t.Client().Core().BatchV1().Jobs(namespace.Name).Get(t.Ctx(), name, metav1.GetOptions{}) + job, err := t.Client().Core().BatchV1().Jobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) g.Expect(err).NotTo(gomega.HaveOccurred()) return job } } -func GetJob(t Test, namespace *corev1.Namespace, name string) *batchv1.Job { +func GetJob(t Test, namespace, name string) *batchv1.Job { t.T().Helper() return Job(t, namespace, name)(t) } diff --git a/test/support/core.go b/test/support/core.go index 9da6d0b3..273c049a 100644 --- a/test/support/core.go +++ b/test/support/core.go @@ -36,9 +36,9 @@ func Raw(t Test, obj runtime.Object) runtime.RawExtension { } } -func GetPods(t Test, namespace *corev1.Namespace, options metav1.ListOptions) []corev1.Pod { +func GetPods(t Test, namespace string, options metav1.ListOptions) []corev1.Pod { t.T().Helper() - pods, err := t.Client().Core().CoreV1().Pods(namespace.Name).List(t.Ctx(), options) + pods, err := t.Client().Core().CoreV1().Pods(namespace).List(t.Ctx(), options) t.Expect(err).NotTo(gomega.HaveOccurred()) return pods.Items } From e95e08a0a49f0c7ec1ac9138f0ce4878c852bb31 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 22 Jun 2023 16:07:04 +0200 Subject: [PATCH 14/34] test: Add MNIST training in RayCluster with CodeFlare SDK --- test/e2e/mnist_pytorch_mcad_job_test.go | 37 +---- test/e2e/mnist_raycluster_sdk_test.go | 145 ++++++++++++++++++ test/e2e/mnist_rayjob_mcad_raycluster_test.go | 13 +- test/e2e/requirements.txt | 1 + test/e2e/sdk.py | 39 +++++ test/e2e/support.go | 18 ++- test/support/batch.go | 24 +++ 7 files changed, 235 insertions(+), 42 deletions(-) create mode 100644 test/e2e/mnist_raycluster_sdk_test.go create mode 100644 test/e2e/requirements.txt create mode 100644 test/e2e/sdk.py diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index c4d6caf9..a704d089 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -39,10 +39,7 @@ func TestMNISTPyTorchMCAD(t *testing.T) { namespace := test.NewTestNamespace() // MNIST training script - mnist, err := scripts.ReadFile("mnist.py") - test.Expect(err).NotTo(HaveOccurred()) - - mnistScript := &corev1.ConfigMap{ + mnist := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", @@ -52,13 +49,13 @@ func TestMNISTPyTorchMCAD(t *testing.T) { Namespace: namespace.Name, }, BinaryData: map[string][]byte{ - "mnist.py": mnist, + "mnist.py": ReadFile(test, "mnist.py"), }, Immutable: Ptr(true), } - mnistScript, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnistScript, metav1.CreateOptions{}) + mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", mnistScript.Namespace, mnistScript.Name) + test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name) // pip requirements requirements := &corev1.ConfigMap{ @@ -121,7 +118,7 @@ torchvision==0.12.0 VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ LocalObjectReference: corev1.LocalObjectReference{ - Name: mnistScript.Name, + Name: mnist.Name, }, }, }, @@ -182,7 +179,7 @@ torchvision==0.12.0 test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) - defer troubleshooting(test, job) + defer JobTroubleshooting(test, job) test.T().Logf("Waiting for Job %s/%s to complete successfully", job.Namespace, job.Name) test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong). @@ -201,25 +198,3 @@ torchvision==0.12.0 test.T().Logf("Printing Job %s/%s logs", job.Namespace, job.Name) test.T().Log(GetPodLogs(test, &pods[0], corev1.PodLogOptions{})) } - -func troubleshooting(test Test, job *batchv1.Job) { - if !test.T().Failed() { - return - } - job = GetJob(test, job.Namespace, job.Name) - - test.T().Errorf("Job %s/%s hasn't completed in time: %s", job.Namespace, job.Name, job) - - pods := GetPods(test, job.Namespace, metav1.ListOptions{ - LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, - ) - - if len(pods) == 0 { - test.T().Errorf("Job %s/%s has no pods scheduled", job.Namespace, job.Name) - } else { - for i, pod := range pods { - test.T().Logf("Printing Pod %s/%s logs", pod.Namespace, pod.Name) - test.T().Log(GetPodLogs(test, &pods[i], corev1.PodLogOptions{})) - } - } -} diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go new file mode 100644 index 00000000..7283e463 --- /dev/null +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -0,0 +1,145 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "testing" + + . "github.com/onsi/gomega" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/support" +) + +func TestMNISTRayClusterSDK(t *testing.T) { + test := With(t) + test.T().Parallel() + + test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/146") + + // Create a namespace + namespace := test.NewTestNamespace() + + // SDK script + sdk := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + "sdk.py": ReadFile(test, "sdk.py"), + }, + Immutable: Ptr(true), + } + sdk, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), sdk, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", sdk.Namespace, sdk.Name) + + // pip requirements + requirements := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "requirements", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + "requirements.txt": ReadFile(test, "requirements.txt"), + }, + Immutable: Ptr(true), + } + requirements, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), requirements, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", requirements.Namespace, requirements.Name) + + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + BackoffLimit: Ptr(int32(0)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "sdk", + Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", + Command: []string{"/bin/sh", "-c", "pip install -r /test/runtime/requirements.txt && python /test/job/sdk.py"}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "sdk", + MountPath: "/test/job", + }, + { + Name: "requirements", + MountPath: "/test/runtime", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "sdk", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: sdk.Name, + }, + }, + }, + }, + { + Name: "requirements", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: requirements.Name, + }, + }, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + } + job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + defer JobTroubleshooting(test, job) + + test.T().Logf("Waiting for Job %s/%s to complete successfully", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutMedium). + Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) +} diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 6bdba998..d7a6d18d 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -39,10 +39,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { namespace := test.NewTestNamespace() // MNIST training script - mnist, err := scripts.ReadFile("mnist.py") - test.Expect(err).NotTo(HaveOccurred()) - - mnistScript := &corev1.ConfigMap{ + mnist := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", @@ -52,13 +49,13 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { Namespace: namespace.Name, }, BinaryData: map[string][]byte{ - "mnist.py": mnist, + "mnist.py": ReadFile(test, "mnist.py"), }, Immutable: Ptr(true), } - mnistScript, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnistScript, metav1.CreateOptions{}) + mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", mnistScript.Namespace, mnistScript.Name) + test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name) // RayCluster rayCluster := &rayv1alpha1.RayCluster{ @@ -127,7 +124,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ LocalObjectReference: corev1.LocalObjectReference{ - Name: mnistScript.Name, + Name: mnist.Name, }, }, }, diff --git a/test/e2e/requirements.txt b/test/e2e/requirements.txt new file mode 100644 index 00000000..382b9855 --- /dev/null +++ b/test/e2e/requirements.txt @@ -0,0 +1 @@ +codeflare-sdk==0.4.4 diff --git a/test/e2e/sdk.py b/test/e2e/sdk.py new file mode 100644 index 00000000..578bc48c --- /dev/null +++ b/test/e2e/sdk.py @@ -0,0 +1,39 @@ +from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration +# from codeflare_sdk.cluster.auth import TokenAuthentication +from codeflare_sdk.job.jobs import DDPJobDefinition + +cluster = Cluster(ClusterConfiguration( + name='mnist', + # namespace='default', + min_worker=1, + max_worker=1, + min_cpus=0.2, + max_cpus=1, + min_memory=0.5, + max_memory=1, + gpu=0, + instascale=False, +)) + +cluster.up() + +cluster.status() + +cluster.wait_ready() + +cluster.status() + +cluster.details() + +jobdef = DDPJobDefinition( + name="mnist", + script="/test/job/mnist.py", + scheduler_args={"requirements": "/test/runtime/requirements.txt"} +) +job = jobdef.submit(cluster) + +job.status() + +print(job.logs()) + +cluster.down() diff --git a/test/e2e/support.go b/test/e2e/support.go index 82f980eb..7847c075 100644 --- a/test/e2e/support.go +++ b/test/e2e/support.go @@ -16,7 +16,19 @@ limitations under the License. package e2e -import "embed" +import ( + "embed" -//go:embed *.py -var scripts embed.FS + "github.com/onsi/gomega" + + "github.com/project-codeflare/codeflare-operator/test/support" +) + +//go:embed *.py *.txt +var files embed.FS + +func ReadFile(t support.Test, fileName string) []byte { + file, err := files.ReadFile(fileName) + t.Expect(err).NotTo(gomega.HaveOccurred()) + return file +} diff --git a/test/support/batch.go b/test/support/batch.go index 2cb2d543..1bf6874e 100644 --- a/test/support/batch.go +++ b/test/support/batch.go @@ -20,7 +20,9 @@ import ( "github.com/onsi/gomega" batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" ) func Job(t Test, namespace, name string) func(g gomega.Gomega) *batchv1.Job { @@ -35,3 +37,25 @@ func GetJob(t Test, namespace, name string) *batchv1.Job { t.T().Helper() return Job(t, namespace, name)(t) } + +func JobTroubleshooting(test Test, job *batchv1.Job) { + if !test.T().Failed() { + return + } + job = GetJob(test, job.Namespace, job.Name) + + test.T().Errorf("Job %s/%s hasn't completed in time: %s", job.Namespace, job.Name, job) + + pods := GetPods(test, job.Namespace, metav1.ListOptions{ + LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, + ) + + if len(pods) == 0 { + test.T().Errorf("Job %s/%s has no pods scheduled", job.Namespace, job.Name) + } else { + for i, pod := range pods { + test.T().Logf("Printing Pod %s/%s logs", pod.Namespace, pod.Name) + test.T().Log(GetPodLogs(test, &pods[i], corev1.PodLogOptions{})) + } + } +} From 49563ef9bb6d8a0a69604e5e3131e409ef0623c7 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 22 Jun 2023 17:25:58 +0200 Subject: [PATCH 15/34] test: Customize test timeouts --- .github/workflows/e2e_tests.yaml | 4 ++++ test/support/gomega.go | 13 ---------- test/support/support.go | 41 +++++++++++++++++++++++++++++--- 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 1f1a80e1..1f8263d5 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -165,6 +165,10 @@ jobs: - name: Run e2e tests run: | + export CODEFLARE_TEST_TIMEOUT_SHORT=1m + export CODEFLARE_TEST_TIMEOUT_MEDIUM=3m + export CODEFLARE_TEST_TIMEOUT_LONG=8m + make test-e2e - name: Print CodeFlare operator logs diff --git a/test/support/gomega.go b/test/support/gomega.go index 5631044a..74897502 100644 --- a/test/support/gomega.go +++ b/test/support/gomega.go @@ -17,24 +17,11 @@ limitations under the License. package support import ( - "time" - "github.com/onsi/gomega" - "github.com/onsi/gomega/format" "github.com/onsi/gomega/gstruct" "github.com/onsi/gomega/types" ) -func init() { - // Gomega settings - gomega.SetDefaultEventuallyTimeout(TestTimeoutShort) - gomega.SetDefaultEventuallyPollingInterval(1 * time.Second) - gomega.SetDefaultConsistentlyDuration(30 * time.Second) - gomega.SetDefaultConsistentlyPollingInterval(1 * time.Second) - // Disable object truncation on test results - format.MaxLength = 0 -} - func EqualP(expected interface{}) types.GomegaMatcher { return gstruct.PointTo(gomega.Equal(expected)) } diff --git a/test/support/support.go b/test/support/support.go index 184b0186..782e65e7 100644 --- a/test/support/support.go +++ b/test/support/support.go @@ -17,17 +17,52 @@ limitations under the License. package support import ( + "fmt" + "os" "time" + "github.com/onsi/gomega" + "github.com/onsi/gomega/format" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -const ( +var ( TestTimeoutShort = 1 * time.Minute TestTimeoutMedium = 2 * time.Minute TestTimeoutLong = 5 * time.Minute -) -var ( ApplyOptions = metav1.ApplyOptions{FieldManager: "codeflare-test", Force: true} ) + +func init() { + if value, ok := os.LookupEnv("CODEFLARE_TEST_TIMEOUT_SHORT"); ok { + if duration, err := time.ParseDuration(value); err == nil { + TestTimeoutShort = duration + } else { + fmt.Printf("Error parsing CODEFLARE_TEST_TIMEOUT_SHORT. Using default value: %s", TestTimeoutShort) + } + } + if value, ok := os.LookupEnv("CODEFLARE_TEST_TIMEOUT_MEDIUM"); ok { + if duration, err := time.ParseDuration(value); err == nil { + TestTimeoutMedium = duration + } else { + fmt.Printf("Error parsing CODEFLARE_TEST_TIMEOUT_MEDIUM. Using default value: %s", TestTimeoutMedium) + } + } + if value, ok := os.LookupEnv("CODEFLARE_TEST_TIMEOUT_LONG"); ok { + if duration, err := time.ParseDuration(value); err == nil { + TestTimeoutLong = duration + } else { + fmt.Printf("Error parsing CODEFLARE_TEST_TIMEOUT_LONG. Using default value: %s", TestTimeoutLong) + } + } + + // Gomega settings + gomega.SetDefaultEventuallyTimeout(TestTimeoutShort) + gomega.SetDefaultEventuallyPollingInterval(1 * time.Second) + gomega.SetDefaultConsistentlyDuration(30 * time.Second) + gomega.SetDefaultConsistentlyPollingInterval(1 * time.Second) + // Disable object truncation on test results + format.MaxLength = 0 +} From 9a96e455c507fa989130e4d2d763c5dd21540c1d Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Mon, 26 Jun 2023 17:45:11 +0200 Subject: [PATCH 16/34] test: Pass MNIST training with CodeFlare SDK on OpenShift --- test/e2e/mnist.py | 16 ++- test/e2e/mnist_raycluster_sdk.py | 65 +++++++++++ test/e2e/mnist_raycluster_sdk_test.go | 150 +++++++++++++++++++------- test/e2e/requirements.txt | 4 +- test/e2e/sdk.py | 39 ------- test/support/openshift.go | 33 ++++++ 6 files changed, 225 insertions(+), 82 deletions(-) create mode 100644 test/e2e/mnist_raycluster_sdk.py delete mode 100644 test/e2e/sdk.py create mode 100644 test/support/openshift.go diff --git a/test/e2e/mnist.py b/test/e2e/mnist.py index e60ec7c6..244c84d2 100644 --- a/test/e2e/mnist.py +++ b/test/e2e/mnist.py @@ -1,10 +1,22 @@ -# In[] +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import torch from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks.progress import TQDMProgressBar -from pytorch_lightning.loggers import CSVLogger from torch import nn from torch.nn import functional as F from torch.utils.data import DataLoader, random_split diff --git a/test/e2e/mnist_raycluster_sdk.py b/test/e2e/mnist_raycluster_sdk.py new file mode 100644 index 00000000..444b1858 --- /dev/null +++ b/test/e2e/mnist_raycluster_sdk.py @@ -0,0 +1,65 @@ +import sys + +from time import sleep + +from torchx.specs.api import AppState, is_terminal + +from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration +from codeflare_sdk.job.jobs import DDPJobDefinition + +namespace = sys.argv[1] + +cluster = Cluster(ClusterConfiguration( + name='mnist', + namespace=namespace, + min_worker=1, + max_worker=1, + min_cpus='500m', + max_cpus=1, + min_memory=0.5, + max_memory=1, + gpu=0, + instascale=False, +)) + +cluster.up() + +cluster.status() + +cluster.wait_ready() + +cluster.status() + +cluster.details() + +jobdef = DDPJobDefinition( + name="mnist", + script="mnist.py", + scheduler_args={"requirements": "requirements.txt"}, +) +job = jobdef.submit(cluster) + +done = False +time = 0 +timeout = 300 +while not done: + status = job.status() + if is_terminal(status.state): + break + if not done: + print(status) + if timeout and time >= timeout: + raise TimeoutError(f"job has timed out after waiting {timeout}s") + sleep(5) + time += 5 + +print(f"Job has completed: {status.state}") + +print(job.logs()) + +cluster.down() + +if not status.state == AppState.SUCCEEDED: + exit(1) +else: + exit(0) diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index 7283e463..43854e79 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -23,57 +23,119 @@ import ( batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" ) func TestMNISTRayClusterSDK(t *testing.T) { test := With(t) test.T().Parallel() - test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/146") + if !IsOpenShift(test) { + test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/146") + } // Create a namespace namespace := test.NewTestNamespace() - // SDK script - sdk := &corev1.ConfigMap{ + // Test configuration + configMap := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", }, ObjectMeta: metav1.ObjectMeta{ - Name: "sdk", + Name: "mnist-raycluster-sdk", Namespace: namespace.Name, }, BinaryData: map[string][]byte{ - "sdk.py": ReadFile(test, "sdk.py"), + // SDK script + "mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"), + // pip requirements + "requirements.txt": ReadFile(test, "requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), }, Immutable: Ptr(true), } - sdk, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), sdk, metav1.CreateOptions{}) + configMap, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), configMap, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", sdk.Namespace, sdk.Name) + test.T().Logf("Created ConfigMap %s/%s successfully", configMap.Namespace, configMap.Name) - // pip requirements - requirements := &corev1.ConfigMap{ + // SDK client RBAC + serviceAccount := &corev1.ServiceAccount{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ConfigMap", + Kind: "ServiceAccount", }, ObjectMeta: metav1.ObjectMeta{ - Name: "requirements", + Name: "sdk-user", Namespace: namespace.Name, }, - BinaryData: map[string][]byte{ - "requirements.txt": ReadFile(test, "requirements.txt"), + } + serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + role := &rbacv1.Role{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rbacv1.SchemeGroupVersion.String(), + Kind: "Role", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + Rules: []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, + APIGroups: []string{mcadv1beta1.GroupName}, + Resources: []string{"appwrappers"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{rayv1alpha1.GroupVersion.Group}, + Resources: []string{"rayclusters", "rayclusters/status"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"route.openshift.io"}, + Resources: []string{"routes"}, + }, }, - Immutable: Ptr(true), } - requirements, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), requirements, metav1.CreateOptions{}) + role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + roleBinding := &rbacv1.RoleBinding{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rbacv1.SchemeGroupVersion.String(), + Kind: "RoleBinding", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.SchemeGroupVersion.Group, + Kind: "Role", + Name: role.Name, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + APIGroup: corev1.SchemeGroupVersion.Group, + Name: serviceAccount.Name, + Namespace: serviceAccount.Namespace, + }, + }, + } + _, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", requirements.Namespace, requirements.Name) job := &batchv1.Job{ TypeMeta: metav1.TypeMeta{ @@ -92,54 +154,62 @@ func TestMNISTRayClusterSDK(t *testing.T) { Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "sdk", + Name: "test", Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", - Command: []string{"/bin/sh", "-c", "pip install -r /test/runtime/requirements.txt && python /test/job/sdk.py"}, + Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==0.4.4 && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name}, VolumeMounts: []corev1.VolumeMount{ { - Name: "sdk", - MountPath: "/test/job", - }, - { - Name: "requirements", - MountPath: "/test/runtime", + Name: "test", + MountPath: "/test", }, }, }, }, Volumes: []corev1.Volume{ { - Name: "sdk", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: sdk.Name, - }, - }, - }, - }, - { - Name: "requirements", + Name: "test", VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ LocalObjectReference: corev1.LocalObjectReference{ - Name: requirements.Name, + Name: configMap.Name, }, }, }, }, }, - RestartPolicy: corev1.RestartPolicyNever, + RestartPolicy: corev1.RestartPolicyNever, + ServiceAccountName: serviceAccount.Name, }, }, }, } job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) defer JobTroubleshooting(test, job) - test.T().Logf("Waiting for Job %s/%s to complete successfully", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutMedium). - Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Refresh the job to get the generated pod selector + job = GetJob(test, job.Namespace, job.Name) + + // Get the job Pod + pods := GetPods(test, job.Namespace, metav1.ListOptions{ + LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, + ) + test.Expect(pods).To(HaveLen(1)) + + // Print the job logs + test.T().Logf("Printing Job %s/%s logs", job.Namespace, job.Name) + test.T().Log(GetPodLogs(test, &pods[0], corev1.PodLogOptions{})) + + // Assert the job has completed successfully + test.T().Logf("Checking the Job %s/%s has completed successfully", job.Namespace, job.Name) + test.Expect(job).To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) } diff --git a/test/e2e/requirements.txt b/test/e2e/requirements.txt index 382b9855..87edeef2 100644 --- a/test/e2e/requirements.txt +++ b/test/e2e/requirements.txt @@ -1 +1,3 @@ -codeflare-sdk==0.4.4 +pytorch_lightning==1.5.10 +torchmetrics==0.9.1 +torchvision==0.12.0 diff --git a/test/e2e/sdk.py b/test/e2e/sdk.py deleted file mode 100644 index 578bc48c..00000000 --- a/test/e2e/sdk.py +++ /dev/null @@ -1,39 +0,0 @@ -from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration -# from codeflare_sdk.cluster.auth import TokenAuthentication -from codeflare_sdk.job.jobs import DDPJobDefinition - -cluster = Cluster(ClusterConfiguration( - name='mnist', - # namespace='default', - min_worker=1, - max_worker=1, - min_cpus=0.2, - max_cpus=1, - min_memory=0.5, - max_memory=1, - gpu=0, - instascale=False, -)) - -cluster.up() - -cluster.status() - -cluster.wait_ready() - -cluster.status() - -cluster.details() - -jobdef = DDPJobDefinition( - name="mnist", - script="/test/job/mnist.py", - scheduler_args={"requirements": "/test/runtime/requirements.txt"} -) -job = jobdef.submit(cluster) - -job.status() - -print(job.logs()) - -cluster.down() diff --git a/test/support/openshift.go b/test/support/openshift.go new file mode 100644 index 00000000..cfe3b5a3 --- /dev/null +++ b/test/support/openshift.go @@ -0,0 +1,33 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package support + +import ( + "github.com/onsi/gomega" + + "k8s.io/apimachinery/pkg/api/errors" +) + +func IsOpenShift(test Test) bool { + test.T().Helper() + _, err := test.Client().Core().Discovery().ServerResourcesForGroupVersion("image.openshift.io/v1") + if err != nil && errors.IsNotFound(err) { + return false + } + test.Expect(err).NotTo(gomega.HaveOccurred()) + return true +} From 12d106db9080a4a0ff44ede62ea38322871285da Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 10:43:58 +0200 Subject: [PATCH 17/34] test: Print Job logs after successfull or failed completion --- test/e2e/mnist_pytorch_mcad_job_test.go | 28 ++++++++++--------------- test/e2e/mnist_raycluster_sdk_test.go | 19 +++-------------- test/support/batch.go | 17 +++++++-------- 3 files changed, 21 insertions(+), 43 deletions(-) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index a704d089..778fe106 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -25,7 +25,6 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" . "github.com/project-codeflare/codeflare-operator/test/support" mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" @@ -179,22 +178,17 @@ torchvision==0.12.0 test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) - defer JobTroubleshooting(test, job) - - test.T().Logf("Waiting for Job %s/%s to complete successfully", job.Namespace, job.Name) - test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong). - Should(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) - - // Refresh the job to get the generated pod selector - job = GetJob(test, job.Namespace, job.Name) - - // Get the job Pod - pods := GetPods(test, job.Namespace, metav1.ListOptions{ - LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, - ) - test.Expect(pods).To(HaveLen(1)) + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) // Print the job logs - test.T().Logf("Printing Job %s/%s logs", job.Namespace, job.Name) - test.T().Log(GetPodLogs(test, &pods[0], corev1.PodLogOptions{})) + PrintJobLogs(test, job.Namespace, job.Name) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) } diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index 43854e79..3761c935 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -25,7 +25,6 @@ import ( corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" @@ -187,8 +186,6 @@ func TestMNISTRayClusterSDK(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) - defer JobTroubleshooting(test, job) - test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( Or( @@ -196,20 +193,10 @@ func TestMNISTRayClusterSDK(t *testing.T) { WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), )) - // Refresh the job to get the generated pod selector - job = GetJob(test, job.Namespace, job.Name) - - // Get the job Pod - pods := GetPods(test, job.Namespace, metav1.ListOptions{ - LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, - ) - test.Expect(pods).To(HaveLen(1)) - // Print the job logs - test.T().Logf("Printing Job %s/%s logs", job.Namespace, job.Name) - test.T().Log(GetPodLogs(test, &pods[0], corev1.PodLogOptions{})) + PrintJobLogs(test, job.Namespace, job.Name) // Assert the job has completed successfully - test.T().Logf("Checking the Job %s/%s has completed successfully", job.Namespace, job.Name) - test.Expect(job).To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) } diff --git a/test/support/batch.go b/test/support/batch.go index 1bf6874e..6349efa5 100644 --- a/test/support/batch.go +++ b/test/support/batch.go @@ -38,24 +38,21 @@ func GetJob(t Test, namespace, name string) *batchv1.Job { return Job(t, namespace, name)(t) } -func JobTroubleshooting(test Test, job *batchv1.Job) { - if !test.T().Failed() { - return - } - job = GetJob(test, job.Namespace, job.Name) +func PrintJobLogs(t Test, namespace, name string) { + t.T().Helper() - test.T().Errorf("Job %s/%s hasn't completed in time: %s", job.Namespace, job.Name, job) + job := GetJob(t, namespace, name) - pods := GetPods(test, job.Namespace, metav1.ListOptions{ + pods := GetPods(t, job.Namespace, metav1.ListOptions{ LabelSelector: labels.FormatLabels(job.Spec.Selector.MatchLabels)}, ) if len(pods) == 0 { - test.T().Errorf("Job %s/%s has no pods scheduled", job.Namespace, job.Name) + t.T().Errorf("Job %s/%s has no pods scheduled", job.Namespace, job.Name) } else { for i, pod := range pods { - test.T().Logf("Printing Pod %s/%s logs", pod.Namespace, pod.Name) - test.T().Log(GetPodLogs(test, &pods[i], corev1.PodLogOptions{})) + t.T().Logf("Printing Pod %s/%s logs", pod.Namespace, pod.Name) + t.T().Log(GetPodLogs(t, &pods[i], corev1.PodLogOptions{})) } } } From ecf16dc72a6d928d38ee9a05643faef6e09f6bac Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 10:54:09 +0200 Subject: [PATCH 18/34] test: Re-use pip requirements file --- ...rements.txt => mnist_pip_requirements.txt} | 0 test/e2e/mnist_pytorch_mcad_job_test.go | 60 ++++--------------- test/e2e/mnist_raycluster_sdk_test.go | 10 ++-- 3 files changed, 18 insertions(+), 52 deletions(-) rename test/e2e/{requirements.txt => mnist_pip_requirements.txt} (100%) diff --git a/test/e2e/requirements.txt b/test/e2e/mnist_pip_requirements.txt similarity index 100% rename from test/e2e/requirements.txt rename to test/e2e/mnist_pip_requirements.txt diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index 778fe106..f1664a53 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -37,47 +37,27 @@ func TestMNISTPyTorchMCAD(t *testing.T) { // Create a namespace namespace := test.NewTestNamespace() - // MNIST training script - mnist := &corev1.ConfigMap{ + // Test configuration + config := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", }, ObjectMeta: metav1.ObjectMeta{ - Name: "mnist", + Name: "mnist-mcad", Namespace: namespace.Name, }, BinaryData: map[string][]byte{ + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script "mnist.py": ReadFile(test, "mnist.py"), }, Immutable: Ptr(true), } - mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{}) - test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name) - - // pip requirements - requirements := &corev1.ConfigMap{ - TypeMeta: metav1.TypeMeta{ - APIVersion: corev1.SchemeGroupVersion.String(), - Kind: "ConfigMap", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "requirements", - Namespace: namespace.Name, - }, - BinaryData: map[string][]byte{ - "requirements.txt": []byte(` -pytorch_lightning==1.5.10 -torchmetrics==0.9.1 -torchvision==0.12.0 -`), - }, - Immutable: Ptr(true), - } - requirements, err = test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), requirements, metav1.CreateOptions{}) + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", requirements.Namespace, requirements.Name) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) // Batch Job job := &batchv1.Job{ @@ -98,36 +78,22 @@ torchvision==0.12.0 { Name: "job", Image: "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime", - Command: []string{"/bin/sh", "-c", "pip install -r /test/runtime/requirements.txt && torchrun /test/job/mnist.py"}, + Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, VolumeMounts: []corev1.VolumeMount{ { - Name: "mnist", - MountPath: "/test/job", - }, - { - Name: "requirements", - MountPath: "/test/runtime", + Name: "test", + MountPath: "/test", }, }, }, }, Volumes: []corev1.Volume{ { - Name: "mnist", - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - LocalObjectReference: corev1.LocalObjectReference{ - Name: mnist.Name, - }, - }, - }, - }, - { - Name: "requirements", + Name: "test", VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ LocalObjectReference: corev1.LocalObjectReference{ - Name: requirements.Name, + Name: config.Name, }, }, }, diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index 3761c935..82087155 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -44,7 +44,7 @@ func TestMNISTRayClusterSDK(t *testing.T) { namespace := test.NewTestNamespace() // Test configuration - configMap := &corev1.ConfigMap{ + config := &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ APIVersion: corev1.SchemeGroupVersion.String(), Kind: "ConfigMap", @@ -57,15 +57,15 @@ func TestMNISTRayClusterSDK(t *testing.T) { // SDK script "mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"), // pip requirements - "requirements.txt": ReadFile(test, "requirements.txt"), + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), // MNIST training script "mnist.py": ReadFile(test, "mnist.py"), }, Immutable: Ptr(true), } - configMap, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), configMap, metav1.CreateOptions{}) + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) test.Expect(err).NotTo(HaveOccurred()) - test.T().Logf("Created ConfigMap %s/%s successfully", configMap.Namespace, configMap.Name) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) // SDK client RBAC serviceAccount := &corev1.ServiceAccount{ @@ -170,7 +170,7 @@ func TestMNISTRayClusterSDK(t *testing.T) { VolumeSource: corev1.VolumeSource{ ConfigMap: &corev1.ConfigMapVolumeSource{ LocalObjectReference: corev1.LocalObjectReference{ - Name: configMap.Name, + Name: config.Name, }, }, }, From 210b1026521665d9aabb1183bef557fb7e96c3cf Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 15:20:26 +0200 Subject: [PATCH 19/34] test: Parameterize CodeFlare SDK version --- Makefile | 4 ++++ controllers/defaults.go | 5 +++-- test/e2e/mnist_raycluster_sdk_test.go | 2 +- test/support/codeflare.go | 30 +++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 test/support/codeflare.go diff --git a/Makefile b/Makefile index c9295f57..2185264e 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,9 @@ MCAD_CRD ?= ${MCAD_REPO}/config/crd?ref=${MCAD_REF} # KUBERAY_VERSION defines the default version of the KubeRay operator KUBERAY_VERSION ?= v0.5.0 +# CODEFLARE_SDK_VERSION defines the default version of the CodeFlare SDK +CODEFLARE_SDK_VERSION ?= 0.4.4 + # OPERATORS_REPO_ORG points to GitHub repository organization where bundle PR is opened against # OPERATORS_REPO_FORK_ORG points to GitHub repository fork organization where bundle build is pushed to OPERATORS_REPO_ORG ?= redhat-openshift-ecosystem @@ -132,6 +135,7 @@ defaults: @echo "const (" >> $(DEFAULTS_FILE) @echo " MCADImage = \"$(MCAD_IMAGE)\"" >> $(DEFAULTS_FILE) @echo " InstaScaleImage = \"$(INSTASCALE_IMAGE)\"" >> $(DEFAULTS_FILE) + @echo " CodeFlareSDKVersion = \"$(CODEFLARE_SDK_VERSION)\"" >> $(DEFAULTS_FILE) @echo "" >> $(DEFAULTS_FILE) @echo ")" >> $(DEFAULTS_FILE) @echo "" >> $(DEFAULTS_FILE) diff --git a/controllers/defaults.go b/controllers/defaults.go index d3cac3d0..497940a9 100644 --- a/controllers/defaults.go +++ b/controllers/defaults.go @@ -5,6 +5,7 @@ package controllers // *********************** const ( - MCADImage = "quay.io/project-codeflare/mcad-controller:release-v1.31.0" - InstaScaleImage = "quay.io/project-codeflare/instascale-controller:v0.0.4" + MCADImage = "quay.io/project-codeflare/mcad-controller:release-v1.31.0" + InstaScaleImage = "quay.io/project-codeflare/instascale-controller:v0.0.4" + CodeFlareSDKVersion = "0.4.4" ) diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index 82087155..3ba168a0 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -155,7 +155,7 @@ func TestMNISTRayClusterSDK(t *testing.T) { { Name: "test", Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", - Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==0.4.4 && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name}, + Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name}, VolumeMounts: []corev1.VolumeMount{ { Name: "test", diff --git a/test/support/codeflare.go b/test/support/codeflare.go new file mode 100644 index 00000000..69066c2a --- /dev/null +++ b/test/support/codeflare.go @@ -0,0 +1,30 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package support + +import ( + "os" + + "github.com/project-codeflare/codeflare-operator/controllers" +) + +func GetCodeFlareSDKVersion() string { + if value, ok := os.LookupEnv("CODEFLARE_SDK_VERSION"); ok { + return value + } + return controllers.CodeFlareSDKVersion +} From 08ed88369e8a24f7f850131e1e598c398f04829a Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 15:58:06 +0200 Subject: [PATCH 20/34] test: Remove ray_lightning from requirements --- test/e2e/mnist_rayjob_mcad_raycluster_test.go | 1 - test/support/ray.go | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index d7a6d18d..38a3b97c 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -233,7 +233,6 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { { "pip": [ "pytorch_lightning==1.5.10", - "ray_lightning", "torchmetrics==0.9.1", "torchvision==0.12.0" ], diff --git a/test/support/ray.go b/test/support/ray.go index 369db80b..9e0d22d9 100644 --- a/test/support/ray.go +++ b/test/support/ray.go @@ -45,7 +45,8 @@ func GetRayJobLogs(t Test, job *rayv1alpha1.RayJob) string { t.T().Helper() response := t.Client().Core().CoreV1().RESTClient(). Get(). - AbsPath("/api/v1/namespaces", job.Namespace, "services", "http:"+job.Status.RayClusterName+"-head-svc:dashboard", "proxy", "api", "jobs", job.Status.JobId, "logs").Do(t.Ctx()) + AbsPath("/api/v1/namespaces", job.Namespace, "services", "http:"+job.Status.RayClusterName+"-head-svc:dashboard", "proxy", "api", "jobs", job.Status.JobId, "logs"). + Do(t.Ctx()) t.Expect(response.Error()).NotTo(gomega.HaveOccurred()) body := map[string]string{} From 06a8659c661031954e8f46c1aff34a6a7141858f Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 16:35:59 +0200 Subject: [PATCH 21/34] test: Parameterize Ray image and version --- Makefile | 27 ++++++++++++++++--- controllers/defaults.go | 5 ++-- test/e2e/mnist_rayjob_mcad_raycluster_test.go | 6 ++--- test/support/codeflare.go | 26 +++++++++++++++--- test/support/defaults.go | 11 ++++++++ 5 files changed, 62 insertions(+), 13 deletions(-) create mode 100644 test/support/defaults.go diff --git a/Makefile b/Makefile index 2185264e..c3ce8665 100644 --- a/Makefile +++ b/Makefile @@ -21,9 +21,12 @@ MCAD_REF ?= release-${MCAD_VERSION} MCAD_REPO ?= github.com/project-codeflare/multi-cluster-app-dispatcher MCAD_CRD ?= ${MCAD_REPO}/config/crd?ref=${MCAD_REF} -# KUBERAY_VERSION defines the default version of the KubeRay operator +# KUBERAY_VERSION defines the default version of the KubeRay operator (used for testing) KUBERAY_VERSION ?= v0.5.0 +# RAY_VERSION defines the default version of Ray (used for testing) +RAY_VERSION ?= 2.5.0 + # CODEFLARE_SDK_VERSION defines the default version of the CodeFlare SDK CODEFLARE_SDK_VERSION ?= 0.4.4 @@ -67,6 +70,9 @@ MCAD_IMAGE ?= $(IMAGE_ORG_BASE)/mcad-controller:$(MCAD_REF) # INSTASCALE_IMAGE defines the default container image for the InstaScale controller INSTASCALE_IMAGE ?= $(IMAGE_ORG_BASE)/instascale-controller:$(INSTASCALE_VERSION) +# RAY_IMAGE defines the default container image for Ray (used for testing) +RAY_IMAGE ?= rayproject/ray:$(RAY_VERSION) + # BUNDLE_IMG defines the image:tag used for the bundle. # You can use it as an arg. (E.g make bundle-build BUNDLE_IMG=/:) BUNDLE_IMG ?= $(IMAGE_TAG_BASE)-bundle:$(VERSION) @@ -122,6 +128,7 @@ help: ## Display this help. ##@ Development DEFAULTS_FILE := controllers/defaults.go +DEFAULTS_TEST_FILE := test/support/defaults.go .PHONY: defaults defaults: @@ -135,12 +142,26 @@ defaults: @echo "const (" >> $(DEFAULTS_FILE) @echo " MCADImage = \"$(MCAD_IMAGE)\"" >> $(DEFAULTS_FILE) @echo " InstaScaleImage = \"$(INSTASCALE_IMAGE)\"" >> $(DEFAULTS_FILE) - @echo " CodeFlareSDKVersion = \"$(CODEFLARE_SDK_VERSION)\"" >> $(DEFAULTS_FILE) @echo "" >> $(DEFAULTS_FILE) @echo ")" >> $(DEFAULTS_FILE) @echo "" >> $(DEFAULTS_FILE) - gofmt -w $(DEFAULTS_FILE) + $(info Regenerating $(DEFAULTS_TEST_FILE)) + @echo "package support" > $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + @echo "// ***********************" >> $(DEFAULTS_TEST_FILE) + @echo "// DO NOT EDIT THIS FILE" >> $(DEFAULTS_TEST_FILE) + @echo "// ***********************" >> $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + @echo "const (" >> $(DEFAULTS_TEST_FILE) + @echo " CodeFlareSDKVersion = \"$(CODEFLARE_SDK_VERSION)\"" >> $(DEFAULTS_TEST_FILE) + @echo " RayVersion = \"$(RAY_VERSION)\"" >> $(DEFAULTS_TEST_FILE) + @echo " RayImage = \"$(RAY_IMAGE)\"" >> $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + @echo ")" >> $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + + gofmt -w $(DEFAULTS_FILE) $(DEFAULTS_TEST_FILE) .PHONY: manifests manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. diff --git a/controllers/defaults.go b/controllers/defaults.go index 497940a9..d3cac3d0 100644 --- a/controllers/defaults.go +++ b/controllers/defaults.go @@ -5,7 +5,6 @@ package controllers // *********************** const ( - MCADImage = "quay.io/project-codeflare/mcad-controller:release-v1.31.0" - InstaScaleImage = "quay.io/project-codeflare/instascale-controller:v0.0.4" - CodeFlareSDKVersion = "0.4.4" + MCADImage = "quay.io/project-codeflare/mcad-controller:release-v1.31.0" + InstaScaleImage = "quay.io/project-codeflare/instascale-controller:v0.0.4" ) diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 38a3b97c..33a9e6a5 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -68,7 +68,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { Namespace: namespace.Name, }, Spec: rayv1alpha1.RayClusterSpec{ - RayVersion: "2.0.0", + RayVersion: GetRayVersion(), HeadGroupSpec: rayv1alpha1.HeadGroupSpec{ RayStartParams: map[string]string{ "dashboard-host": "0.0.0.0", @@ -78,7 +78,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { Containers: []corev1.Container{ { Name: "ray-head", - Image: "rayproject/ray:2.0.0", + Image: GetRayImage(), Ports: []corev1.ContainerPort{ { ContainerPort: 6379, @@ -152,7 +152,7 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { Containers: []corev1.Container{ { Name: "ray-worker", - Image: "rayproject/ray:2.0.0", + Image: GetRayImage(), Lifecycle: &corev1.Lifecycle{ PreStop: &corev1.LifecycleHandler{ Exec: &corev1.ExecAction{ diff --git a/test/support/codeflare.go b/test/support/codeflare.go index 69066c2a..0763bbd3 100644 --- a/test/support/codeflare.go +++ b/test/support/codeflare.go @@ -18,13 +18,31 @@ package support import ( "os" +) - "github.com/project-codeflare/codeflare-operator/controllers" +// The environment variables hereafter can be used to change the components +// used for testing. +const ( + CodeFlareTestSdkVersion = "CODEFLARE_TEST_SDK_VERSION" + CodeFlareTestRayVersion = "CODEFLARE_TEST_RAY_VERSION" + CodeFlareTestRayImage = "CODEFLARE_TEST_RAY_IMAGE" ) func GetCodeFlareSDKVersion() string { - if value, ok := os.LookupEnv("CODEFLARE_SDK_VERSION"); ok { - return value + return lookupEnvOrDefault(CodeFlareTestSdkVersion, CodeFlareSDKVersion) +} + +func GetRayVersion() string { + return lookupEnvOrDefault(CodeFlareTestRayVersion, RayVersion) +} + +func GetRayImage() string { + return lookupEnvOrDefault(CodeFlareTestRayImage, RayImage) +} + +func lookupEnvOrDefault(key, value string) string { + if v, ok := os.LookupEnv(key); ok { + return v } - return controllers.CodeFlareSDKVersion + return value } diff --git a/test/support/defaults.go b/test/support/defaults.go new file mode 100644 index 00000000..a0aaad42 --- /dev/null +++ b/test/support/defaults.go @@ -0,0 +1,11 @@ +package support + +// *********************** +// DO NOT EDIT THIS FILE +// *********************** + +const ( + CodeFlareSDKVersion = "0.4.4" + RayVersion = "2.5.0" + RayImage = "rayproject/ray:2.5.0" +) From f2c66186062521c3a33d9039294afc1be8912f2c Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 16:40:36 +0200 Subject: [PATCH 22/34] test: Parameterize PyTorch image --- test/e2e/mnist_pytorch_mcad_job_test.go | 2 +- test/support/codeflare.go | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index f1664a53..1b1125af 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -77,7 +77,7 @@ func TestMNISTPyTorchMCAD(t *testing.T) { Containers: []corev1.Container{ { Name: "job", - Image: "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime", + Image: GetPyTorchImage(), Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, VolumeMounts: []corev1.VolumeMount{ { diff --git a/test/support/codeflare.go b/test/support/codeflare.go index 0763bbd3..488a8fff 100644 --- a/test/support/codeflare.go +++ b/test/support/codeflare.go @@ -23,9 +23,10 @@ import ( // The environment variables hereafter can be used to change the components // used for testing. const ( - CodeFlareTestSdkVersion = "CODEFLARE_TEST_SDK_VERSION" - CodeFlareTestRayVersion = "CODEFLARE_TEST_RAY_VERSION" - CodeFlareTestRayImage = "CODEFLARE_TEST_RAY_IMAGE" + CodeFlareTestSdkVersion = "CODEFLARE_TEST_SDK_VERSION" + CodeFlareTestRayVersion = "CODEFLARE_TEST_RAY_VERSION" + CodeFlareTestRayImage = "CODEFLARE_TEST_RAY_IMAGE" + CodeFlareTestPyTorchImage = "CODEFLARE_TEST_PYTORCH_IMAGE" ) func GetCodeFlareSDKVersion() string { @@ -40,6 +41,10 @@ func GetRayImage() string { return lookupEnvOrDefault(CodeFlareTestRayImage, RayImage) } +func GetPyTorchImage() string { + return lookupEnvOrDefault(CodeFlareTestPyTorchImage, "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime") +} + func lookupEnvOrDefault(key, value string) string { if v, ok := os.LookupEnv(key); ok { return v From 646876d14c35d13cec8ca6df212826e0600625d9 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 16:42:48 +0200 Subject: [PATCH 23/34] test: Add FIXME for SDK user base image --- test/e2e/mnist_raycluster_sdk_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index 3ba168a0..d919fdcf 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -153,7 +153,9 @@ func TestMNISTRayClusterSDK(t *testing.T) { Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "test", + Name: "test", + // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed + // See https://github.com/project-codeflare/codeflare-sdk/pull/146 Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name}, VolumeMounts: []corev1.VolumeMount{ From 9823d64cb7083276744b97a766c2a1ccc5ee4c03 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Tue, 27 Jun 2023 17:35:38 +0200 Subject: [PATCH 24/34] Align go.mod with MCAD version --- Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c3ce8665..f70edf1e 100644 --- a/Makefile +++ b/Makefile @@ -214,12 +214,16 @@ vet: ## Run go vet against code. ##@ Build +.PHONY: modules +modules: ## Update Go dependencies. + go get $(MCAD_REPO)@$(MCAD_VERSION) + .PHONY: build -build: defaults generate fmt vet ## Build manager binary. +build: modules defaults generate fmt vet ## Build manager binary. go build -o bin/manager main.go .PHONY: run -run: defaults manifests generate fmt vet ## Run a controller from your host. +run: modules defaults manifests generate fmt vet ## Run a controller from your host. go run ./main.go .PHONY: image-build From 02ee9e4076285e6442b31ce17068ea2d6c424d61 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Wed, 28 Jun 2023 15:49:46 +0200 Subject: [PATCH 25/34] test: Print Ray job logs after successful or failed completion --- test/e2e/mnist_rayjob_mcad_raycluster_test.go | 15 ++++++++------- test/support/ray.go | 15 +++++++++++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 33a9e6a5..024dee92 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -250,13 +250,14 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) - test.T().Logf("Waiting for RayJob %s/%s to complete successfully", rayJob.Namespace, rayJob.Name) - test.Eventually(RayJob(test, namespace, rayJob.Name), TestTimeoutLong). - Should(WithTransform(RayJobStatus, Equal(rayv1alpha1.JobStatusSucceeded))) - - rayJob, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Get(test.Ctx(), rayJob.Name, metav1.GetOptions{}) - test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name) + test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong). + Should(WithTransform(RayJobStatus, Satisfy(rayv1alpha1.IsJobTerminal))) test.T().Logf("Printing RayJob %s/%s logs", rayJob.Namespace, rayJob.Name) - test.T().Log(GetRayJobLogs(test, rayJob)) + test.T().Log(GetRayJobLogs(test, rayJob.Namespace, rayJob.Name)) + + // Assert the Ray job has completed successfully + test.Expect(GetRayJob(test, rayJob.Namespace, rayJob.Name)). + To(WithTransform(RayJobStatus, Equal(rayv1alpha1.JobStatusSucceeded))) } diff --git a/test/support/ray.go b/test/support/ray.go index 9e0d22d9..75d77570 100644 --- a/test/support/ray.go +++ b/test/support/ray.go @@ -21,7 +21,6 @@ import ( "github.com/onsi/gomega" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" @@ -29,20 +28,28 @@ import ( const RayJobDefaultClusterSelectorKey = "ray.io/cluster" -func RayJob(t Test, namespace *corev1.Namespace, name string) func(g gomega.Gomega) *rayv1alpha1.RayJob { +func RayJob(t Test, namespace, name string) func(g gomega.Gomega) *rayv1alpha1.RayJob { return func(g gomega.Gomega) *rayv1alpha1.RayJob { - job, err := t.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Get(t.Ctx(), name, metav1.GetOptions{}) + job, err := t.Client().Ray().RayV1alpha1().RayJobs(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) g.Expect(err).NotTo(gomega.HaveOccurred()) return job } } +func GetRayJob(t Test, namespace, name string) *rayv1alpha1.RayJob { + t.T().Helper() + return RayJob(t, namespace, name)(t) +} + func RayJobStatus(job *rayv1alpha1.RayJob) rayv1alpha1.JobStatus { return job.Status.JobStatus } -func GetRayJobLogs(t Test, job *rayv1alpha1.RayJob) string { +func GetRayJobLogs(t Test, namespace, name string) string { t.T().Helper() + + job := GetRayJob(t, namespace, name) + response := t.Client().Core().CoreV1().RESTClient(). Get(). AbsPath("/api/v1/namespaces", job.Namespace, "services", "http:"+job.Status.RayClusterName+"-head-svc:dashboard", "proxy", "api", "jobs", job.Status.JobId, "logs"). From 5ab5db024a1dc974d76be799249ad407555001b0 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 29 Jun 2023 11:18:42 +0200 Subject: [PATCH 26/34] test: Upload job logs --- .github/workflows/e2e_tests.yaml | 12 ++++++ test/e2e/mnist_pytorch_mcad_job_test.go | 4 +- test/e2e/mnist_raycluster_sdk_test.go | 4 +- test/e2e/mnist_rayjob_mcad_raycluster_test.go | 4 +- test/e2e/support.go | 1 + test/support/batch.go | 6 +-- test/support/codeflare.go | 9 +++- test/support/core.go | 4 +- test/support/ray.go | 4 +- test/support/test.go | 41 +++++++++++++++++-- test/support/utils.go | 20 +++++++++ 11 files changed, 90 insertions(+), 19 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 1f8263d5..8d9318c7 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -169,6 +169,9 @@ jobs: export CODEFLARE_TEST_TIMEOUT_MEDIUM=3m export CODEFLARE_TEST_TIMEOUT_LONG=8m + export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} + echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV + make test-e2e - name: Print CodeFlare operator logs @@ -188,3 +191,12 @@ jobs: run: | echo "Printing KubeRay operator logs" kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay + + - name: Upload logs + uses: actions/upload-artifact@v3 + if: always() && steps.deploy.outcome == 'success' + with: + name: logs + retention-days: 10 + path: | + ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index 1b1125af..07bcf95a 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -151,8 +151,8 @@ func TestMNISTPyTorchMCAD(t *testing.T) { WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), )) - // Print the job logs - PrintJobLogs(test, job.Namespace, job.Name) + // Retrieve the job logs + WriteJobLogs(test, job.Namespace, job.Name) // Assert the job has completed successfully test.Expect(GetJob(test, job.Namespace, job.Name)). diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index d919fdcf..4e804fd5 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -195,8 +195,8 @@ func TestMNISTRayClusterSDK(t *testing.T) { WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), )) - // Print the job logs - PrintJobLogs(test, job.Namespace, job.Name) + // Retrieve the job logs + WriteJobLogs(test, job.Namespace, job.Name) // Assert the job has completed successfully test.Expect(GetJob(test, job.Namespace, job.Name)). diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index 024dee92..f20ba0a7 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -254,8 +254,8 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong). Should(WithTransform(RayJobStatus, Satisfy(rayv1alpha1.IsJobTerminal))) - test.T().Logf("Printing RayJob %s/%s logs", rayJob.Namespace, rayJob.Name) - test.T().Log(GetRayJobLogs(test, rayJob.Namespace, rayJob.Name)) + test.T().Logf("Retrieving RayJob %s/%s logs", rayJob.Namespace, rayJob.Name) + WriteToOutputDir(test, rayJob.Name, Log, GetRayJobLogs(test, rayJob.Namespace, rayJob.Name)) // Assert the Ray job has completed successfully test.Expect(GetRayJob(test, rayJob.Namespace, rayJob.Name)). diff --git a/test/e2e/support.go b/test/e2e/support.go index 7847c075..14bf19ce 100644 --- a/test/e2e/support.go +++ b/test/e2e/support.go @@ -28,6 +28,7 @@ import ( var files embed.FS func ReadFile(t support.Test, fileName string) []byte { + t.T().Helper() file, err := files.ReadFile(fileName) t.Expect(err).NotTo(gomega.HaveOccurred()) return file diff --git a/test/support/batch.go b/test/support/batch.go index 6349efa5..db8feeb5 100644 --- a/test/support/batch.go +++ b/test/support/batch.go @@ -38,7 +38,7 @@ func GetJob(t Test, namespace, name string) *batchv1.Job { return Job(t, namespace, name)(t) } -func PrintJobLogs(t Test, namespace, name string) { +func WriteJobLogs(t Test, namespace, name string) { t.T().Helper() job := GetJob(t, namespace, name) @@ -51,8 +51,8 @@ func PrintJobLogs(t Test, namespace, name string) { t.T().Errorf("Job %s/%s has no pods scheduled", job.Namespace, job.Name) } else { for i, pod := range pods { - t.T().Logf("Printing Pod %s/%s logs", pod.Namespace, pod.Name) - t.T().Log(GetPodLogs(t, &pods[i], corev1.PodLogOptions{})) + t.T().Logf("Retrieving Pod %s/%s logs", pod.Namespace, pod.Name) + WriteToOutputDir(t, pod.Name, Log, GetPodLogs(t, &pods[i], corev1.PodLogOptions{})) } } } diff --git a/test/support/codeflare.go b/test/support/codeflare.go index 488a8fff..04b1f3e9 100644 --- a/test/support/codeflare.go +++ b/test/support/codeflare.go @@ -20,13 +20,18 @@ import ( "os" ) -// The environment variables hereafter can be used to change the components -// used for testing. const ( + // The environment variables hereafter can be used to change the components + // used for testing. + CodeFlareTestSdkVersion = "CODEFLARE_TEST_SDK_VERSION" CodeFlareTestRayVersion = "CODEFLARE_TEST_RAY_VERSION" CodeFlareTestRayImage = "CODEFLARE_TEST_RAY_IMAGE" CodeFlareTestPyTorchImage = "CODEFLARE_TEST_PYTORCH_IMAGE" + + // The testing output directory, to write output files into. + + CodeFlareTestOutputDir = "CODEFLARE_TEST_OUTPUT_DIR" ) func GetCodeFlareSDKVersion() string { diff --git a/test/support/core.go b/test/support/core.go index 273c049a..a9be18bf 100644 --- a/test/support/core.go +++ b/test/support/core.go @@ -43,7 +43,7 @@ func GetPods(t Test, namespace string, options metav1.ListOptions) []corev1.Pod return pods.Items } -func GetPodLogs(t Test, pod *corev1.Pod, options corev1.PodLogOptions) string { +func GetPodLogs(t Test, pod *corev1.Pod, options corev1.PodLogOptions) []byte { t.T().Helper() stream, err := t.Client().Core().CoreV1().Pods(pod.GetNamespace()).GetLogs(pod.GetName(), &options).Stream(t.Ctx()) t.Expect(err).NotTo(gomega.HaveOccurred()) @@ -55,5 +55,5 @@ func GetPodLogs(t Test, pod *corev1.Pod, options corev1.PodLogOptions) string { bytes, err := io.ReadAll(stream) t.Expect(err).NotTo(gomega.HaveOccurred()) - return string(bytes) + return bytes } diff --git a/test/support/ray.go b/test/support/ray.go index 75d77570..3833d43b 100644 --- a/test/support/ray.go +++ b/test/support/ray.go @@ -45,7 +45,7 @@ func RayJobStatus(job *rayv1alpha1.RayJob) rayv1alpha1.JobStatus { return job.Status.JobStatus } -func GetRayJobLogs(t Test, namespace, name string) string { +func GetRayJobLogs(t Test, namespace, name string) []byte { t.T().Helper() job := GetRayJob(t, namespace, name) @@ -61,5 +61,5 @@ func GetRayJobLogs(t Test, namespace, name string) string { t.Expect(json.Unmarshal(bytes, &body)).To(gomega.Succeed()) t.Expect(body).To(gomega.HaveKey("logs")) - return body["logs"] + return []byte(body["logs"]) } diff --git a/test/support/test.go b/test/support/test.go index 5b3b271b..75556ca4 100644 --- a/test/support/test.go +++ b/test/support/test.go @@ -18,6 +18,8 @@ package support import ( "context" + "os" + "path" "sync" "testing" @@ -30,6 +32,7 @@ type Test interface { T() *testing.T Ctx() context.Context Client() Client + OutputDir() string gomega.Gomega @@ -70,9 +73,13 @@ type T struct { *gomega.WithT t *testing.T // nolint: containedctx - ctx context.Context - client Client - once sync.Once + ctx context.Context + client Client + outputDir string + once struct { + client sync.Once + outputDir sync.Once + } } func (t *T) T() *testing.T { @@ -84,7 +91,8 @@ func (t *T) Ctx() context.Context { } func (t *T) Client() Client { - t.once.Do(func() { + t.T().Helper() + t.once.client.Do(func() { c, err := newTestClient() if err != nil { t.T().Fatalf("Error creating client: %v", err) @@ -94,6 +102,31 @@ func (t *T) Client() Client { return t.client } +func (t *T) OutputDir() string { + t.T().Helper() + t.once.outputDir.Do(func() { + if parent, ok := os.LookupEnv(CodeFlareTestOutputDir); ok { + if !path.IsAbs(parent) { + if cwd, err := os.Getwd(); err == nil { + // best effort to output the parent absolute path + parent = path.Join(cwd, parent) + } + } + t.T().Logf("Creating output directory in parent directory: %s", parent) + dir, err := os.MkdirTemp(parent, t.T().Name()) + if err != nil { + t.T().Fatalf("Error creating output directory: %v", err) + } + t.outputDir = dir + } else { + t.T().Logf("Creating ephemeral output directory as %s env variable is unset", CodeFlareTestOutputDir) + t.outputDir = t.T().TempDir() + } + t.T().Logf("Output directory has been created at: %s", t.outputDir) + }) + return t.outputDir +} + func (t *T) NewTestNamespace(options ...Option[*corev1.Namespace]) *corev1.Namespace { t.T().Helper() namespace := createTestNamespace(t, options...) diff --git a/test/support/utils.go b/test/support/utils.go index ed40309f..595ac410 100644 --- a/test/support/utils.go +++ b/test/support/utils.go @@ -16,6 +16,26 @@ limitations under the License. package support +import ( + "io/fs" + "os" + "path" + + "github.com/onsi/gomega" +) + func Ptr[T any](v T) *T { return &v } + +type OutputType string + +const ( + Log OutputType = "log" +) + +func WriteToOutputDir(t Test, fileName string, fileType OutputType, data []byte) { + t.T().Helper() + t.Expect(os.WriteFile(path.Join(t.OutputDir(), fileName+"."+string(fileType)), data, fs.ModePerm)). + To(gomega.Succeed()) +} From d4d8f005e6c21ccde917bd877c60b17182e1cdee Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Thu, 29 Jun 2023 13:35:05 +0200 Subject: [PATCH 27/34] test Remove unused functions --- test/support/gomega.go | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 test/support/gomega.go diff --git a/test/support/gomega.go b/test/support/gomega.go deleted file mode 100644 index 74897502..00000000 --- a/test/support/gomega.go +++ /dev/null @@ -1,31 +0,0 @@ -/* -Copyright 2023. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package support - -import ( - "github.com/onsi/gomega" - "github.com/onsi/gomega/gstruct" - "github.com/onsi/gomega/types" -) - -func EqualP(expected interface{}) types.GomegaMatcher { - return gstruct.PointTo(gomega.Equal(expected)) -} - -func MatchFieldsP(options gstruct.Options, fields gstruct.Fields) types.GomegaMatcher { - return gstruct.PointTo(gstruct.MatchFields(options, fields)) -} From 99682d0b900dcd240999cd7de4ca39491bb74441 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 30 Jun 2023 14:56:33 +0200 Subject: [PATCH 28/34] test: Fix Unexpected kind-action input --- .github/workflows/e2e_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 8d9318c7..2783c52b 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -90,7 +90,7 @@ jobs: - name: Setup KinD cluster uses: helm/kind-action@v1.5.0 with: - name: cluster + cluster_name: cluster version: v0.17.0 config: ${{ env.KIND_CONFIG_FILE }} From 494afe6549f0fbfb9917dcff6a3147c93136589b Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 30 Jun 2023 15:14:18 +0200 Subject: [PATCH 29/34] test: Format test output using gotestfmt --- .github/workflows/e2e_tests.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 2783c52b..1ec1cd0a 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -68,6 +68,11 @@ jobs: with: go-version: v1.18 + - name: Set up gotestfmt + uses: gotesttools/gotestfmt-action@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + - name: Container image registry run: | podman run -d -p 5000:5000 --name registry registry:2.8.1 @@ -172,7 +177,8 @@ jobs: export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV - make test-e2e + set -euo pipefail + go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt - name: Print CodeFlare operator logs if: always() && steps.deploy.outcome == 'success' From 7394965ddab65b36e33346f81f806eb48e236134 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 30 Jun 2023 15:52:00 +0200 Subject: [PATCH 30/34] test: Add codeflare stack logs to uploaded artifacts --- .github/workflows/e2e_tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 1ec1cd0a..0c6bb2b9 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -184,19 +184,19 @@ jobs: if: always() && steps.deploy.outcome == 'success' run: | echo "Printing CodeFlare operator logs" - kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator + kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log - name: Print MCAD controller logs if: always() && steps.deploy.outcome == 'success' run: | echo "Printing MCAD controller logs" - kubectl logs -n codeflare-system --tail -1 -l component=multi-cluster-application-dispatcher + kubectl logs -n codeflare-system --tail -1 -l component=multi-cluster-application-dispatcher | tee ${CODEFLARE_TEST_OUTPUT_DIR}/mcad.log - name: Print KubeRay operator logs if: always() && steps.deploy.outcome == 'success' run: | echo "Printing KubeRay operator logs" - kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay + kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log - name: Upload logs uses: actions/upload-artifact@v3 From 2045c7ad31f7feb77dff35b93a666188b8e93e87 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 30 Jun 2023 16:44:06 +0200 Subject: [PATCH 31/34] test: Add description to e2e tests --- test/e2e/mnist_pytorch_mcad_job_test.go | 1 + test/e2e/mnist_raycluster_sdk_test.go | 6 ++++++ test/e2e/mnist_rayjob_mcad_raycluster_test.go | 2 ++ 3 files changed, 9 insertions(+) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index 07bcf95a..41376a0b 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -30,6 +30,7 @@ import ( mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" ) +// Trains the MNIST dataset as a batch Job managed by MCAD, and asserts successful completion of the training job. func TestMNISTPyTorchMCAD(t *testing.T) { test := With(t) test.T().Parallel() diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index 4e804fd5..202f3bcc 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -32,6 +32,12 @@ import ( mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" ) +// Creates a Ray cluster, and trains the MNIST dataset using the CodeFlare SDK. +// Asserts successful completion of the training job. +// +// This covers the installation of the CodeFlare SDK, as well as the RBAC required +// for the SDK to successfully perform requests to the cluster, on behalf of the +// impersonated user. func TestMNISTRayClusterSDK(t *testing.T) { test := With(t) test.T().Parallel() diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index f20ba0a7..b6b01653 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -31,6 +31,8 @@ import ( rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" ) +// Trains the MNIST dataset as a RayJob, executed by a Ray cluster managed by MCAD, +// and asserts successful completion of the training job. func TestMNISTRayJobMCADRayCluster(t *testing.T) { test := With(t) test.T().Parallel() From 9eabd958228543b988085983f4575751c9cd67fb Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Fri, 30 Jun 2023 17:11:26 +0200 Subject: [PATCH 32/34] test: Factorize e2e tests setup --- .github/workflows/e2e_tests.yaml | 55 +----------------------- Makefile | 6 +-- test/e2e/setup.sh | 73 ++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 56 deletions(-) create mode 100755 test/e2e/setup.sh diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 0c6bb2b9..435c72f0 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -108,65 +108,14 @@ jobs: - name: Deploy CodeFlare stack id: deploy run: | - KUBERAY_VERSION=$(make get-kuberay-version) - echo Deploying KubeRay ${KUBERAY_VERSION} - kubectl create -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=90s" - echo Deploying CodeFlare operator IMG="${REGISTRY_ADDRESS}"/codeflare-operator make image-push -e IMG="${IMG}" make deploy -e IMG="${IMG}" kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager - echo Deploying MCAD controller - kubectl create ns codeflare-system - cat < Date: Fri, 30 Jun 2023 17:16:07 +0200 Subject: [PATCH 33/34] test: Update e2e tests local run documentation --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 31f21640..31ce40f0 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ CodeFlare Stack Compatibility Matrix The e2e tests can be executed locally by running the following commands: -1. Setup the test cluster: +1. Use an existing cluster, or set up a test cluster, e.g.: ```bash # Create a KinD cluster @@ -30,13 +30,20 @@ The e2e tests can be executed locally by running the following commands: $ make install ``` -2. Start the operator locally: +2. Set up the CodeFlare stack: + ```bash + $ make setup-e2e + ``` + +3. Start the operator locally: ```bash $ make run ``` -3. In a separate terminal, run the e2e suite: + Alternatively, You can run the operator from your IDE / debugger. + +4. In a separate terminal, run the e2e suite: ```bash $ make test-e2e From 4463ec50a8936820e0044555545ff0d7e97410d2 Mon Sep 17 00:00:00 2001 From: Antonin Stefanutti Date: Wed, 5 Jul 2023 14:30:52 +0200 Subject: [PATCH 34/34] test: Write logs also for jobs that have timed out --- test/e2e/mnist_pytorch_mcad_job_test.go | 6 +++--- test/e2e/mnist_raycluster_sdk_test.go | 6 +++--- test/e2e/mnist_rayjob_mcad_raycluster_test.go | 6 +++--- test/support/ray.go | 5 +++++ 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index 41376a0b..168c7b13 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -145,6 +145,9 @@ func TestMNISTPyTorchMCAD(t *testing.T) { test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + // Retrieving the job logs once it has completed or timed out + defer WriteJobLogs(test, job.Namespace, job.Name) + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( Or( @@ -152,9 +155,6 @@ func TestMNISTPyTorchMCAD(t *testing.T) { WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), )) - // Retrieve the job logs - WriteJobLogs(test, job.Namespace, job.Name) - // Assert the job has completed successfully test.Expect(GetJob(test, job.Namespace, job.Name)). To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go index 202f3bcc..e8bb1f5c 100644 --- a/test/e2e/mnist_raycluster_sdk_test.go +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -194,6 +194,9 @@ func TestMNISTRayClusterSDK(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + // Retrieving the job logs once it has completed or timed out + defer WriteJobLogs(test, job.Namespace, job.Name) + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( Or( @@ -201,9 +204,6 @@ func TestMNISTRayClusterSDK(t *testing.T) { WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), )) - // Retrieve the job logs - WriteJobLogs(test, job.Namespace, job.Name) - // Assert the job has completed successfully test.Expect(GetJob(test, job.Namespace, job.Name)). To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index b6b01653..958b3047 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -252,13 +252,13 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { test.Expect(err).NotTo(HaveOccurred()) test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + // Retrieving the job logs once it has completed or timed out + defer WriteRayJobLogs(test, rayJob.Namespace, rayJob.Name) + test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name) test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong). Should(WithTransform(RayJobStatus, Satisfy(rayv1alpha1.IsJobTerminal))) - test.T().Logf("Retrieving RayJob %s/%s logs", rayJob.Namespace, rayJob.Name) - WriteToOutputDir(test, rayJob.Name, Log, GetRayJobLogs(test, rayJob.Namespace, rayJob.Name)) - // Assert the Ray job has completed successfully test.Expect(GetRayJob(test, rayJob.Namespace, rayJob.Name)). To(WithTransform(RayJobStatus, Equal(rayv1alpha1.JobStatusSucceeded))) diff --git a/test/support/ray.go b/test/support/ray.go index 3833d43b..cd5a9d87 100644 --- a/test/support/ray.go +++ b/test/support/ray.go @@ -63,3 +63,8 @@ func GetRayJobLogs(t Test, namespace, name string) []byte { return []byte(body["logs"]) } + +func WriteRayJobLogs(t Test, namespace, name string) { + t.T().Logf("Retrieving RayJob %s/%s logs", namespace, name) + WriteToOutputDir(t, name, Log, GetRayJobLogs(t, namespace, name)) +}