diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml new file mode 100644 index 00000000..435c72f0 --- /dev/null +++ b/.github/workflows/e2e_tests.yaml @@ -0,0 +1,157 @@ +name: e2e + +on: + pull_request: + branches: + - main + - 'release-*' + paths-ignore: + - 'docs/**' + - '**.adoc' + - '**.md' + - 'LICENSE' + push: + branches: + - main + - 'release-*' + paths-ignore: + - 'docs/**' + - '**.adoc' + - '**.md' + - 'LICENSE' + +concurrency: + group: ${{ github.head_ref }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + kubernetes: + + runs-on: ubuntu-20.04 + + steps: + - name: Cleanup + run: | + ls -lart + echo "Initial status:" + df -h + + echo "Cleaning up resources:" + sudo swapoff -a + sudo rm -f /swapfile + sudo apt clean + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + docker rmi $(docker image ls -aq) + + echo "Final status:" + df -h + + - name: Checkout code + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Init directories + run: | + TEMP_DIR="$(pwd)/tmp" + mkdir -p "${TEMP_DIR}" + echo "TEMP_DIR=${TEMP_DIR}" >> $GITHUB_ENV + + mkdir -p "$(pwd)/bin" + echo "$(pwd)/bin" >> $GITHUB_PATH + + - name: Set Go + uses: actions/setup-go@v3 + with: + go-version: v1.18 + + - name: Set up gotestfmt + uses: gotesttools/gotestfmt-action@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Container image registry + run: | + podman run -d -p 5000:5000 --name registry registry:2.8.1 + + export REGISTRY_ADDRESS=$(hostname -i):5000 + echo "REGISTRY_ADDRESS=${REGISTRY_ADDRESS}" >> $GITHUB_ENV + echo "Container image registry started at ${REGISTRY_ADDRESS}" + + KIND_CONFIG_FILE=${{ env.TEMP_DIR }}/kind.yaml + echo "KIND_CONFIG_FILE=${KIND_CONFIG_FILE}" >> $GITHUB_ENV + envsubst < ./test/e2e/kind.yaml > ${KIND_CONFIG_FILE} + + sudo --preserve-env=REGISTRY_ADDRESS sh -c 'cat > /etc/containers/registries.conf.d/local.conf <> $GITHUB_ENV + + set -euo pipefail + go test -timeout 30m -v ./test/e2e -json 2>&1 | tee ${CODEFLARE_TEST_OUTPUT_DIR}/gotest.log | gotestfmt + + - name: Print CodeFlare operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing CodeFlare operator logs" + kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log + + - name: Print MCAD controller logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing MCAD controller logs" + kubectl logs -n codeflare-system --tail -1 -l component=multi-cluster-application-dispatcher | tee ${CODEFLARE_TEST_OUTPUT_DIR}/mcad.log + + - name: Print KubeRay operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing KubeRay operator logs" + kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log + + - name: Upload logs + uses: actions/upload-artifact@v3 + if: always() && steps.deploy.outcome == 'success' + with: + name: logs + retention-days: 10 + path: | + ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ba68a441..985aeaa7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,5 +25,4 @@ repos: hooks: - id: go-fmt - id: golangci-lint - - id: go-build - id: go-mod-tidy diff --git a/Makefile b/Makefile index a6a5c1f1..b4844103 100644 --- a/Makefile +++ b/Makefile @@ -21,6 +21,15 @@ MCAD_REF ?= release-${MCAD_VERSION} MCAD_REPO ?= github.com/project-codeflare/multi-cluster-app-dispatcher MCAD_CRD ?= ${MCAD_REPO}/config/crd?ref=${MCAD_REF} +# KUBERAY_VERSION defines the default version of the KubeRay operator (used for testing) +KUBERAY_VERSION ?= v0.5.0 + +# RAY_VERSION defines the default version of Ray (used for testing) +RAY_VERSION ?= 2.5.0 + +# CODEFLARE_SDK_VERSION defines the default version of the CodeFlare SDK +CODEFLARE_SDK_VERSION ?= 0.4.4 + # OPERATORS_REPO_ORG points to GitHub repository organization where bundle PR is opened against # OPERATORS_REPO_FORK_ORG points to GitHub repository fork organization where bundle build is pushed to OPERATORS_REPO_ORG ?= redhat-openshift-ecosystem @@ -61,6 +70,9 @@ MCAD_IMAGE ?= $(IMAGE_ORG_BASE)/mcad-controller:$(MCAD_REF) # INSTASCALE_IMAGE defines the default container image for the InstaScale controller INSTASCALE_IMAGE ?= $(IMAGE_ORG_BASE)/instascale-controller:$(INSTASCALE_VERSION) +# RAY_IMAGE defines the default container image for Ray (used for testing) +RAY_IMAGE ?= rayproject/ray:$(RAY_VERSION) + # BUNDLE_IMG defines the image:tag used for the bundle. # You can use it as an arg. (E.g make bundle-build BUNDLE_IMG=/:) BUNDLE_IMG ?= $(IMAGE_TAG_BASE)-bundle:$(VERSION) @@ -116,6 +128,7 @@ help: ## Display this help. ##@ Development DEFAULTS_FILE := controllers/defaults.go +DEFAULTS_TEST_FILE := test/support/defaults.go .PHONY: defaults defaults: @@ -133,7 +146,22 @@ defaults: @echo ")" >> $(DEFAULTS_FILE) @echo "" >> $(DEFAULTS_FILE) - gofmt -w $(DEFAULTS_FILE) + $(info Regenerating $(DEFAULTS_TEST_FILE)) + @echo "package support" > $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + @echo "// ***********************" >> $(DEFAULTS_TEST_FILE) + @echo "// DO NOT EDIT THIS FILE" >> $(DEFAULTS_TEST_FILE) + @echo "// ***********************" >> $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + @echo "const (" >> $(DEFAULTS_TEST_FILE) + @echo " CodeFlareSDKVersion = \"$(CODEFLARE_SDK_VERSION)\"" >> $(DEFAULTS_TEST_FILE) + @echo " RayVersion = \"$(RAY_VERSION)\"" >> $(DEFAULTS_TEST_FILE) + @echo " RayImage = \"$(RAY_IMAGE)\"" >> $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + @echo ")" >> $(DEFAULTS_TEST_FILE) + @echo "" >> $(DEFAULTS_TEST_FILE) + + gofmt -w $(DEFAULTS_FILE) $(DEFAULTS_TEST_FILE) .PHONY: manifests manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. @@ -186,12 +214,16 @@ vet: ## Run go vet against code. ##@ Build +.PHONY: modules +modules: ## Update Go dependencies. + go get $(MCAD_REPO)@$(MCAD_VERSION) + .PHONY: build -build: defaults generate fmt vet ## Build manager binary. +build: modules defaults generate fmt vet ## Build manager binary. go build -o bin/manager main.go .PHONY: run -run: defaults manifests generate fmt vet ## Run a controller from your host. +run: modules defaults manifests generate fmt vet ## Run a controller from your host. go run ./main.go .PHONY: image-build @@ -199,7 +231,7 @@ image-build: test-unit ## Build container image with the manager. podman build -t ${IMG} . .PHONY: image-push -image-push: ## Push container image with the manager. +image-push: image-build ## Push container image with the manager. podman push ${IMG} ##@ Deployment @@ -383,5 +415,13 @@ catalog-push: ## Push a catalog image. $(MAKE) image-push IMG=$(CATALOG_IMG) .PHONY: test-unit -test-unit: defaults manifests generate fmt vet envtest ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./... -coverprofile cover.out +test-unit: defaults manifests generate fmt vet envtest ## Run unit tests. + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(go list ./... | grep -v /test/) -coverprofile cover.out + +.PHONY: test-e2e +test-e2e: defaults manifests generate fmt vet ## Run e2e tests. + go test -timeout 30m -v ./test/e2e + +.PHONY: setup-e2e +setup-e2e: ## Set up e2e tests. + KUBERAY_VERSION=$(KUBERAY_VERSION) test/e2e/setup.sh diff --git a/README.md b/README.md index afc9dafa..31ce40f0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # codeflare-operator + Operator for installation and lifecycle management of CodeFlare distributed workload stack, starting with MCAD and InstaScale @@ -14,7 +15,43 @@ CodeFlare Stack Compatibility Matrix | KubeRay | v0.5.0 | -## Release process +## Development + +### Testing + +The e2e tests can be executed locally by running the following commands: + +1. Use an existing cluster, or set up a test cluster, e.g.: + + ```bash + # Create a KinD cluster + $ kind create cluster --image kindest/node:v1.25.8 + # Install the CRDs + $ make install + ``` + +2. Set up the CodeFlare stack: + ```bash + $ make setup-e2e + ``` + +3. Start the operator locally: + + ```bash + $ make run + ``` + + Alternatively, You can run the operator from your IDE / debugger. + +4. In a separate terminal, run the e2e suite: + + ```bash + $ make test-e2e + ``` + + Alternatively, You can run the e2e test(s) from your IDE / debugger. + +## Release Prerequisite: - Build and release [MCAD](https://github.com/project-codeflare/multi-cluster-app-dispatcher) diff --git a/go.mod b/go.mod index b898acde..621853fb 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,8 @@ require ( github.com/manifestival/manifestival v0.7.2 github.com/onsi/ginkgo/v2 v2.9.2 github.com/onsi/gomega v1.27.6 + github.com/project-codeflare/multi-cluster-app-dispatcher v1.31.0 + github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9 go.uber.org/zap v1.24.0 k8s.io/api v0.26.3 k8s.io/apimachinery v0.26.3 diff --git a/go.sum b/go.sum index 672bbaba..8e877d1f 100644 --- a/go.sum +++ b/go.sum @@ -428,6 +428,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pquerna/cachecontrol v0.0.0-20171018203845-0dec1b30a021/go.mod h1:prYjPmNq4d1NPVmpShWobRqXY3q7Vp+80DqgxxUrUIA= +github.com/project-codeflare/multi-cluster-app-dispatcher v1.31.0 h1:vq4fAuvlv4Zvnx0dA53WaYkTWG1BFxAkamxuzHfZO2M= +github.com/project-codeflare/multi-cluster-app-dispatcher v1.31.0/go.mod h1:fmbU5LuV1Z2Sbu1FCEoVuw8qxDFcalXvkPyMfGZHHTc= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= @@ -459,6 +461,8 @@ github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1 github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= +github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9 h1:qIThU9GGqEay/y78y4Y9e1FVfrdkH5MFnT0zEJ9yh0A= +github.com/ray-project/kuberay/ray-operator v0.0.0-20230614221720-085c29d40fa9/go.mod h1:2auArgwD9dXXJz1oc7SqQ4U/rHdpwnrBwG98kr8OWXA= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/test/e2e/kind.yaml b/test/e2e/kind.yaml new file mode 100644 index 00000000..4546589b --- /dev/null +++ b/test/e2e/kind.yaml @@ -0,0 +1,31 @@ +# --------------------------------------------------------------------------- +# Copyright 2023. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# --------------------------------------------------------------------------- + +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + image: kindest/node:v1.25.3@sha256:f52781bc0d7a19fb6c405c2af83abfeb311f130707a0e219175677e366cc45d1 + kubeadmConfigPatches: + - | + kind: InitConfiguration + nodeRegistration: + kubeletExtraArgs: + node-labels: "ingress-ready=true" +containerdConfigPatches: + - |- + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."${REGISTRY_ADDRESS}"] + endpoint = ["http://${REGISTRY_ADDRESS}"] diff --git a/test/e2e/mnist.py b/test/e2e/mnist.py new file mode 100644 index 00000000..244c84d2 --- /dev/null +++ b/test/e2e/mnist.py @@ -0,0 +1,160 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks.progress import TQDMProgressBar +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchmetrics import Accuracy +from torchvision import transforms +from torchvision.datasets import MNIST + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 +# %% + +print("prior to running the trainer") +print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) +print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) + + +class LitMNIST(LightningModule): + def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): + + super().__init__() + + # Set our init args as class attributes + self.data_dir = data_dir + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + # Hardcode some dataset specific attributes + self.num_classes = 10 + self.dims = (1, 28, 28) + channels, width, height = self.dims + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + # Define PyTorch model + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, self.num_classes), + ) + + self.val_accuracy = Accuracy() + self.test_accuracy = Accuracy() + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.val_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", self.val_accuracy, prog_bar=True) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.test_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("test_loss", loss, prog_bar=True) + self.log("test_acc", self.test_accuracy, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + return optimizer + + #################### + # DATA RELATED HOOKS + #################### + + def prepare_data(self): + # download + print("Downloading MNIST dataset...") + MNIST(self.data_dir, train=True, download=True) + MNIST(self.data_dir, train=False, download=True) + + def setup(self, stage=None): + + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST( + self.data_dir, train=False, transform=self.transform + ) + + def train_dataloader(self): + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) + + +# Init DataLoader from MNIST Dataset + +model = LitMNIST() + +print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) +print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) + +# Initialize a trainer +trainer = Trainer( + accelerator="auto", + # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + max_epochs=5, + callbacks=[TQDMProgressBar(refresh_rate=20)], + num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), + devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), + strategy="ddp", +) + +# Train the model ⚡ +trainer.fit(model) diff --git a/test/e2e/mnist_pip_requirements.txt b/test/e2e/mnist_pip_requirements.txt new file mode 100644 index 00000000..87edeef2 --- /dev/null +++ b/test/e2e/mnist_pip_requirements.txt @@ -0,0 +1,3 @@ +pytorch_lightning==1.5.10 +torchmetrics==0.9.1 +torchvision==0.12.0 diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go new file mode 100644 index 00000000..168c7b13 --- /dev/null +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -0,0 +1,161 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "testing" + + . "github.com/onsi/gomega" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" +) + +// Trains the MNIST dataset as a batch Job managed by MCAD, and asserts successful completion of the training job. +func TestMNISTPyTorchMCAD(t *testing.T) { + test := With(t) + test.T().Parallel() + + // Create a namespace + namespace := test.NewTestNamespace() + + // Test configuration + config := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist-mcad", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + }, + Immutable: Ptr(true), + } + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) + + // Batch Job + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "job", + Image: GetPyTorchImage(), + Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + } + + // Create an AppWrapper resource + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + DesiredAvailable: 1, + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + GenericTemplate: Raw(test, job), + }, + }, + }, + }, + } + + _, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created MCAD %s/%s successfully", aw.Namespace, aw.Name) + + test.T().Logf("Waiting for MCAD %s/%s to be running", aw.Namespace, aw.Name) + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + // Retrieving the job logs once it has completed or timed out + defer WriteJobLogs(test, job.Namespace, job.Name) + + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) +} diff --git a/test/e2e/mnist_raycluster_sdk.py b/test/e2e/mnist_raycluster_sdk.py new file mode 100644 index 00000000..444b1858 --- /dev/null +++ b/test/e2e/mnist_raycluster_sdk.py @@ -0,0 +1,65 @@ +import sys + +from time import sleep + +from torchx.specs.api import AppState, is_terminal + +from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration +from codeflare_sdk.job.jobs import DDPJobDefinition + +namespace = sys.argv[1] + +cluster = Cluster(ClusterConfiguration( + name='mnist', + namespace=namespace, + min_worker=1, + max_worker=1, + min_cpus='500m', + max_cpus=1, + min_memory=0.5, + max_memory=1, + gpu=0, + instascale=False, +)) + +cluster.up() + +cluster.status() + +cluster.wait_ready() + +cluster.status() + +cluster.details() + +jobdef = DDPJobDefinition( + name="mnist", + script="mnist.py", + scheduler_args={"requirements": "requirements.txt"}, +) +job = jobdef.submit(cluster) + +done = False +time = 0 +timeout = 300 +while not done: + status = job.status() + if is_terminal(status.state): + break + if not done: + print(status) + if timeout and time >= timeout: + raise TimeoutError(f"job has timed out after waiting {timeout}s") + sleep(5) + time += 5 + +print(f"Job has completed: {status.state}") + +print(job.logs()) + +cluster.down() + +if not status.state == AppState.SUCCEEDED: + exit(1) +else: + exit(0) diff --git a/test/e2e/mnist_raycluster_sdk_test.go b/test/e2e/mnist_raycluster_sdk_test.go new file mode 100644 index 00000000..e8bb1f5c --- /dev/null +++ b/test/e2e/mnist_raycluster_sdk_test.go @@ -0,0 +1,210 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "testing" + + . "github.com/onsi/gomega" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" + + . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" +) + +// Creates a Ray cluster, and trains the MNIST dataset using the CodeFlare SDK. +// Asserts successful completion of the training job. +// +// This covers the installation of the CodeFlare SDK, as well as the RBAC required +// for the SDK to successfully perform requests to the cluster, on behalf of the +// impersonated user. +func TestMNISTRayClusterSDK(t *testing.T) { + test := With(t) + test.T().Parallel() + + if !IsOpenShift(test) { + test.T().Skip("Requires https://github.com/project-codeflare/codeflare-sdk/pull/146") + } + + // Create a namespace + namespace := test.NewTestNamespace() + + // Test configuration + config := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist-raycluster-sdk", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + // SDK script + "mnist_raycluster_sdk.py": ReadFile(test, "mnist_raycluster_sdk.py"), + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + }, + Immutable: Ptr(true), + } + config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name) + + // SDK client RBAC + serviceAccount := &corev1.ServiceAccount{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ServiceAccount", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk-user", + Namespace: namespace.Name, + }, + } + serviceAccount, err = test.Client().Core().CoreV1().ServiceAccounts(namespace.Name).Create(test.Ctx(), serviceAccount, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + role := &rbacv1.Role{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rbacv1.SchemeGroupVersion.String(), + Kind: "Role", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + Rules: []rbacv1.PolicyRule{ + { + Verbs: []string{"get", "create", "delete", "list", "patch", "update"}, + APIGroups: []string{mcadv1beta1.GroupName}, + Resources: []string{"appwrappers"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{rayv1alpha1.GroupVersion.Group}, + Resources: []string{"rayclusters", "rayclusters/status"}, + }, + { + Verbs: []string{"get", "list"}, + APIGroups: []string{"route.openshift.io"}, + Resources: []string{"routes"}, + }, + }, + } + role, err = test.Client().Core().RbacV1().Roles(namespace.Name).Create(test.Ctx(), role, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + roleBinding := &rbacv1.RoleBinding{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rbacv1.SchemeGroupVersion.String(), + Kind: "RoleBinding", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.SchemeGroupVersion.Group, + Kind: "Role", + Name: role.Name, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + APIGroup: corev1.SchemeGroupVersion.Group, + Name: serviceAccount.Name, + Namespace: serviceAccount.Namespace, + }, + }, + } + _, err = test.Client().Core().RbacV1().RoleBindings(namespace.Name).Create(test.Ctx(), roleBinding, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "sdk", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + BackoffLimit: Ptr(int32(0)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test", + // FIXME: switch to base Python image once the dependency on OpenShift CLI is removed + // See https://github.com/project-codeflare/codeflare-sdk/pull/146 + Image: "quay.io/opendatahub/notebooks:jupyter-minimal-ubi8-python-3.8-4c8f26e", + Command: []string{"/bin/sh", "-c", "pip install codeflare-sdk==" + GetCodeFlareSDKVersion() + " && cp /test/* . && python mnist_raycluster_sdk.py" + " " + namespace.Name}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + ServiceAccountName: serviceAccount.Name, + }, + }, + }, + } + job, err = test.Client().Core().BatchV1().Jobs(namespace.Name).Create(test.Ctx(), job, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created Job %s/%s successfully", job.Namespace, job.Name) + + // Retrieving the job logs once it has completed or timed out + defer WriteJobLogs(test, job.Namespace, job.Name) + + test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name) + test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should( + Or( + WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)), + WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)), + )) + + // Assert the job has completed successfully + test.Expect(GetJob(test, job.Namespace, job.Name)). + To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue))) +} diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go new file mode 100644 index 00000000..958b3047 --- /dev/null +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -0,0 +1,265 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "encoding/base64" + "testing" + + . "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/support" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" +) + +// Trains the MNIST dataset as a RayJob, executed by a Ray cluster managed by MCAD, +// and asserts successful completion of the training job. +func TestMNISTRayJobMCADRayCluster(t *testing.T) { + test := With(t) + test.T().Parallel() + + // Create a namespace + namespace := test.NewTestNamespace() + + // MNIST training script + mnist := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + BinaryData: map[string][]byte{ + "mnist.py": ReadFile(test, "mnist.py"), + }, + Immutable: Ptr(true), + } + mnist, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), mnist, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created ConfigMap %s/%s successfully", mnist.Namespace, mnist.Name) + + // RayCluster + rayCluster := &rayv1alpha1.RayCluster{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rayv1alpha1.GroupVersion.String(), + Kind: "RayCluster", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "raycluster", + Namespace: namespace.Name, + }, + Spec: rayv1alpha1.RayClusterSpec{ + RayVersion: GetRayVersion(), + HeadGroupSpec: rayv1alpha1.HeadGroupSpec{ + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + }, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "ray-head", + Image: GetRayImage(), + Ports: []corev1.ContainerPort{ + { + ContainerPort: 6379, + Name: "gcs", + }, + { + ContainerPort: 8265, + Name: "dashboard", + }, + { + ContainerPort: 10001, + Name: "client", + }, + }, + Lifecycle: &corev1.Lifecycle{ + PreStop: &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{ + Command: []string{"/bin/sh", "-c", "ray stop"}, + }, + }, + }, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "mnist", + MountPath: "/home/ray/jobs", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "mnist", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: mnist.Name, + }, + }, + }, + }, + }, + }, + }, + }, + WorkerGroupSpecs: []rayv1alpha1.WorkerGroupSpec{ + { + Replicas: Ptr(int32(1)), + MinReplicas: Ptr(int32(1)), + MaxReplicas: Ptr(int32(2)), + GroupName: "small-group", + RayStartParams: map[string]string{}, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + InitContainers: []corev1.Container{ + { + Name: "init-myservice", + Image: "busybox:1.28", + Command: []string{"sh", "-c", "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"}, + }, + }, + Containers: []corev1.Container{ + { + Name: "ray-worker", + Image: GetRayImage(), + Lifecycle: &corev1.Lifecycle{ + PreStop: &corev1.LifecycleHandler{ + Exec: &corev1.ExecAction{ + Command: []string{"/bin/sh", "-c", "ray stop"}, + }, + }, + }, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("256Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Create an AppWrapper resource + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ray-cluster", + Namespace: namespace.Name, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + DesiredAvailable: 1, + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 2, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + GenericTemplate: Raw(test, rayCluster), + }, + }, + }, + }, + } + + aw, err = test.Client().MCAD().ArbV1().AppWrappers(namespace.Name).Create(aw) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created MCAD %s/%s successfully", aw.Namespace, aw.Name) + + test.T().Logf("Waiting for MCAD %s/%s to be running", aw.Namespace, aw.Name) + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutMedium). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + rayJob := &rayv1alpha1.RayJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: rayv1alpha1.GroupVersion.String(), + Kind: "RayJob", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: rayv1alpha1.RayJobSpec{ + Entrypoint: "python /home/ray/jobs/mnist.py", + RuntimeEnv: base64.StdEncoding.EncodeToString([]byte(` +{ + "pip": [ + "pytorch_lightning==1.5.10", + "torchmetrics==0.9.1", + "torchvision==0.12.0" + ], + "env_vars": { + } +} +`)), + ClusterSelector: map[string]string{ + RayJobDefaultClusterSelectorKey: rayCluster.Name, + }, + ShutdownAfterJobFinishes: false, + }, + } + rayJob, err = test.Client().Ray().RayV1alpha1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("Created RayJob %s/%s successfully", rayJob.Namespace, rayJob.Name) + + // Retrieving the job logs once it has completed or timed out + defer WriteRayJobLogs(test, rayJob.Namespace, rayJob.Name) + + test.T().Logf("Waiting for RayJob %s/%s to complete", rayJob.Namespace, rayJob.Name) + test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutLong). + Should(WithTransform(RayJobStatus, Satisfy(rayv1alpha1.IsJobTerminal))) + + // Assert the Ray job has completed successfully + test.Expect(GetRayJob(test, rayJob.Namespace, rayJob.Name)). + To(WithTransform(RayJobStatus, Equal(rayv1alpha1.JobStatusSucceeded))) +} diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh new file mode 100755 index 00000000..dbc973dd --- /dev/null +++ b/test/e2e/setup.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail +: "${KUBERAY_VERSION}" + +echo Deploying KubeRay "${KUBERAY_VERSION}" +kubectl apply --server-side -k "github.com/ray-project/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}&timeout=90s" + +kubectl create ns codeflare-system --dry-run=client -o yaml | kubectl apply -f - + +echo Deploying MCAD controller +cat <