From 1de910a3e13dc7c56475475eccd2e271fe53d291 Mon Sep 17 00:00:00 2001 From: Laurentiu Bradin <109964136+z103cb@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:32:37 +0300 Subject: [PATCH] [Feature] Add end to end tests to apiserver Fixes #1388 --- .github/workflows/test-job.yaml | 2 +- apiserver/DEVELOPMENT.md | 44 +- apiserver/Makefile | 25 +- apiserver/Volumes.md | 112 ++-- apiserver/go.mod | 4 +- apiserver/go.sum | 5 + apiserver/hack/kind-cluster-config.yaml | 15 +- apiserver/pkg/server/cluster_server.go | 19 +- apiserver/pkg/server/job_server.go | 19 +- apiserver/pkg/server/serve_server.go | 60 +- apiserver/pkg/server/validations.go | 65 +++ apiserver/pkg/server/validations_test.go | 513 ++++++++++++++++++ apiserver/test/e2e/cluster_server_e2e_test.go | 256 +++++++++ apiserver/test/e2e/config_server_e2e_test.go | 136 +++++ apiserver/test/e2e/doc.go | 15 + apiserver/test/e2e/job_server_e2e_test.go | 269 +++++++++ apiserver/test/e2e/service_server_e2e_test.go | 478 ++++++++++++++++ apiserver/test/e2e/types.go | 364 +++++++++++++ apiserver/test/e2e/utils.go | 44 ++ 19 files changed, 2282 insertions(+), 163 deletions(-) create mode 100644 apiserver/pkg/server/validations.go create mode 100644 apiserver/pkg/server/validations_test.go create mode 100644 apiserver/test/e2e/cluster_server_e2e_test.go create mode 100644 apiserver/test/e2e/config_server_e2e_test.go create mode 100644 apiserver/test/e2e/doc.go create mode 100644 apiserver/test/e2e/job_server_e2e_test.go create mode 100644 apiserver/test/e2e/service_server_e2e_test.go create mode 100644 apiserver/test/e2e/types.go create mode 100644 apiserver/test/e2e/utils.go diff --git a/.github/workflows/test-job.yaml b/.github/workflows/test-job.yaml index be8f69c9232..2c21109bede 100644 --- a/.github/workflows/test-job.yaml +++ b/.github/workflows/test-job.yaml @@ -144,7 +144,7 @@ jobs: working-directory: ${{env.working-directory}} - name: Test - run: go test ./... + run: go test ./pkg/... ./cmd/... -race -parallel 4 working-directory: ${{env.working-directory}} - name: Set up Docker diff --git a/apiserver/DEVELOPMENT.md b/apiserver/DEVELOPMENT.md index 0978e123cee..6ded870fcd6 100644 --- a/apiserver/DEVELOPMENT.md +++ b/apiserver/DEVELOPMENT.md @@ -61,6 +61,35 @@ make build make test ``` +#### End to End Testing + +There are two `make` targets provide execute the end to end test (integration between Kuberay API server and Kuberay Operator): + +* `make e2e-test` executes all the tests defined in the [test/e2e package](./test/e2e/). It uses the cluster defined in `~/.kube/config` to submit the workloads. +* `make local-e2e-test` creates a local kind cluster, deploys the nightly operator image and a freshly build Kuberay API server into the kind cluster and shuts down the kind cluster upon successful execution of the end to end test. + +The `e2e` test targets use two variables to control what version of Ray images to use in the end to end tests: + +* `E2E_API_SERVER_RAY_IMAGE` -- for the ray docker image. Currently set to `rayproject/ray:2.7.0-py310`. On Apple silicon or arm64 development machines the `-aarch64` suffix is added. +* `E2E_API_SERVER_URL` -- for the base URL of the deployed KubeRayAPI server. The default value is: `http://localhost:31888` + +The end to end test targets share the usage of the `GO_TEST_FLAGS`. Overriding the make file variable with a `-v` option allows for both unit and end to end tests to print any output / debug messages. By default, only if there's a test failure those messages are show. 
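+
+For example, to get verbose output from the end to end tests, the `GO_TEST_FLAGS` variable can be overridden on the command line (the same override works for `local-e2e-test`):
+
+```bash
+make e2e-test -e GO_TEST_FLAGS="-v"
+```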
+ +The default values of the variables can be overridden using the `-e` make command line arguments. + +Examples: + +```bash +# To run end to end test using default cluster +make e2e-test + +# To run end to end test in fresh cluster. +# Please note that: +# * the cluster created for this test is the same as the cluster created by make cluster. +# * if the end to end tests fail the cluster will still be up and will have to be explicitly shutdown by executing make clean-cluster +make local-e2e-test +``` + #### Swagger UI updates To update the swagger ui files deployed with the Kuberay API server, you'll need to: @@ -117,7 +146,7 @@ make run #### Access -Access the service at `localhost:8888` for http, and `locahost:8887` for the RPC port. +Access the service at `localhost:8888` for http, and `localhost:8887` for the RPC port. ### Kubernetes Deployment @@ -160,9 +189,9 @@ As a convenience for local development the following `make` targets are provided * `make cluster` -- creates a local kind cluster, using the configuration from `hack/kind-cluster-config.yaml`. It creates a port mapping allowing for the service running in the kind cluster to be accessed on `localhost:31888` for HTTP and `localhost:31887` for RPC. * `make clean-cluster` -- deletes the local kind cluster created with `make cluster` * `load-image` -- loads the docker image defined by the `IMG` make variable into the kind cluster. The default value for variable is: `kuberay/apiserver:latest`. The name of the image can be changed by using `make load-image -e IMG=` -* `operator-image` -- Build the operator image to be loaded in your kind cluster. The tag for the operator image is `kuberay/operator:latest`. This step is optional. -* `load-operator-image` -- Load the operator image to the kind cluster created with `create-kind-cluster`. The tag for the operator image is `kuberay/operator:latest`, and the tag can be overridden using `make load-operator-image -E OPERATOR_IMAGE_TAG=`. To use the nightly operator tag, set `OPERATOR_IMAGE_TAG` to `nightly`. -* `deploy-operator` -- Deploy operator into your cluster. The tag for the operator image is `kuberay/operator:latest`. +* `operator-image` -- Build the operator image to be loaded in your kind cluster. You must specify a value for the operator image tag. Since the default value is set to `nightly`, the local image with this value will be overridden if `make deploy` operator is used later. This step is optional. Example: `make operator-image -e OPERATOR_IMAGE_TAG=latest` +* `load-operator-image` -- Load the operator image to the kind cluster created with `create-kind-cluster`. The tag for the operator image is `kuberay/operator:nightly`, and the tag can be overridden using `make load-operator-image -E OPERATOR_IMAGE_TAG=`. +* `deploy-operator` -- Deploy operator into your cluster. The tag for the operator image is `kuberay/operator:nightly`. 
* `undeploy-operator` -- Undeploy operator from your cluster When developing and testing with kind you might want to execute these targets together: @@ -172,9 +201,12 @@ When developing and testing with kind you might want to execute these targets to make docker-image cluster load-image deploy #To create a new API server image, operator image and deploy them on a new cluster -make docker-image operator-image cluster load-image load-operator-image deploy deploy-operator +make docker-image operator-image cluster load-image load-operator-image deploy deploy-operator -e OPERATOR_IMAGE_TAG=latest + +#To execute end 2 end tests with a local build operator and verbose output +make operator-image local-e2e-test -e OPERATOR_IMAGE_TAG=latest -e GO_TEST_FLAGS="-v" ``` #### Access API Server in the Cluster -Access the service at `localhost:31888` for http and `locahost:31887` for the RPC port. +Access the service at `localhost:31888` for http and `localhost:31887` for the RPC port. diff --git a/apiserver/Makefile b/apiserver/Makefile index bea4f256d67..0adf02996a6 100644 --- a/apiserver/Makefile +++ b/apiserver/Makefile @@ -7,6 +7,13 @@ REPO_ROOT_BIN := $(REPO_ROOT)/bin IMG_TAG ?=latest IMG ?= kuberay/apiserver:$(IMG_TAG) +# Allow for additional test flags (-v, etc) +GO_TEST_FLAGS ?= +# Ray docker images to use for end to end tests +E2E_API_SERVER_RAY_IMAGE ?=rayproject/ray:2.7.0-py310 +# Kuberay API Server base URL to use in end to end tests +E2E_API_SERVER_URL ?=http://localhost:31888 + # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) ifeq (,$(shell go env GOBIN)) GOBIN=$(shell go env GOPATH)/bin @@ -56,11 +63,18 @@ imports: goimports ## Run goimports against code. $(GOIMPORTS) -l -w . test: fmt vet fumpt imports lint ## Run unit tests. - go test ./... -race -coverprofile ray-kube-api-server-coverage.out + go test ./pkg/... ./cmd/... $(GO_TEST_FLAGS) -race -coverprofile ray-kube-api-server-coverage.out -parallel 4 lint: golangci-lint fmt vet fumpt imports ## Run the linter. $(GOLANGCI_LINT) run --timeout=3m +.PHONY: e2e-test +e2e-test: ## Run end to end tests using a pre-exiting cluster. + go test ./test/e2e/... $(GO_TEST_FLAGS) -timeout 30m -race -parallel 4 -count=1 + +.PHONY: local-e2e-test +local-e2e-test: docker-image cluster load-image load-operator-image deploy-operator deploy e2e-test clean-cluster ## Run end to end tests, create a fresh kind cluster will all components deployed. + ##@ Build build: fmt vet fumpt imports lint ## Build api server binary. @@ -70,10 +84,10 @@ run: fmt vet fumpt imports lint ## Run the api server from your host. go run -race cmd/main.go -localSwaggerPath ${REPO_ROOT}/proto/swagger docker-image: test ## Build image with the api server. - ${ENGINE} build -t ${IMG} -f Dockerfile .. + $(ENGINE) build -t ${IMG} -f Dockerfile .. docker-push: ## Push image with the api server. - ${ENGINE} push ${IMG} + $(ENGINE) push ${IMG} .PHONY: build-swagger build-swagger: go-bindata @@ -170,7 +184,7 @@ clean-dev-tools: ## Remove all development tools ##@ Testing Setup and Tools KIND_CONFIG ?= hack/kind-cluster-config.yaml KIND_CLUSTER_NAME ?= ray-api-server-cluster -OPERATOR_IMAGE_TAG ?= latest +OPERATOR_IMAGE_TAG ?= nightly .PHONY: cluster cluster: kind ## Start kind development cluster. 
$(KIND) create cluster -n $(KIND_CLUSTER_NAME) --config $(KIND_CONFIG) @@ -200,4 +214,7 @@ undeploy-operator: ## Undeploy operator via helm from the K8s cluster specified .PHONY: load-operator-image load-operator-image: ## Load the operator image to the kind cluster created with create-kind-cluster. +ifeq (${OPERATOR_IMAGE_TAG}, nightly) + $(ENGINE) pull kuberay/operator:$(OPERATOR_IMAGE_TAG) +endif $(KIND) load docker-image kuberay/operator:$(OPERATOR_IMAGE_TAG) -n $(KIND_CLUSTER_NAME) diff --git a/apiserver/Volumes.md b/apiserver/Volumes.md index e4ed867e16d..4955aac1697 100644 --- a/apiserver/Volumes.md +++ b/apiserver/Volumes.md @@ -7,96 +7,81 @@ API server allows to specify multiple types of volumes mounted to the Ray pods ( [config maps](https://kubernetes.io/docs/concepts/storage/volumes/#configmap), [secrets](https://kubernetes.io/docs/concepts/storage/volumes/#secret), and [empty dir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir). -Multiple volumes of different type can be mounted to both head and worker nodes, by defining a volume array for them - +Multiple volumes of different type can be mounted to both head and worker nodes, by defining a volume array for them ## HostPath volumes -A hostPath volume mounts a file or directory from the host node's filesystem into your Pod. This is not something that -most Pods will need, but it offers a powerful escape hatch for some applications. +A hostPath volume mounts a file or directory from the host node's filesystem into your Pod. This is not something that most Pods will need, but it offers a powerful escape hatch for some applications. For example, some uses for a hostPath are: * running a container that needs access to Docker internals; use a hostPath of /var/lib/docker * running cAdvisor in a container; use a hostPath of /sys -* allowing a Pod to specify whether a given hostPath should exist prior to the Pod running, whether it should be -created, and what it should exist as +* allowing a Pod to specify whether a given hostPath should exist prior to the Pod running, whether it should be created, and what it should exist as The code below gives an example of hostPath volume definition: -```` +```json { - "name": "hostPath", # unique name - "source": "/tmp", # data location on host - "mountPath": "/tmp/hostPath", # mounting path - "volumeType": 1, # volume type - host path - "hostPathType": 0, # host path type - directory - "mountPropagationMode": 1 # mount propagation - host to container + "name": "hostPath", # unique name + "source": "/tmp", # data location on host + "mountPath": "/tmp/hostPath", # mounting path + "volumeType": 1, # volume type - host path + "hostPathType": 0, # host path type - directory + "mountPropagationMode": 1 # mount propagation - host to container } -```` +``` ## PVC volumes -A Persistent Volume Claim (PVC) is a request for storage by a user. It is similar to a Pod. Pods consume node resources -and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request -specific size and access modes (e.g., they can be mounted `ReadWriteOnce`, `ReadOnlyMany` or `ReadWriteMany`). +A Persistent Volume Claim (PVC) is a request for storage by a user. It is similar to a Pod. Pods consume node resources and PVCs consume PV resources. Pods can request specific levels of resources (CPU and Memory). Claims can request specific size and access modes (e.g., they can be mounted `ReadWriteOnce`, `ReadOnlyMany` or `ReadWriteMany`). 
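+
+For instance, assuming `kubectl` access to the target cluster, the access modes of an existing PVC can be inspected before it is referenced in a volume definition (the claim name and namespace below are placeholders):
+
+```bash
+kubectl get pvc my-claim -n my-namespace -o jsonpath='{.spec.accessModes}'
+```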
-The caveat of using PVC volumes is that the same PVC is mounted to all nodes. As a result only PVCs with access -mode `ReadOnlyMany` can be used in this case. +The caveat of using PVC volumes is that the same PVC is mounted to all nodes. As a result only PVCs with access mode `ReadOnlyMany` can be used in this case. The code below gives an example of PVC volume definition: -```` +```json { - "name": "pvc", # unique name - "mountPath": "/tmp/pvc", # mounting path - "volumeType": 0, # volume type - PVC - "mountPropagationMode": 2, # mount propagation mode - bidirectional - "readOnly": false # read only + "name": "pvc", # unique name + "mountPath": "/tmp/pvc", # mounting path + "volumeType": 0, # volume type - PVC + "mountPropagationMode": 2, # mount propagation mode - bidirectional + "readOnly": false # read only } -```` +``` ## Ephemeral volumes -Some application need additional storage but don't care whether that data is stored persistently across restarts. For -example, caching services are often limited by memory size and can move infrequently used data into storage that is -slower than memory with little impact on overall performance. Ephemeral volumes are designed for these use cases. -Because volumes follow the Pod's lifetime and get created and deleted along with the Pod, Pods can be stopped and -restarted without being limited to where some persistent volume is available. +Some application need additional storage but don't care whether that data is stored persistently across restarts. For example, caching services are often limited by memory size and can move infrequently used data into storage that is slower than memory with little impact on overall performance. Ephemeral volumes are designed for these use cases. Because volumes follow the Pod's lifetime and get created and deleted along with the Pod, Pods can be stopped and restarted without being limited to where some persistent volume is available. -Although there are several option of ephemeral volumes, here we are using generic ephemeral volumes, which can be -provided by all storage drivers that also support persistent volumes. Generic ephemeral volumes are similar to emptyDir -volumes in the sense that they provide a per-pod directory for scratch data that is usually empty after provisioning. -But they may also have additional features: +Although there are several option of ephemeral volumes, here we are using generic ephemeral volumes, which can be provided by all storage drivers that also support persistent volumes. Generic ephemeral volumes are similar to emptyDir volumes in the sense that they provide a per-pod directory for scratch data that is usually empty after provisioning. But they may also have additional features: * Storage can be local or network-attached. * Volumes can have a fixed size that Pods are not able to exceed. 
The code below gives an example of ephemeral volume definition: -```` +```json { - "name": "ephemeral", # unique name - "mountPath": "/tmp/ephemeral" # mounting path, - "mountPropagationMode": 0, # mount propagation mode - None - "volumeType": 2, # volume type - ephemeral - "storage": "5Gi", # disk size - "storageClass": "default" # storage class - optional - "accessMode": 0 # access mode RWO - optional + "name": "ephemeral", # unique name + "mountPath": "/tmp/ephemeral" # mounting path, + "mountPropagationMode": 0, # mount propagation mode - None + "volumeType": 2, # volume type - ephemeral + "storage": "5Gi", # disk size + "storageClass": "default", # storage class - optional + "accessMode": 0 # access mode RWO - optional } -```` +``` ## Config map volumes -A ConfigMap provides a way to inject configuration data into pods. The data stored in a ConfigMap can be referenced in -a volume of type configMap and then consumed by containerized applications running in a pod. +A ConfigMap provides a way to inject configuration data into pods. The data stored in a ConfigMap can be referenced in a volume of type configMap and then consumed by containerized applications running in a pod. -When referencing a ConfigMap, you provide the name of the ConfigMap in the volume. You can customize the path to use -for a specific entry in the ConfigMap. +When referencing a ConfigMap, you provide the name of the ConfigMap in the volume. You can customize the path to use for a specific entry in the ConfigMap. The code below gives an example of config map volume definition: -```` +```json { "name":"code-sample", # Unique name "mountPath":"/home/ray/samples", # mounting path @@ -106,17 +91,15 @@ The code below gives an example of config map volume definition: "sample_code.py":"sample_code.py" } } -```` +``` ## Secret volumes -A secret volume is used to pass sensitive information, such as passwords, to Pods. You can store secrets in the -Kubernetes API and mount them as files for use by pods without coupling to Kubernetes directly. Secret volumes are -backed by tmpfs (a RAM-backed filesystem) so they are never written to non-volatile storage. +A secret volume is used to pass sensitive information, such as passwords, to Pods. You can store secrets in the Kubernetes API and mount them as files for use by pods without coupling to Kubernetes directly. Secret volumes are backed by tmpfs (a RAM-backed filesystem) so they are never written to non-volatile storage. The code below gives an example of secret volume definition: -```` +```json { "name":"important-secret", # Unique name "mountPath":"/home/ray/sensitive", # mounting path @@ -126,22 +109,19 @@ The code below gives an example of secret volume definition: "subPath": "password" } } -```` +``` ## Emptydir volumes -An emptyDir volume is first created when a Pod is assigned to a node, and exists as long as that Pod is running on -that node. As the name says, the emptyDir volume is initially empty. All containers in the Pod can read and write the -same files in the emptyDir volume, though that volume can be mounted at the same or different paths in each container. -When a Pod is removed from a node for any reason, the data in the emptyDir is deleted permanently. +An emptyDir volume is first created when a Pod is assigned to a node, and exists as long as that Pod is running on that node. As the name says, the emptyDir volume is initially empty. 
All containers in the Pod can read and write the same files in the emptyDir volume, though that volume can be mounted at the same or different paths in each container. When a Pod is removed from a node for any reason, the data in the emptyDir is deleted permanently. -The code below gives an example of empydir volume definition: +The code below gives an example of empty directory volume definition: -```` +```json { - "name": "emptyDir", # unique name - "mountPath": "/tmp/emptydir" # mounting path, - "volumeType": 5, # vlume type - ephemeral - "storage": "5Gi", # max storage size - optional + "name": "emptyDir", # unique name + "mountPath": "/tmp/emptydir" # mounting path, + "volumeType": 5, # vlume type - ephemeral + "storage": "5Gi", # max storage size - optional } -```` +``` diff --git a/apiserver/go.mod b/apiserver/go.mod index a6841c01ff2..fd4e575225f 100644 --- a/apiserver/go.mod +++ b/apiserver/go.mod @@ -20,12 +20,15 @@ require ( ) require ( + github.com/dustinkirkland/golang-petname v0.0.0-20230626224747-e794b9370d49 github.com/elazarl/go-bindata-assetfs v1.0.1 github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 github.com/grpc-ecosystem/grpc-gateway/v2 v2.6.0 ) +require github.com/pmezard/go-difflib v1.0.0 // indirect + require ( github.com/asaskevich/govalidator v0.0.0-20200428143746-21a406dcc535 // indirect github.com/beorn7/perks v1.0.1 // indirect @@ -48,7 +51,6 @@ require ( github.com/mitchellh/mapstructure v1.4.1 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_model v0.2.0 // indirect github.com/prometheus/common v0.28.0 // indirect github.com/prometheus/procfs v0.6.0 // indirect diff --git a/apiserver/go.sum b/apiserver/go.sum index 63b60d84e5e..a26122c759d 100644 --- a/apiserver/go.sum +++ b/apiserver/go.sum @@ -85,6 +85,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/docker/go-units v0.3.3/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/dustinkirkland/golang-petname v0.0.0-20230626224747-e794b9370d49 h1:6SNWi8VxQeCSwmLuTbEvJd7xvPmdS//zvMBWweZLgck= +github.com/dustinkirkland/golang-petname v0.0.0-20230626224747-e794b9370d49/go.mod h1:V+Qd57rJe8gd4eiGzZyg4h54VLHmYVVw54iMnlAMrF8= github.com/elazarl/go-bindata-assetfs v1.0.1 h1:m0kkaHRKEu7tUIUFVwhGGGYClXvyl4RE03qmvRTNfbw= github.com/elazarl/go-bindata-assetfs v1.0.1/go.mod h1:v+YaWX3bdea5J/mo8dSETolEo7R71Vk1u8bnjau5yw4= github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153/go.mod h1:/Zj4wYkgs4iZTTu3o/KG3Itv/qCCa8VVMlb3i9OVuzc= @@ -432,6 +434,7 @@ github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod 
h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= @@ -440,6 +443,8 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= diff --git a/apiserver/hack/kind-cluster-config.yaml b/apiserver/hack/kind-cluster-config.yaml index e0c4aeaa160..0bb108a48f5 100644 --- a/apiserver/hack/kind-cluster-config.yaml +++ b/apiserver/hack/kind-cluster-config.yaml @@ -10,17 +10,6 @@ nodes: kubeletExtraArgs: node-labels: "ingress-ready=true" extraPortMappings: - - containerPort: 30265 - hostPort: 8265 - listenAddress: "0.0.0.0" - protocol: tcp - - containerPort: 30001 - hostPort: 10001 - listenAddress: "0.0.0.0" - protocol: tcp - - containerPort: 8000 - hostPort: 8000 - listenAddress: "0.0.0.0" - containerPort: 31888 hostPort: 31888 listenAddress: "0.0.0.0" @@ -31,3 +20,7 @@ nodes: image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb - role: worker image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb +- role: worker + image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb +- role: worker + image: kindest/node:v1.23.17@sha256:59c989ff8a517a93127d4a536e7014d28e235fb3529d9fba91b3951d461edfdb diff --git a/apiserver/pkg/server/cluster_server.go b/apiserver/pkg/server/cluster_server.go index 7506e72dd9a..74ec5ff15e5 100644 --- a/apiserver/pkg/server/cluster_server.go +++ b/apiserver/pkg/server/cluster_server.go @@ -153,23 +153,8 @@ func ValidateCreateClusterRequest(request *api.CreateClusterRequest) error { return util.NewInvalidInputError("User who create the cluster is empty. Please specify a valid value.") } - if len(request.Cluster.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.") - } - - for index, spec := range request.Cluster.ClusterSpec.WorkerGroupSpec { - if len(spec.GroupName) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) - } - if len(spec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) - } - if spec.MaxReplicas == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) - } - if spec.MinReplicas > spec.MaxReplicas { - return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. 
Please specify a valid value.", index) - } + if err := ValidateClusterSpec(request.Cluster.ClusterSpec); err != nil { + return err } return nil diff --git a/apiserver/pkg/server/job_server.go b/apiserver/pkg/server/job_server.go index 3b6ddb2ee0e..2944872ef30 100644 --- a/apiserver/pkg/server/job_server.go +++ b/apiserver/pkg/server/job_server.go @@ -128,23 +128,8 @@ func ValidateCreateJobRequest(request *api.CreateRayJobRequest) error { return nil } - if len(request.Job.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.") - } - - for index, spec := range request.Job.ClusterSpec.WorkerGroupSpec { - if len(spec.GroupName) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) - } - if len(spec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) - } - if spec.MaxReplicas == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) - } - if spec.MinReplicas > spec.MaxReplicas { - return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index) - } + if err := ValidateClusterSpec(request.Job.ClusterSpec); err != nil { + return err } return nil diff --git a/apiserver/pkg/server/serve_server.go b/apiserver/pkg/server/serve_server.go index c956dcf417e..8fad2c90279 100644 --- a/apiserver/pkg/server/serve_server.go +++ b/apiserver/pkg/server/serve_server.go @@ -2,6 +2,7 @@ package server import ( "context" + "strings" "github.com/ray-project/kuberay/apiserver/pkg/manager" "github.com/ray-project/kuberay/apiserver/pkg/model" @@ -155,6 +156,9 @@ func (s *RayServiceServer) DeleteRayService(ctx context.Context, request *api.De } func ValidateCreateServiceRequest(request *api.CreateRayServiceRequest) error { + if request == nil { + return util.NewInvalidInputError("A non nill request is expected") + } if request.Namespace == "" { return util.NewInvalidInputError("Namespace is empty. Please specify a valid value.") } @@ -175,25 +179,23 @@ func ValidateCreateServiceRequest(request *api.CreateRayServiceRequest) error { return util.NewInvalidInputError("User who create the Service is empty. Please specify a valid value.") } - if len(request.Service.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.") + if request.Service.ServeDeploymentGraphSpec == nil && strings.TrimSpace(request.Service.ServeConfig_V2) == "" { + return util.NewInvalidInputError("A serve config v2 or deployment graph specs is required. Please specify either.") } - for index, spec := range request.Service.ClusterSpec.WorkerGroupSpec { - if len(spec.GroupName) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) - } - if len(spec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) - } - if spec.MaxReplicas == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) - } - if spec.MinReplicas > spec.MaxReplicas { - return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. 
Please specify a valid value.", index) + if request.Service.ServeDeploymentGraphSpec != nil && strings.TrimSpace(request.Service.ServeConfig_V2) != "" { + return util.NewInvalidInputError("Both serve config v2 or deployment graph specs were specified. Please specify one or the other.") + } + if strings.TrimSpace(request.Service.ServeConfig_V2) == "" { + if err := ValidateServeDeploymentGraphSpec(request.Service.ServeDeploymentGraphSpec); err != nil { + return err } } + if err := ValidateClusterSpec(request.Service.ClusterSpec); err != nil { + return err + } + return nil } @@ -221,23 +223,8 @@ func ValidateUpdateServiceRequest(request *api.UpdateRayServiceRequest) error { return util.NewInvalidInputError("User who create the Service is empty. Please specify a valid value.") } - if len(request.Service.ClusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value.") - } - - for index, spec := range request.Service.ClusterSpec.WorkerGroupSpec { - if len(spec.GroupName) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) - } - if len(spec.ComputeTemplate) == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) - } - if spec.MaxReplicas == 0 { - return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) - } - if spec.MinReplicas > spec.MaxReplicas { - return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index) - } + if err := ValidateClusterSpec(request.Service.ClusterSpec); err != nil { + return err } return nil @@ -265,15 +252,8 @@ func ValidateUpdateRayServiceConfigsRequest(request *api.UpdateRayServiceConfigs } } if updateServiceBody.ServeDeploymentGraphSpec != nil { - for _, spec := range updateServiceBody.ServeDeploymentGraphSpec.ServeConfigs { - if spec.Replicas <= 0 { - return util.NewInvalidInputError("input invalid, replicas must be greater than 0.") - } - if spec.ActorOptions != nil { - if spec.ActorOptions.CpusPerActor <= 0 && spec.ActorOptions.GpusPerActor <= 0 && spec.ActorOptions.MemoryPerActor <= 0 { - return util.NewInvalidInputError("input invalid, cpusPerActor, gpusPerActor and memoryPerActor must be greater than 0.") - } - } + if err := ValidateServeDeploymentGraphSpec(updateServiceBody.ServeDeploymentGraphSpec); err != nil { + return err } } return nil diff --git a/apiserver/pkg/server/validations.go b/apiserver/pkg/server/validations.go new file mode 100644 index 00000000000..35ffe3fe5d8 --- /dev/null +++ b/apiserver/pkg/server/validations.go @@ -0,0 +1,65 @@ +package server + +import ( + "strings" + + "github.com/ray-project/kuberay/apiserver/pkg/util" + api "github.com/ray-project/kuberay/proto/go_client" +) + +// ValidateClusterSpec validates that the *api.ClusterSpec is not nil and +// has all the required fields +func ValidateClusterSpec(clusterSpec *api.ClusterSpec) error { + if clusterSpec == nil { + return util.NewInvalidInputError("A ClusterSpec object is required. Please specify one.") + } + if clusterSpec.HeadGroupSpec == nil { + return util.NewInvalidInputError("Cluster Spec Object requires HeadGroupSpec to be populated. Please specify one.") + } + if len(clusterSpec.HeadGroupSpec.ComputeTemplate) == 0 { + return util.NewInvalidInputError("HeadGroupSpec compute template is empty. 
Please specify a valid value.") + } + if len(clusterSpec.HeadGroupSpec.RayStartParams) == 0 { + return util.NewInvalidInputError("HeadGroupSpec RayStartParams is empty. Please specify values.") + } + + for index, spec := range clusterSpec.WorkerGroupSpec { + if len(spec.GroupName) == 0 { + return util.NewInvalidInputError("WorkerNodeSpec %d group name is empty. Please specify a valid value.", index) + } + if len(spec.ComputeTemplate) == 0 { + return util.NewInvalidInputError("WorkerNodeSpec %d compute template is empty. Please specify a valid value.", index) + } + if spec.MaxReplicas == 0 { + return util.NewInvalidInputError("WorkerNodeSpec %d MaxReplicas can not be 0. Please specify a valid value.", index) + } + if spec.MinReplicas > spec.MaxReplicas { + return util.NewInvalidInputError("WorkerNodeSpec %d MinReplica > MaxReplicas. Please specify a valid value.", index) + } + } + return nil +} + +// ValidateServeDeploymentGraphSpec validates that the ServeDeploymentGraphSpec has the CRD required fields +func ValidateServeDeploymentGraphSpec(deploymentGraphSpec *api.ServeDeploymentGraphSpec) error { + if deploymentGraphSpec == nil { + return util.NewInvalidInputError("ServeDeploymentGraphSpec must be not nil. Please specify a valid object.") + } + if strings.TrimSpace(deploymentGraphSpec.ImportPath) == "" { + return util.NewInvalidInputError("ServeDeploymentGraphSpec import path must have a value. Please specify valid value.") + } + for index, serveConfig := range deploymentGraphSpec.ServeConfigs { + if strings.TrimSpace(serveConfig.DeploymentName) == "" { + return util.NewInvalidInputError("ServeConfig %d deployment name is empty. Please specify a valid value.", index) + } + if serveConfig.Replicas <= 0 { + return util.NewInvalidInputError("ServeConfig %d replicas must be greater than 0. 
Please specify a valid value.", index) + } + if serveConfig.ActorOptions != nil { + if serveConfig.ActorOptions.CpusPerActor <= 0 && serveConfig.ActorOptions.GpusPerActor <= 0 && serveConfig.ActorOptions.MemoryPerActor <= 0 { + return util.NewInvalidInputError("ServeConfig %d invalid ActorOptions, cpusPerActor, gpusPerActor and memoryPerActor must be greater than 0.", index) + } + } + } + return nil +} diff --git a/apiserver/pkg/server/validations_test.go b/apiserver/pkg/server/validations_test.go new file mode 100644 index 00000000000..b7f36abf891 --- /dev/null +++ b/apiserver/pkg/server/validations_test.go @@ -0,0 +1,513 @@ +package server_test + +import ( + "testing" + + "github.com/ray-project/kuberay/apiserver/pkg/server" + "github.com/ray-project/kuberay/apiserver/pkg/util" + api "github.com/ray-project/kuberay/proto/go_client" + "github.com/stretchr/testify/require" +) + +func TestValidateClusterSpec(t *testing.T) { + tests := []struct { + name string + clusterSpec *api.ClusterSpec + expectedError error + }{ + { + name: "A valid cluster spec", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group-1", + ComputeTemplate: "group-1-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + { + GroupName: "group-2", + ComputeTemplate: "group-2-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + expectedError: nil, + }, + { + name: "A nill cluster spec", + clusterSpec: nil, + expectedError: util.NewInvalidInputError("A ClusterSpec object is required. Please specify one."), + }, + { + name: "An empty cluster spec", + clusterSpec: &api.ClusterSpec{}, + expectedError: util.NewInvalidInputError("Cluster Spec Object requires HeadGroupSpec to be populated. Please specify one."), + }, + { + name: "An empty head group cluster spec", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{}, + WorkerGroupSpec: []*api.WorkerGroupSpec{}, + }, + expectedError: util.NewInvalidInputError("HeadGroupSpec compute template is empty. Please specify a valid value."), + }, + { + name: "A head group without ray start parameters", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: nil, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{}, + }, + expectedError: util.NewInvalidInputError("HeadGroupSpec RayStartParams is empty. Please specify values."), + }, + { + name: "An empty worker group", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{}, + }, + expectedError: nil, + }, + { + name: "Two empty worker group specs", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + {}, + {}, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 group name is empty. 
Please specify a valid value."), + }, + { + name: "A worker group spec without a group name", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "", + ComputeTemplate: "group-1-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 group name is empty. Please specify a valid value."), + }, + { + name: "A worker group spec without a template", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 compute template is empty. Please specify a valid value."), + }, + { + name: "A worker group spec with 0 max replicas", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a template", + MaxReplicas: 0, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 MaxReplicas can not be 0. Please specify a valid value."), + }, + { + name: "A worker group spec with invalid min replicas", + clusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a template", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a template", + MinReplicas: 5, + MaxReplicas: 1, + }, + }, + }, + expectedError: util.NewInvalidInputError("WorkerNodeSpec 0 MinReplica > MaxReplicas. 
Please specify a valid value."), + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.name, func(t *testing.T) { + actualError := server.ValidateClusterSpec(tc.clusterSpec) + if tc.expectedError == nil { + require.NoError(t, actualError, "No error expected.") + } else { + require.EqualError(t, actualError, tc.expectedError.Error(), "A matching error is expected") + } + }) + } +} + +func TestValidateCreateServiceRequest(t *testing.T) { + tests := []struct { + name string + request *api.CreateRayServiceRequest + expectedError error + }{ + { + name: "A valid create service request V2", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeConfig_V2: "some yaml", + ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a compute template name", + EnableIngress: false, + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + }, + Namespace: "a-namespace", + }, + expectedError: nil, + }, + { + name: "A valid create service request V1", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + UserConfig: "price: 2", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.1, + }, + }, + }, + }, + ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a compute template name", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + }, + Namespace: "a-namespace", + }, + expectedError: nil, + }, + { + name: "A nil name create service request", + request: nil, + expectedError: util.NewInvalidInputError("A non nill request is expected"), + }, + { + name: "An empty create service request", + request: &api.CreateRayServiceRequest{}, + expectedError: util.NewInvalidInputError("Namespace is empty. 
Please specify a valid value."), + }, + { + name: "A create service request with a nill service spec", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: nil, + }, + expectedError: util.NewInvalidInputError("Service is empty, please input a valid payload."), + }, + { + name: "A create service request with mismatching namespaces", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "another-namespace", + }, + }, + expectedError: util.NewInvalidInputError("The namespace in the request is different from the namespace in the service definition."), + }, + { + name: "A create service request with no name", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "a-namespace", + }, + }, + expectedError: util.NewInvalidInputError("Service name is empty. Please specify a valid value."), + }, + { + name: "A create service request with no user name", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "a-namespace", + Name: "fruit-stand", + User: "", + }, + }, + expectedError: util.NewInvalidInputError("User who create the Service is empty. Please specify a valid value."), + }, + { + name: "A create service with no service graph or V2 config", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Namespace: "a-namespace", + Name: "fruit-stand", + User: "3cp0", + }, + }, + expectedError: util.NewInvalidInputError("A serve config v2 or deployment graph specs is required. Please specify either."), + }, + { + name: "A create service request with both V1 and graph spec", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeConfig_V2: "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + RuntimeEnv: "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + UserConfig: "price: 2", + ActorOptions: &api.ActorOptions{ + CpusPerActor: 0.1, + }, + }, + }, + }, + 
ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: &api.ClusterSpec{ + HeadGroupSpec: &api.HeadGroupSpec{ + ComputeTemplate: "a compute template name", + RayStartParams: map[string]string{ + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080", + }, + Volumes: []*api.Volume{}, + }, + WorkerGroupSpec: []*api.WorkerGroupSpec{ + { + GroupName: "group 1", + ComputeTemplate: "a-template", + Replicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + }, + }, + }, + }, + Namespace: "a-namespace", + }, + expectedError: util.NewInvalidInputError("Both serve config v2 or deployment graph specs were specified. Please specify one or the other."), + }, + { + name: "A create request with no cluster spec", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeConfig_V2: "some yaml", + ServiceUnhealthySecondThreshold: 900, + DeploymentUnhealthySecondThreshold: 300, + ClusterSpec: nil, + }, + Namespace: "a-namespace", + }, + expectedError: util.NewInvalidInputError("A ClusterSpec object is required. Please specify one."), + }, + { + name: "A create request with empty deployment graph spec", + request: &api.CreateRayServiceRequest{ + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{}, + ClusterSpec: nil, + }, + Namespace: "a-namespace", + }, + expectedError: util.NewInvalidInputError("ServeDeploymentGraphSpec import path must have a value. Please specify valid value."), + }, + { + name: "A create request with a invalid deployment graph spec empty serve config", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{ + {}, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("ServeConfig 0 deployment name is empty. Please specify a valid value."), + }, + { + name: "A create request with a invalid deployment graph spec no replicas serve config", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: -1, + }, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("ServeConfig 0 replicas must be greater than 0. 
Please specify a valid value."), + }, + { + name: "A create request with a invalid deployment graph spec with invalid actor options", + request: &api.CreateRayServiceRequest{ + Namespace: "a-namespace", + Service: &api.RayService{ + Name: "a-name", + Namespace: "a-namespace", + User: "a-user", + ServeDeploymentGraphSpec: &api.ServeDeploymentGraphSpec{ + ImportPath: "fruit.deployment_graph", + ServeConfigs: []*api.ServeConfig{ + { + DeploymentName: "OrangeStand", + Replicas: 1, + ActorOptions: &api.ActorOptions{ + CpusPerActor: -1, + GpusPerActor: -1, + MemoryPerActor: 0, + }, + }, + }, + }, + }, + }, + expectedError: util.NewInvalidInputError("ServeConfig 0 invalid ActorOptions, cpusPerActor, gpusPerActor and memoryPerActor must be greater than 0."), + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + t.Run(tc.name, func(t *testing.T) { + actualError := server.ValidateCreateServiceRequest(tc.request) + if tc.expectedError == nil { + require.NoError(t, actualError, "No error expected.") + } else { + require.EqualError(t, actualError, tc.expectedError.Error(), "A matching error is expected") + } + }) + } +} diff --git a/apiserver/test/e2e/cluster_server_e2e_test.go b/apiserver/test/e2e/cluster_server_e2e_test.go new file mode 100644 index 00000000000..7a864c4f793 --- /dev/null +++ b/apiserver/test/e2e/cluster_server_e2e_test.go @@ -0,0 +1,256 @@ +package e2e + +import ( + "net/http" + "testing" + "time" + + "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/util/wait" +) + +// TestClusterServerEndpoints sequentially iterates over the endpoints of the Clusters endpoints +func TestClusterServerEndpoints(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + + tests := []End2EndTest{ + { + Name: "Create a valid cluster", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/clusters", + HttpMethod: "POST", + InputJSON: `{ + "name": "{cluster-name}", + "namespace": "{namespace}", + "user": "3cpo", + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 1, + "maxReplicas": 1, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + } + }`, + ExpectedHttpCode: http.StatusOK, + ValidationFunc: waitForRunningCluster(), + }, + { + Name: "Get all clusters", + Endpoint: "/apis/v1alpha2/clusters", + HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Get all clusters in a namespace", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/clusters", + HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Get a specific cluster from a namespace", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/clusters/{cluster-name}", + HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Get a non existing cluster from a namespace", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/clusters/a-name", + 
HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusNotFound, + }, + { + Name: "Delete an existing cluster ", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/clusters/{cluster-name}", + HttpMethod: "DELETE", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + ValidationFunc: waitForDeletedCluster(), + }, + { + Name: "Delete an non existing cluster ", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/clusters/a-name", + HttpMethod: "DELETE", + InputJSON: "", + ExpectedHttpCode: http.StatusNotFound, + }, + } + + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + tc.RunTest(t, tCtx) + } +} + +func TestClusterServerWithVolumes(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + tests := []End2EndTest{ + { + Name: "Create cluster with config map volume", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/clusters", + HttpMethod: "POST", + InputJSON: ` + { + "name": "{cluster-name}", + "namespace": "{namespace}", + "user": "boris", + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [ + { + "name": "code-sample", + "mountPath": "/home/ray/samples", + "volumeType": "CONFIGMAP", + "source": "{config-map-name}", + "items": {"sample_code.py" : "sample_code.py"} + } + ] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + }, + "volumes": [ + { + "name": "code-sample", + "mountPath": "/home/ray/samples", + "volumeType": "CONFIGMAP", + "source": "{config-map-name}", + "items": {"sample_code.py" : "sample_code.py"} + } + ] + } + ] + } + }`, + ExpectedHttpCode: http.StatusOK, + SetupFunc: func(t *testing.T, tCtx *End2EndTestingContext) { + // create config map and register a cleanup hook upon success + tCtx.CreateConfigMap(t, map[string]string{ + "sample_code.py": ` +import ray +import os +import requests + +ray.init() + +@ray.remote +class Counter: + def __init__(self): + # Used to verify runtimeEnv + self.name = os.getenv("counter_name") + assert self.name == "test_counter" + self.counter = 0 + + def inc(self): + self.counter += 1 + + def get_counter(self): + return "{} got {}".format(self.name, self.counter) + +counter = Counter.remote() + +for _ in range(5): + ray.get(counter.inc.remote()) + print(ray.get(counter.get_counter.remote())) + +# Verify that the correct runtime env was used for the job. 
+assert requests.__version__ == "2.26.0" +`, + }) + }, + CleanupFunc: func(t *testing.T, tCtx *End2EndTestingContext) { + tCtx.DeleteRayCluster(t) + }, + ValidationFunc: waitForRunningCluster(), + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + tc.RunTest(t, tCtx) + tCtx.ResetNames() + } +} + +func waitForRunningCluster() func(t *testing.T, tCtx *End2EndTestingContext) { + return func(t *testing.T, tCtx *End2EndTestingContext) { + // wait for the cluster to be in a running state for 3 minutes + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayCluster, err00 := tCtx.GetRayCluster() + if err00 != nil { + return true, err00 + } + t.Logf("Found cluster state of '%s' for ray cluster '%s'", rayCluster.Status.State, tCtx.GetRayClusterName()) + return rayCluster.Status.State == v1alpha1.Ready, nil + }) + require.NoErrorf(t, err, "No error expected when getting ray cluster: '%s', err %v", tCtx.GetRayClusterName(), err) + } +} + +func waitForDeletedCluster() func(t *testing.T, tCtx *End2EndTestingContext) { + return func(t *testing.T, tCtx *End2EndTestingContext) { + // wait for the cluster to be deleted + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayCluster, err00 := tCtx.GetRayCluster() + if err00 != nil && + assert.EqualError(t, err00, "rayclusters.ray.io \""+tCtx.GetRayClusterName()+"\" not found") { + return true, nil + } + t.Logf("Found status of '%s' for ray cluster '%s'", rayCluster.Status.State, tCtx.GetRayClusterName()) + return false, err00 + }) + require.NoErrorf(t, err, "No error expected when deleting ray cluster: '%s', err %v", tCtx.GetRayClusterName(), err) + } +} diff --git a/apiserver/test/e2e/config_server_e2e_test.go b/apiserver/test/e2e/config_server_e2e_test.go new file mode 100644 index 00000000000..7f8ec725478 --- /dev/null +++ b/apiserver/test/e2e/config_server_e2e_test.go @@ -0,0 +1,136 @@ +package e2e + +import ( + "net/http" + "testing" + + "github.com/stretchr/testify/require" +) + +// TestConfigServerEndpoints sequentially iterates over the endpoints of the Config endpoints +func TestConfigServerEndpoints(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tests := []End2EndTest{ + { + Name: "Create a valid template", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates", + HttpMethod: "POST", + InputJSON: `{ + "name": "{compute-template-name}", + "namespace": "{namespace}", + "cpu": 2, + "memory": 4 + }`, + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Create an invalid template with no name", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates", + HttpMethod: "POST", + InputJSON: `{ + "name": "", + "namespace": "{namespace}", + "cpu": 2, + "memory": 4 + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create an invalid template with different namespace", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates", + HttpMethod: "POST", + InputJSON: `{ + "name": "{compute-template-name}", + "namespace": "another", + "cpu": 2, + "memory": 4 + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create an invalid template with zero cpu", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates", + HttpMethod: "POST", + InputJSON: `{ + "name": 
"{compute-template-name}", + "namespace": "{namespace}", + "cpu": 0, + "memory": 4 + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create an invalid template with zero memory", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates", + HttpMethod: "POST", + InputJSON: `{ + "name": "{compute-template-name}", + "namespace": "{namespace}", + "cpu": 2, + "memory": 0 + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create an duplicate template", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates", + HttpMethod: "POST", + InputJSON: `{ + "name": "{compute-template-name}", + "namespace": "{namespace}", + "cpu": 2, + "memory": 4 + }`, + ExpectedHttpCode: http.StatusConflict, + }, + { + Name: "Get all the templates", + Endpoint: "/apis/v1alpha2/compute_templates", + HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Get all the templates in the namespace", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates", + HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Get template by name in a namespace", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates/{compute-template-name}", + HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Get non existing template by name in a namespace", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates/some-template", + HttpMethod: "GET", + InputJSON: "", + ExpectedHttpCode: http.StatusNotFound, + }, + { + Name: "Delete existing template", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates/{compute-template-name}", + HttpMethod: "DELETE", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + }, + { + Name: "Delete non existing template", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/compute_templates/some-template", + HttpMethod: "DELETE", + InputJSON: "", + ExpectedHttpCode: http.StatusNotFound, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + tc.RunTest(t, tCtx) + } +} diff --git a/apiserver/test/e2e/doc.go b/apiserver/test/e2e/doc.go new file mode 100644 index 00000000000..e065bde842b --- /dev/null +++ b/apiserver/test/e2e/doc.go @@ -0,0 +1,15 @@ +// Package e2e provides test functions, utility function and structs that allow for integration testing +// of Kuberay API server and Kuberay operator. +// +// The code assumes that cluster found in [~/.kube/config] up and has the needed components (Kuberay API server +// Kuberay Operator) deployed and functional. 
+//
+// The code is organized as follows:
+//
+// - types.go -- provides the data types
+// - utils.go -- provides utility functions
+// - cluster_server_e2e_test.go -- provides the test function for the Cluster GRPC Server
+// - config_server_e2e_test.go -- provides the test function for the Config GRPC Server
+// - job_server_e2e_test.go -- provides the test function for the Job GRPC Server
+// - service_server_e2e_test.go -- provides the test function for the Service GRPC Server
+package e2e
diff --git a/apiserver/test/e2e/job_server_e2e_test.go b/apiserver/test/e2e/job_server_e2e_test.go
new file mode 100644
index 00000000000..58ec7b715bf
--- /dev/null
+++ b/apiserver/test/e2e/job_server_e2e_test.go
@@ -0,0 +1,269 @@
+package e2e
+
+import (
+	"net/http"
+	"testing"
+	"time"
+
+	"github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"k8s.io/apimachinery/pkg/util/wait"
+)
+
+// TestJobServerEndpoints sequentially iterates over the Job endpoints
+func TestJobServerEndpoints(t *testing.T) {
+	tCtx, err := NewEnd2EndTestingContext(t)
+	require.NoError(t, err, "No error expected when creating testing context")
+
+	tCtx.CreateComputeTemplate(t)
+	t.Cleanup(func() {
+		tCtx.DeleteComputeTemplate(t)
+	})
+
+	tests := []End2EndTest{
+		{
+			Name:       "Create a valid job",
+			Endpoint:   "/apis/v1alpha2/namespaces/{namespace}/jobs",
+			HttpMethod: "POST",
+			InputJSON: `
+			{
+				"name": "{job-name}",
+				"namespace": "{namespace}",
+				"user": "3cp0",
+				"entrypoint": "python -V",
+				"clusterSpec": {
+					"headGroupSpec": {
+						"computeTemplate": "{compute-template-name}",
+						"image": "{ray-image}",
+						"serviceType": "NodePort",
+						"rayStartParams": {
+							"dashboard-host": "0.0.0.0"
+						}
+					},
+					"workerGroupSpec": [
+						{
+							"groupName": "small-wg",
+							"computeTemplate": "{compute-template-name}",
+							"image": "{ray-image}",
+							"replicas": 1,
+							"minReplicas": 0,
+							"maxReplicas": 1,
+							"rayStartParams": {
+								"metrics-export-port": "8080"
+							}
+						}
+					]
+				}
+			}
+			`,
+			ExpectedHttpCode: http.StatusOK,
+			ValidationFunc:   waitForSucceededJob(),
+		},
+		{
+			Name:             "Get all the jobs",
+			Endpoint:         "/apis/v1alpha2/jobs",
+			HttpMethod:       "GET",
+			InputJSON:        "",
+			ExpectedHttpCode: http.StatusOK,
+		},
+		{
+			Name:             "Get all the jobs from a namespace",
+			Endpoint:         "/apis/v1alpha2/namespaces/{namespace}/jobs",
+			HttpMethod:       "GET",
+			InputJSON:        "",
+			ExpectedHttpCode: http.StatusOK,
+		},
+		{
+			Name:             "Get all the jobs from a namespace",
+			Endpoint:         "/apis/v1alpha2/namespaces/{namespace}/jobs",
+			HttpMethod:       "GET",
+			InputJSON:        "",
+			ExpectedHttpCode: http.StatusOK,
+		},
+		{
+			Name:             "Get a specific job from a specific endpoint",
+			Endpoint:         "/apis/v1alpha2/namespaces/{namespace}/jobs/{job-name}",
+			HttpMethod:       "GET",
+			InputJSON:        "",
+			ExpectedHttpCode: http.StatusOK,
+		},
+		{
+			Name:             "Get a non existing job",
+			Endpoint:         "/apis/v1alpha2/namespaces/{namespace}/jobs/a-job-name",
+			HttpMethod:       "GET",
+			InputJSON:        "",
+			ExpectedHttpCode: http.StatusNotFound,
+		},
+		{
+			Name:             "Delete an existing job",
+			Endpoint:         "/apis/v1alpha2/namespaces/{namespace}/jobs/{job-name}",
+			HttpMethod:       "DELETE",
+			InputJSON:        "",
+			ExpectedHttpCode: http.StatusOK,
+			ValidationFunc:   waitForDeletedJob(),
+		},
+		{
+			Name:             "Delete a non existing job",
+			Endpoint:         "/apis/v1alpha2/namespaces/{namespace}/jobs/a-job-name",
+			HttpMethod:       "DELETE",
+			InputJSON:        "",
+			ExpectedHttpCode: http.StatusNotFound,
+		},
+	}
+
+	// Execute tests sequentially
+	for _, tc := range tests {
+		tc := tc // capture range variable
+		tc.RunTest(t,
tCtx) + } +} + +func TestJobServerWithVolumes(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + + tests := []End2EndTest{ + { + Name: "Create sample job", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/jobs", + HttpMethod: "POST", + InputJSON: ` + { + "name": "{job-name}", + "namespace": "{namespace}", + "user": "boris", + "entrypoint": "python /home/ray/samples/sample_code.py", + "runtimeEnv": "pip:\n - requests==2.26.0\n - pendulum==2.1.2\nenv_vars:\n counter_name: test_counter\n", + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [ + { + "name": "code-sample", + "mountPath": "/home/ray/samples", + "volumeType": "CONFIGMAP", + "source": "{config-map-name}", + "items": {"sample_code.py" : "sample_code.py"} + } + ] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + }, + "volumes": [ + { + "name": "code-sample", + "mountPath": "/home/ray/samples", + "volumeType": "CONFIGMAP", + "source": "{config-map-name}", + "items": {"sample_code.py" : "sample_code.py"} + } + ] + } + ] + } + } + `, + ExpectedHttpCode: http.StatusOK, + SetupFunc: func(t *testing.T, tCtx *End2EndTestingContext) { + // create config map and register a cleanup hook upon success + tCtx.CreateConfigMap(t, map[string]string{ + // The indentation for this code is intentional. + // Changes in it will fail the job. + "sample_code.py": ` +import ray +import os +import requests + +ray.init() + +@ray.remote +class Counter: + def __init__(self): + # Used to verify runtimeEnv + self.name = os.getenv("counter_name") + assert self.name == "test_counter" + self.counter = 0 + + def inc(self): + self.counter += 1 + + def get_counter(self): + return "{} got {}".format(self.name, self.counter) + +counter = Counter.remote() + +for _ in range(5): + ray.get(counter.inc.remote()) + print(ray.get(counter.get_counter.remote())) + +# Verify that the correct runtime env was used for the job. 
+assert requests.__version__ == "2.26.0"`, + }) + }, + CleanupFunc: func(t *testing.T, tCtx *End2EndTestingContext) { + tCtx.DeleteRayJob(t) + }, + ValidationFunc: waitForSucceededJob(), + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + tc.RunTest(t, tCtx) + tCtx.ResetNames() + } +} + +func waitForSucceededJob() func(t *testing.T, tCtx *End2EndTestingContext) { + return func(t *testing.T, tCtx *End2EndTestingContext) { + // wait for the cluster to be in a running state for 3 minutes + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayJob, err00 := tCtx.GetRayJob() + if err00 != nil { + return true, err00 + } + t.Logf("Found job state of '%s' for ray cluster '%s'", rayJob.Status.JobStatus, tCtx.GetRayJobName()) + return rayJob.Status.JobStatus == v1alpha1.JobStatusSucceeded, nil + }) + require.NoErrorf(t, err, "No error expected when getting ray job: '%s', err %v", tCtx.GetRayJobName(), err) + } +} + +func waitForDeletedJob() func(t *testing.T, tCtx *End2EndTestingContext) { + return func(t *testing.T, tCtx *End2EndTestingContext) { + // wait for the job to be deleted + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayJob, err00 := tCtx.GetRayJob() + if err00 != nil && + assert.EqualError(t, err00, "rayjobs.ray.io \""+tCtx.GetRayJobName()+"\" not found") { + return true, nil + } + t.Logf("Found status of '%s' for ray cluster '%s'", rayJob.Status.JobStatus, tCtx.GetRayJobName()) + return false, err00 + }) + require.NoErrorf(t, err, "No error expected when deleting ray job: '%s', err %v", tCtx.GetRayJobName(), err) + } +} diff --git a/apiserver/test/e2e/service_server_e2e_test.go b/apiserver/test/e2e/service_server_e2e_test.go new file mode 100644 index 00000000000..059f9252d04 --- /dev/null +++ b/apiserver/test/e2e/service_server_e2e_test.go @@ -0,0 +1,478 @@ +package e2e + +import ( + "net/http" + "testing" + "time" + + "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "k8s.io/apimachinery/pkg/util/wait" +) + +// TestServiceServerV2 sequentially iterates over the endpoints of the service endpoints using +// V2 configurations (yaml) +func TestServiceServerV2(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + + tests := []End2EndTest{ + { + Name: "Create a fruit stand service V2", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + "serviceUnhealthySecondThreshold": 900, + "deploymentUnhealthySecondThreshold": 300, + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - 
name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + } + }`, + ExpectedHttpCode: http.StatusOK, + ValidationFunc: waitForRunningService(), + }, + { + Name: "Delete fruit stand service V2", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services/{service-name}", + HttpMethod: "DELETE", + InputJSON: "", + ExpectedHttpCode: http.StatusOK, + ValidationFunc: waitForDeletedService(), + }, + { + Name: "Delete non existing service", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services/{service-name}", + HttpMethod: "DELETE", + InputJSON: "", + ExpectedHttpCode: http.StatusNotFound, + }, + { + Name: "Create ray serve with mismatching namespaces", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "another-namespace", + "user": "user", + "serveConfigV2": "", + "clusterSpec": {} + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create ray serve with no name", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: ` + { + "name": "", + "namespace": "{namespace}", + "user": "user", + "serveConfigV2": "", + "clusterSpec": {} + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create ray serve with empty head group cluster spec", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: 
\"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "clusterSpec": { + "headGroupSpec": { + }, + "workerGroupSpec": [ + {} + ] + } + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create ray serve with no workgroup spec", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": [] + } + }`, + ExpectedHttpCode: http.StatusOK, + ValidationFunc: waitForRunningService(), + }, + { + Name: "Create ray serve with 2 work group specs cluster spec", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: 
\"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + }, + { + "groupName": "small-wg-2", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + } + } + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create ray serve service with head group spec with no compute template", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + "serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + } + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + { + Name: "Create ray serve service with worker group spec with no compute template", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + 
"serveConfigV2": "applications:\n - name: fruit_app\n import_path: fruit.deployment_graph\n route_prefix: /fruit\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: MangoStand\n num_replicas: 1\n user_config:\n price: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: OrangeStand\n num_replicas: 1\n user_config:\n price: 2\n ray_actor_options:\n num_cpus: 0.1\n - name: PearStand\n num_replicas: 1\n user_config:\n price: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: FruitMarket\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: DAGDriver\n num_replicas: 1\n ray_actor_options:\n num_cpus: 0.1\n - name: math_app\n import_path: conditional_dag.serve_dag\n route_prefix: /calc\n runtime_env:\n working_dir: \"https://github.com/ray-project/test_dag/archive/41d09119cbdf8450599f993f51318e9e27c59098.zip\"\n deployments:\n - name: Adder\n num_replicas: 1\n user_config:\n increment: 3\n ray_actor_options:\n num_cpus: 0.1\n - name: Multiplier\n num_replicas: 1\n user_config:\n factor: 5\n ray_actor_options:\n num_cpus: 0.1\n - name: Router\n num_replicas: 1\n - name: create_order\n num_replicas: 1\n - name: DAGDriver\n num_replicas: 1\n", + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + } + }`, + ExpectedHttpCode: http.StatusBadRequest, + }, + } + + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + tc.RunTest(t, tCtx) + } +} + +// TestServiceServerV1 sequentially iterates over create serve endpoints with +// V1 serve input +func TestServiceServerV1(t *testing.T) { + tCtx, err := NewEnd2EndTestingContext(t) + require.NoError(t, err, "No error expected when creating testing context") + + tCtx.CreateComputeTemplate(t) + t.Cleanup(func() { + tCtx.DeleteComputeTemplate(t) + }) + + tests := []End2EndTest{ + { + Name: "Create a fruit stand service V1", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + "serviceUnhealthySecondThreshold": 90, + "deploymentUnhealthySecondThreshold": 30, + "serveDeploymentGraphSpec": { + "importPath": "fruit.deployment_graph", + "runtimeEnv": "working_dir: \"https://github.com/ray-project/test_dag/archive/c620251044717ace0a4c19d766d43c5099af8a77.zip\"\n", + "serveConfigs": [ + { + "deploymentName": "OrangeStand", + "replicas": 1, + "userConfig": "price: 2", + "actorOptions": { + "cpusPerActor": 0.1 + } + }, + { + "deploymentName": "PearStand", + "replicas": 1, + "userConfig": "price: 1", + "actorOptions": { + "cpusPerActor": 0.1 + } + }, + { + "deploymentName": "FruitMarket", + "replicas": 1, + "actorOptions": { + "cpusPerActor": 0.1 + } + }, + { + "deploymentName": "DAGDriver", + "replicas": 1, + "routePrefix": "/", + "actorOptions": { + "cpusPerActor": 0.1 + } + }] + }, + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + 
"metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + } + }`, + ExpectedHttpCode: http.StatusOK, + ValidationFunc: waitForRunningService(), + CleanupFunc: func(t *testing.T, tCtx *End2EndTestingContext) { + tCtx.DeleteRayService(t) + }, + }, + { + Name: "Create a V1 serve service with empty deployment graph spec", + Endpoint: "/apis/v1alpha2/namespaces/{namespace}/services", + HttpMethod: "POST", + InputJSON: `{ + "name": "{service-name}", + "namespace": "{namespace}", + "user": "user", + "serviceUnhealthySecondThreshold": 60, + "deploymentUnhealthySecondThreshold": 30, + "serveDeploymentGraphSpec": { + "importPath": "fruit.deployment_graph" + }, + "clusterSpec": { + "headGroupSpec": { + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "serviceType": "NodePort", + "rayStartParams": { + "dashboard-host": "0.0.0.0", + "metrics-export-port": "8080" + }, + "volumes": [] + }, + "workerGroupSpec": + [ + { + "groupName": "small-wg", + "computeTemplate": "{compute-template-name}", + "image": "{ray-image}", + "replicas": 1, + "minReplicas": 0, + "maxReplicas": 5, + "rayStartParams": { + "node-ip-address": "$MY_POD_IP" + } + } + ] + } + }`, + ExpectedHttpCode: http.StatusOK, + ValidationFunc: waitForNotRunningService(), + CleanupFunc: func(t *testing.T, tCtx *End2EndTestingContext) { + tCtx.DeleteRayService(t) + }, + }, + } + // Execute tests sequentially + for _, tc := range tests { + tc := tc // capture range variable + tc.RunTest(t, tCtx) + tCtx.ResetNames() + } +} + +func waitForRunningService() func(t *testing.T, tCtx *End2EndTestingContext) { + return func(t *testing.T, tCtx *End2EndTestingContext) { + // wait for the service to be in a running state for 3 minutes + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayService, err00 := tCtx.GetRayService() + if err00 != nil { + return true, err00 + } + t.Logf("Found status of '%s' for ray service '%s'", rayService.Status.ServiceStatus, tCtx.GetRayServiceName()) + return rayService.Status.ServiceStatus == v1alpha1.Running, nil + }) + require.NoErrorf(t, err, "No error expected when getting ray service: '%s', err %v", tCtx.GetRayServiceName(), err) + } +} + +func waitForNotRunningService() func(t *testing.T, tCtx *End2EndTestingContext) { + return func(t *testing.T, tCtx *End2EndTestingContext) { + // wait for the service to be in a `WaitForServeDeploymentReady` state for 90 seconds + // the service will should be in WaitForServeDeploymentReady state forever, + // it expected that this function would timeout + err := wait.Poll(500*time.Millisecond, 90*time.Second, func() (done bool, err error) { + rayService, err00 := tCtx.GetRayService() + if err00 != nil { + return true, err00 + } + t.Logf("Found status of '%s' for ray service '%s'", rayService.Status.ServiceStatus, tCtx.GetRayServiceName()) + t.Logf("Found cluster named '%s' for ray service '%s'", rayService.Status.PendingServiceStatus.RayClusterName, tCtx.GetRayServiceName()) + return false, nil + }) + require.ErrorIsf(t, err, wait.ErrWaitTimeout, "A ErrWaitTimeout error expected when getting status for ray service: '%s', err %v", tCtx.GetRayServiceName(), err) + } +} + +func waitForDeletedService() func(t *testing.T, tCtx 
*End2EndTestingContext) { + return func(t *testing.T, tCtx *End2EndTestingContext) { + // wait for the service to be deleted + // if is not in that state, return an error + err := wait.Poll(500*time.Millisecond, 3*time.Minute, func() (done bool, err error) { + rayService, err00 := tCtx.GetRayService() + if err00 != nil && + assert.EqualError(t, err00, "rayservices.ray.io \""+tCtx.GetRayServiceName()+"\" not found") { + return true, nil + } + t.Logf("Found status of '%s' for ray service '%s'", rayService.Status.ServiceStatus, tCtx.GetRayServiceName()) + return false, err00 + }) + require.NoErrorf(t, err, "No error expected when deleting ray service: '%s', err %v", tCtx.GetRayServiceName(), err) + } +} diff --git a/apiserver/test/e2e/types.go b/apiserver/test/e2e/types.go new file mode 100644 index 00000000000..d75b8ca76a3 --- /dev/null +++ b/apiserver/test/e2e/types.go @@ -0,0 +1,364 @@ +package e2e + +import ( + "context" + "net/http" + "os" + "regexp" + "runtime" + "strings" + "testing" + "time" + + petnames "github.com/dustinkirkland/golang-petname" + "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" + rayv1alpha1 "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/typed/ray/v1alpha1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + v1 "k8s.io/api/core/v1" + k8sApiErrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/client/config" +) + +// End2EndTest struct defines the input values and the expected results +// for Kuberay API server e2e tests +type End2EndTest struct { + Name string + Endpoint string + HttpMethod string + InputJSON string + ExpectedHttpCode int + SetupFunc func(t *testing.T, tCtx *End2EndTestingContext) + CleanupFunc func(t *testing.T, tCtx *End2EndTestingContext) + ValidationFunc func(t *testing.T, tCtx *End2EndTestingContext) +} + +// RunTest runs the test +func (e2eT *End2EndTest) RunTest(t *testing.T, tCtx *End2EndTestingContext) { + t.Run(e2eT.Name, func(t *testing.T) { + if e2eT.SetupFunc != nil { + e2eT.SetupFunc(t, tCtx) + } + if e2eT.CleanupFunc != nil { + t.Cleanup(func() { + e2eT.CleanupFunc(t, tCtx) + }) + } + + request, err := CreateHttpRequest(e2eT.HttpMethod, tCtx.apiServerBaseURL, tCtx.ReplaceAll(e2eT.Endpoint), MakeBodyReader(tCtx.ReplaceAll(e2eT.InputJSON))) + require.NoError(t, err, "Not expecting an error when creating HTTP request.") + t.Logf("Request URL: %s", request.URL.String()) + t.Logf("Request Body:\n '%s'", e2eT.InputJSON) + + response, err := tCtx.apiServerHttpClient.Do(request) + assert.NoError(t, err, "Not expecting an error when executing HTTP request.") + require.NotNil(t, response, "A non-nill response is expected") + assert.Equalf(t, e2eT.ExpectedHttpCode, response.StatusCode, "A valid HTTP response code is expected") + + defer response.Body.Close() + body, err := PrettyPrintResponseBody(response.Body) + assert.NoError(t, err, "Not expecting an error when reading the body.") + t.Logf("Response body:\n '%s'", body) + + if e2eT.ValidationFunc != nil { + e2eT.ValidationFunc(t, tCtx) + } + }) +} + +// End2EndTestingContext provides a common set of values and methods that +// can be used in executing the tests +type End2EndTestingContext struct { + ctx context.Context + apiServerHttpClient *http.Client + rayClient rayv1alpha1.RayV1alpha1Interface + k8client *kubernetes.Clientset + namespaceRegex *regexp.Regexp + computeTemplateRegex *regexp.Regexp + rayImageRegex 
*regexp.Regexp + clusterNameRegex *regexp.Regexp + jobNameRegex *regexp.Regexp + serviceNameRegex *regexp.Regexp + configMapNameRegex *regexp.Regexp + apiServerBaseURL string + rayImage string + namespaceName string + computeTemplateName string + clusterName string + jobName string + serviceName string + configMapName string +} + +// contextOption is a functional option that allows for building out an instance +// of *End2EndTestingContext +type contextOption func(t *testing.T, tCtx *End2EndTestingContext) error + +// NewEnd2EndTestingContext constructs a *End2EndTestingContext +func NewEnd2EndTestingContext(t *testing.T) (*End2EndTestingContext, error) { + petnames.NonDeterministicMode() + // ordering is important as there dependencies between field values + return newEnd2EndTestingContext(t, + withNames(), + withRayImage(), + withHttpClient(), + withContext(), + withBaseURL(), + withK8sClient(), + withRayClient(), + withNamespace(), + ) +} + +func newEnd2EndTestingContext(t *testing.T, options ...contextOption) (*End2EndTestingContext, error) { + testingContext := &End2EndTestingContext{ + namespaceRegex: regexp.MustCompile("{namespace}"), + computeTemplateRegex: regexp.MustCompile("{compute-template-name}"), + rayImageRegex: regexp.MustCompile("{ray-image}"), + clusterNameRegex: regexp.MustCompile("{cluster-name}"), + jobNameRegex: regexp.MustCompile("{job-name}"), + serviceNameRegex: regexp.MustCompile("{service-name}"), + configMapNameRegex: regexp.MustCompile("{config-map-name}"), + } + for _, o := range options { + err := o(t, testingContext) + if err != nil { + return nil, err + } + } + return testingContext, nil +} + +func withHttpClient() contextOption { + return func(_ *testing.T, testingContext *End2EndTestingContext) error { + testingContext.apiServerHttpClient = &http.Client{Timeout: time.Duration(1) * time.Second} + return nil + } +} + +func withContext() contextOption { + return func(_ *testing.T, testingContext *End2EndTestingContext) error { + testingContext.ctx = context.Background() + return nil + } +} + +func withBaseURL() contextOption { + return func(_ *testing.T, testingContext *End2EndTestingContext) error { + baseURL := os.Getenv("E2E_API_SERVER_URL") + if strings.TrimSpace(baseURL) == "" { + baseURL = "http://localhost:31888" + } + testingContext.apiServerBaseURL = baseURL + return nil + } +} + +func withRayImage() contextOption { + return func(_ *testing.T, testingContext *End2EndTestingContext) error { + rayImage := os.Getenv("E2E_API_SERVER_RAY_IMAGE") + if strings.TrimSpace(rayImage) == "" { + rayImage = "rayproject/ray:2.7.0-py310" + } + // detect if we are running on arm64 machine, most likely apple silicon + // the os name is not checked as it also possible that it might be linux + // also check if the image does not have the `-aarch64` suffix + if runtime.GOARCH == "arm64" && !strings.HasSuffix(rayImage, "-aarch64") { + rayImage = rayImage + "-aarch64" + } + testingContext.rayImage = rayImage + return nil + } +} + +func withK8sClient() contextOption { + return func(t *testing.T, testingContext *End2EndTestingContext) error { + cfg, err := config.GetConfig() + require.NoError(t, err, "No error expected when getting k8s client configuration") + clientSet, err := kubernetes.NewForConfig(cfg) + require.NoError(t, err, "No error expected when creating k8s client") + testingContext.k8client = clientSet + return nil + } +} + +func withNames() contextOption { + return func(t *testing.T, tCtx *End2EndTestingContext) error { + tCtx.namespaceName = 
petnames.Generate(2, "-") + tCtx.computeTemplateName = petnames.Name() + tCtx.ResetNames() + return nil + } +} + +func withNamespace() contextOption { + return func(t *testing.T, tCtx *End2EndTestingContext) error { + require.NotNil(t, tCtx.k8client, "A k8s client must be created prior to creating a namespace") + require.NotNil(t, tCtx.ctx, "A context must exist prior to creating a namespace") + require.NotEmpty(t, tCtx.namespaceName, "Namespace name must be set prior to creating a namespace") + nsName := &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: tCtx.namespaceName, + }, + } + + _, err := tCtx.k8client.CoreV1().Namespaces().Create(tCtx.ctx, nsName, metav1.CreateOptions{}) + require.NoErrorf(t, err, "Expected to create a namespace '%s", nsName.ObjectMeta.Name) + + // register an automatic deletion of the namespace at test's end + t.Cleanup(func() { + err := tCtx.k8client.CoreV1().Namespaces().Delete(tCtx.ctx, tCtx.namespaceName, metav1.DeleteOptions{}) + assert.NoErrorf(t, err, "No error expected when deleting namespace '%s'", tCtx.namespaceName) + }) + return nil + } +} + +func withRayClient() contextOption { + return func(t *testing.T, tCtx *End2EndTestingContext) error { + cfg, err := config.GetConfig() + require.NoError(t, err, "No error expected when getting k8s client configuration") + tCtx.rayClient, err = rayv1alpha1.NewForConfig(cfg) + require.NoError(t, err, "No error expected when getting a ray") + return nil + } +} + +// ReplaceAll function replaces all the supported markers in the provided string +// +// The supported markers are: +// - {namespace} -- replaced by e2etc.namespaceName value +// - {compute-template-name} -- replaced by the e2etc.computeTemplateName value +// - {ray-image} -- replaced by the e2etc.rayImage value +// - {cluster-name} -- replaced by the e2etc.clusterName value +// - {job-name} -- replaced by the e2etc.jobName value +// - {service-name} -- replaced by the e2etc.serviceName value +// - {config-map-name} -- replaced by e2etc.configMapName value +func (e2etc *End2EndTestingContext) ReplaceAll(inputPattern string) string { + returnValue := e2etc.namespaceRegex.ReplaceAllString(inputPattern, e2etc.namespaceName) + returnValue = e2etc.computeTemplateRegex.ReplaceAllString(returnValue, e2etc.computeTemplateName) + returnValue = e2etc.rayImageRegex.ReplaceAllString(returnValue, e2etc.rayImage) + returnValue = e2etc.clusterNameRegex.ReplaceAllString(returnValue, e2etc.clusterName) + returnValue = e2etc.jobNameRegex.ReplaceAllString(returnValue, e2etc.jobName) + returnValue = e2etc.serviceNameRegex.ReplaceAllString(returnValue, e2etc.serviceName) + returnValue = e2etc.configMapNameRegex.ReplaceAllString(returnValue, e2etc.configMapName) + return returnValue +} + +func (e2etc *End2EndTestingContext) GetRayService() (*v1alpha1.RayService, error) { + return e2etc.rayClient.RayServices(e2etc.namespaceName).Get(e2etc.ctx, e2etc.serviceName, metav1.GetOptions{}) +} + +func (e2etc *End2EndTestingContext) GetRayServiceName() string { + return e2etc.serviceName +} + +func (e2etc *End2EndTestingContext) GetRayCluster() (*v1alpha1.RayCluster, error) { + return e2etc.rayClient.RayClusters(e2etc.namespaceName).Get(e2etc.ctx, e2etc.clusterName, metav1.GetOptions{}) +} + +func (e2etc *End2EndTestingContext) GetRayClusterName() string { + return e2etc.clusterName +} + +func (e2etc *End2EndTestingContext) GetRayJob() (*v1alpha1.RayJob, error) { + return e2etc.rayClient.RayJobs(e2etc.namespaceName).Get(e2etc.ctx, e2etc.jobName, metav1.GetOptions{}) +} + +func 
(e2etc *End2EndTestingContext) GetRayJobName() string { + return e2etc.jobName +} + +func (e2etc *End2EndTestingContext) GetConfigMapName() string { + return e2etc.configMapName +} + +func (e2etc *End2EndTestingContext) ResetNames() { + e2etc.clusterName = petnames.Name() + e2etc.jobName = petnames.Name() + e2etc.serviceName = petnames.Generate(2, "-") + e2etc.configMapName = petnames.Generate(2, "-") +} + +func (e2etc *End2EndTestingContext) CreateComputeTemplate(t *testing.T) { + createURL := e2etc.apiServerBaseURL + "/apis/v1alpha2/namespaces/" + e2etc.namespaceName + "/compute_templates" + + request, err := http.NewRequest("POST", createURL, strings.NewReader( + `{ + "name": "`+e2etc.computeTemplateName+`", + "namespace": "`+e2etc.namespaceName+`", + "cpu": 2, + "memory": 4 + }`)) + require.NoError(t, err, "No error expected when creating http request for compute template creation") + request.Header.Add("Accept", "application/json") + request.Header.Add("Content-Type", "application/json") + + response, err := e2etc.apiServerHttpClient.Do(request) + require.NoError(t, err, "No error expected when executing http request for compute template creation") + + defer response.Body.Close() + if response.StatusCode != http.StatusOK { + assert.FailNowf(t, "Failed to create compute template", "HTTP code: %d", response.StatusCode) + } +} + +func (e2etc *End2EndTestingContext) DeleteComputeTemplate(t *testing.T) { + deleteURL := e2etc.apiServerBaseURL + "/apis/v1alpha2/namespaces/" + e2etc.namespaceName + "/compute_templates/" + e2etc.computeTemplateName + + request, err := http.NewRequest("DELETE", deleteURL, nil) + require.NoError(t, err, "No error expected when creating a delete compute template request") + request.Header.Add("Accept", "application/json") + + response, err := e2etc.apiServerHttpClient.Do(request) + require.NoError(t, err, "No error expected when creating http request for compute template delete") + defer response.Body.Close() + + if response.StatusCode != http.StatusOK { + assert.FailNowf(t, "Failed to delete compute template", "HTTP code: %d", response.StatusCode) + } +} + +func (e2etc *End2EndTestingContext) DeleteRayService(t *testing.T) { + err := e2etc.rayClient.RayServices(e2etc.namespaceName).Delete(e2etc.ctx, e2etc.serviceName, metav1.DeleteOptions{}) + if err != nil { + assert.Truef(t, k8sApiErrors.IsNotFound(err), "Only IsNotFoundException allowed, received %v", err) + } +} + +func (e2etc *End2EndTestingContext) DeleteRayCluster(t *testing.T) { + err := e2etc.rayClient.RayClusters(e2etc.namespaceName).Delete(e2etc.ctx, e2etc.clusterName, metav1.DeleteOptions{}) + if err != nil { + assert.Truef(t, k8sApiErrors.IsNotFound(err), "Only IsNotFoundException allowed, received %v", err) + } +} + +func (e2etc *End2EndTestingContext) DeleteRayJob(t *testing.T) { + err := e2etc.rayClient.RayJobs(e2etc.namespaceName).Delete(e2etc.ctx, e2etc.jobName, metav1.DeleteOptions{}) + if err != nil { + assert.Truef(t, k8sApiErrors.IsNotFound(err), "Only IsNotFoundException allowed, received %v", err) + } +} + +func (e2etc *End2EndTestingContext) CreateConfigMap(t *testing.T, values map[string]string) { + cm := &v1.ConfigMap{ + TypeMeta: metav1.TypeMeta{Kind: "ConfigMap", APIVersion: "v1"}, + ObjectMeta: metav1.ObjectMeta{Name: e2etc.configMapName, Namespace: e2etc.namespaceName}, + Immutable: new(bool), + Data: values, + } + _, err := e2etc.k8client.CoreV1().ConfigMaps(e2etc.namespaceName).Create(e2etc.ctx, cm, metav1.CreateOptions{}) + require.NoErrorf(t, err, "No error expeted when 
creating configmap '%s' in namespace '%s'", e2etc.configMapName, e2etc.namespaceName)
+	t.Cleanup(func() {
+		e2etc.DeleteConfigMap(t)
+	})
+}
+
+func (e2etc *End2EndTestingContext) DeleteConfigMap(t *testing.T) {
+	err := e2etc.k8client.CoreV1().ConfigMaps(e2etc.namespaceName).Delete(e2etc.ctx, e2etc.configMapName, metav1.DeleteOptions{})
+	if err != nil {
+		assert.Truef(t, k8sApiErrors.IsNotFound(err), "Only IsNotFoundException allowed, received %v", err)
+	}
+}
diff --git a/apiserver/test/e2e/utils.go b/apiserver/test/e2e/utils.go
new file mode 100644
index 00000000000..5b7c35e679e
--- /dev/null
+++ b/apiserver/test/e2e/utils.go
@@ -0,0 +1,44 @@
+package e2e
+
+import (
+	"bytes"
+	"encoding/json"
+	"io"
+	"net/http"
+	"strings"
+)
+
+// CreateHttpRequest instantiates an HTTP request for the specified endpoint and host
+func CreateHttpRequest(method string, host string, endPoint string, body io.Reader) (*http.Request, error) {
+	url := host + endPoint
+	req, err := http.NewRequest(method, url, body)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Add("Accept", "application/json")
+	req.Header.Add("Content-Type", "application/json")
+	return req, nil
+}
+
+// MakeBodyReader creates an io.Reader from the supplied string if it is not empty after
+// trimming the spaces
+func MakeBodyReader(s string) io.Reader {
+	if strings.TrimSpace(s) != "" {
+		return strings.NewReader(s)
+	}
+	return nil
+}
+
+// PrettyPrintResponseBody generates a "pretty" formatted JSON string from the body
+func PrettyPrintResponseBody(body io.ReadCloser) (string, error) {
+	inputBytez, err := io.ReadAll(body)
+	if err != nil {
+		return "", err
+	}
+	var prettyJSON bytes.Buffer
+	err = json.Indent(&prettyJSON, inputBytez, "", "\t")
+	if err != nil {
+		return "", err
+	}
+	return prettyJSON.String(), nil
+}
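
For readers who want to extend the suite, the short sketch below (not part of the patch) illustrates how the End2EndTest and End2EndTestingContext types introduced above are intended to be combined when adding a new case. The test name and endpoint used here are illustrative placeholders only; the types, the {namespace} marker and RunTest come from this patch, and the sketch still requires a running cluster with the API server deployed, just like the tests above.

package e2e

import (
	"net/http"
	"testing"

	"github.com/stretchr/testify/require"
)

// TestListClustersSketch shows the declarative, table-driven pattern used throughout this patch:
// build a testing context, describe the request, then run it.
func TestListClustersSketch(t *testing.T) {
	tCtx, err := NewEnd2EndTestingContext(t)
	require.NoError(t, err, "No error expected when creating testing context")

	test := End2EndTest{
		// Name and Endpoint below are illustrative placeholders.
		Name:             "List clusters in the test namespace",
		Endpoint:         "/apis/v1alpha2/namespaces/{namespace}/clusters",
		HttpMethod:       "GET",
		InputJSON:        "",
		ExpectedHttpCode: http.StatusOK,
	}
	// RunTest substitutes the {namespace} marker via ReplaceAll, sends the request to the
	// configured API server base URL (E2E_API_SERVER_URL) and asserts the expected HTTP code.
	test.RunTest(t, tCtx)
}

Keeping each case declarative like this is what lets the *_e2e_test.go files above express long endpoint walk-throughs as plain slices of End2EndTest values.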