diff --git a/applications/ray/TPU_guide.md b/applications/ray/TPU_guide.md index 9e71934e5..62f92451e 100644 --- a/applications/ray/TPU_guide.md +++ b/applications/ray/TPU_guide.md @@ -38,7 +38,7 @@ accelerator_type = "nvidia-tesla-t4" ### Installing the TPU Initialization Webhook -The TPU Initialization Webhook automatically injects the `TPU_WORKER_ID`, `TPU_NAME`, and `TPU_WORKER_HOSTNAMES` environment variables necessary for multi-host TPU clusters. The webhook needs to be installed once per GKE cluster. The instructions can be found [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/ray/kuberay-tpu-webhook). +The TPU Initialization Webhook automatically injects the `TPU_WORKER_ID`, `TPU_NAME`, and `TPU_WORKER_HOSTNAMES` environment variables necessary for multi-host TPU clusters. The webhook needs to be installed once per GKE cluster and requires a Kuberay Operator running v1.1 and GKE cluster version of 1.28+. The instructions can be found [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/ray/kuberay-tpu-webhook). ### Creating the Kuberay Cluster @@ -54,7 +54,7 @@ The TPU Initialization Webhook automatically injects the `TPU_WORKER_ID`, `TPU_N This should deploy a Kuberay cluster with a single TPU worker node (v4 TPU with `2x2x1` topology). -To deploy a multi-host Ray Cluster, modify the `worker` spec [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/modules/kuberay-cluster/kuberay-tpu-values.yaml) by changing the `cloud.google.com/gke-tpu-topology` `nodeSelector` to a multi-host topology. Set the `replicas` field in the `worker` spec to the number of hosts specified by your chosen topology. For v4 TPUs, each TPU VM has access to 4 TPU chips. Therefore, you can calculate the number of TPU VM hosts by taking the product of the topology and dividing by 4 (i.e. a 2x2x4 TPU podslice will have 4 TPU VM hosts). +To deploy a multi-host Ray Cluster, modify the `worker` spec [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/modules/kuberay-cluster/kuberay-tpu-values.yaml) by changing the `cloud.google.com/gke-tpu-topology` `nodeSelector` to a multi-host topology. Set the `numOfHosts` field in the `worker` spec to the number of hosts specified by your chosen topology. For v4 TPUs, each TPU VM has access to 4 TPU chips. Therefore, you can calculate the number of TPU VM hosts by taking the product of the topology and dividing by 4 (i.e. a 2x2x4 TPU podslice will have 4 TPU VM hosts). ### Running Sample Workloads diff --git a/applications/ray/kuberay-tpu-webhook/Makefile b/applications/ray/kuberay-tpu-webhook/Makefile index 04b4e738b..cbdc869b2 100644 --- a/applications/ray/kuberay-tpu-webhook/Makefile +++ b/applications/ray/kuberay-tpu-webhook/Makefile @@ -1,5 +1,5 @@ # Image URL to use all building/pushing image targets -IMG ?= us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/kuberay-tpu-webhook:v1.0 +IMG ?= us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/kuberay-tpu-webhook:v1.1 # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) ifeq (,$(shell go env GOBIN)) diff --git a/applications/ray/kuberay-tpu-webhook/README.md b/applications/ray/kuberay-tpu-webhook/README.md index 8eb97e205..965fc506d 100644 --- a/applications/ray/kuberay-tpu-webhook/README.md +++ b/applications/ray/kuberay-tpu-webhook/README.md @@ -11,6 +11,11 @@ Preinstall on your computer: - Helm - Gcloud +When installing using terraform ensure that: +- GKE cluster is created with version 1.28+ +- Kuberay Operator version is set to v1.1+ + - can edit operator version in ai-on-gke/modules/kuberay-operator/kuberay.tf before running `terraform apply` + ### Installing the GKE Platform 1. If needed, git clone https://github.com/GoogleCloudPlatform/ai-on-gke @@ -52,4 +57,4 @@ After deploying the webhook, follow the steps in ray/TPU_GUIDE to setup Ray on G ### Limitations -The webhook stores unique `TPU_WORKER_ID`s in memory, and will fail to initialize the environment variables correctly if the webhook pod dies or restarts before intercepting all pods. Finally, environment vars are not updated or removed after the initial admission request. \ No newline at end of file +The webhook stores unique `TPU_WORKER_HOSTNAMES` and `TPU_WORKER_ID`s for each slice in memory, and will fail to initialize the environment variables correctly if the webhook pod dies or restarts before intercepting all pods. \ No newline at end of file diff --git a/applications/ray/kuberay-tpu-webhook/bin/kuberay-tpu-webhook b/applications/ray/kuberay-tpu-webhook/bin/kuberay-tpu-webhook new file mode 100755 index 000000000..2fe6ec469 Binary files /dev/null and b/applications/ray/kuberay-tpu-webhook/bin/kuberay-tpu-webhook differ diff --git a/applications/ray/kuberay-tpu-webhook/deployments/deployment.yaml b/applications/ray/kuberay-tpu-webhook/deployments/deployment.yaml index fcd82e48d..f3665b8cb 100644 --- a/applications/ray/kuberay-tpu-webhook/deployments/deployment.yaml +++ b/applications/ray/kuberay-tpu-webhook/deployments/deployment.yaml @@ -16,7 +16,7 @@ spec: app: kuberay-tpu-webhook spec: containers: - - image: us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/kuberay-tpu-webhook:v1.0 + - image: us-docker.pkg.dev/ai-on-gke/kuberay-tpu-webhook/kuberay-tpu-webhook:v1.1 imagePullPolicy: Always name: kuberay-tpu-webhook ports: diff --git a/applications/ray/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml b/applications/ray/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml index 10e999662..2e78a103f 100644 --- a/applications/ray/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml +++ b/applications/ray/kuberay-tpu-webhook/deployments/mutating-webhook-cfg.yaml @@ -14,10 +14,6 @@ webhooks: namespace: default path: /mutate rules: - - operations: ["CREATE"] - apiGroups: ["ray.io"] - apiVersions: ["*"] - resources: ["rayclusters"] - operations: ["CREATE"] apiGroups: [""] apiVersions: ["v1"] diff --git a/applications/ray/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml b/applications/ray/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml index a29fef59d..29d0cfc53 100644 --- a/applications/ray/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml +++ b/applications/ray/kuberay-tpu-webhook/deployments/validating-webhook-cfg.yaml @@ -14,6 +14,10 @@ webhooks: namespace: default path: /validate rules: + - operations: ["DELETE"] + apiGroups: [""] + apiVersions: ["v1"] + resources: ["pods"] - operations: ["CREATE"] apiGroups: ["ray.io"] apiVersions: ["*"] diff --git a/applications/ray/kuberay-tpu-webhook/go.mod b/applications/ray/kuberay-tpu-webhook/go.mod index 640cf237b..d07c7c8f2 100644 --- a/applications/ray/kuberay-tpu-webhook/go.mod +++ b/applications/ray/kuberay-tpu-webhook/go.mod @@ -3,32 +3,67 @@ module github.com/GoogleCloudPlatform/kuberay-tpu-webhook go 1.21 require ( - github.com/ray-project/kuberay/ray-operator v1.0.0 - k8s.io/api v0.28.3 - k8s.io/klog/v2 v2.100.1 + github.com/ray-project/kuberay/ray-operator v1.1.0-rc.0 + k8s.io/api v0.29.1 + k8s.io/apimachinery v0.29.1 + k8s.io/klog/v2 v2.120.1 ) require ( - github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/fsnotify/fsnotify v1.6.0 // indirect - github.com/go-logr/logr v1.2.4 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/evanphx/json-patch/v5 v5.8.0 // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/go-logr/logr v1.4.1 // indirect + github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.22.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/golang/protobuf v1.5.3 // indirect + github.com/google/gnostic-models v0.6.8 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/google/gofuzz v1.2.0 // indirect + github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect + github.com/google/uuid v1.3.1 // indirect + github.com/imdario/mergo v0.3.12 // indirect + github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/onsi/gomega v1.27.6 // indirect - github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/stretchr/testify v1.8.4 // indirect - golang.org/x/net v0.17.0 // indirect - golang.org/x/sys v0.15.0 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_golang v1.18.0 // indirect + github.com/prometheus/client_model v0.5.0 // indirect + github.com/prometheus/common v0.45.0 // indirect + github.com/prometheus/procfs v0.12.0 // indirect + github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/spf13/pflag v1.0.5 // indirect + golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect + golang.org/x/net v0.20.0 // indirect + golang.org/x/oauth2 v0.12.0 // indirect + golang.org/x/sys v0.16.0 // indirect + golang.org/x/term v0.16.0 // indirect golang.org/x/text v0.14.0 // indirect + golang.org/x/time v0.3.0 // indirect + golang.org/x/tools v0.17.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.32.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - k8s.io/apimachinery v0.28.3 // indirect - k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 // indirect - sigs.k8s.io/controller-runtime v0.11.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apiextensions-apiserver v0.29.0 // indirect + k8s.io/client-go v0.29.0 // indirect + k8s.io/component-base v0.29.0 // indirect + k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect + k8s.io/utils v0.0.0-20240102154912-e7106e64919e // indirect + sigs.k8s.io/controller-runtime v0.17.0 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect + sigs.k8s.io/yaml v1.4.0 // indirect ) diff --git a/applications/ray/kuberay-tpu-webhook/go.sum b/applications/ray/kuberay-tpu-webhook/go.sum index bf999e78d..6966dc71f 100644 --- a/applications/ray/kuberay-tpu-webhook/go.sum +++ b/applications/ray/kuberay-tpu-webhook/go.sum @@ -1,110 +1,206 @@ +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= -github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= -github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw= -github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= -github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= +github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= +github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= +github.com/evanphx/json-patch/v5 v5.8.0 h1:lRj6N9Nci7MvzrXuX6HFzU8XjmhPiXPlsKEy1u0KQro= +github.com/evanphx/json-patch/v5 v5.8.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= +github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20211214055906-6f57359322fd h1:1FjCyPC+syAzJ5/2S8fqdZK1R22vvA0J7JZKcuOIQ7Y= +github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg= +github.com/google/uuid v1.3.1 h1:KjJaJ9iWZ3jOFZIf1Lqf4laDRCasjl0BCmnEGxkdLb4= +github.com/google/uuid v1.3.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w= +github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU= +github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= -github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= -github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= -github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= -github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE= -github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY= +github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw= +github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8= +github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= -github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/ray-project/kuberay/ray-operator v1.0.0 h1:i69nvbV7az2FG41VHQgxrmhD+SUl8ca+ek4RPbSE2Q0= -github.com/ray-project/kuberay/ray-operator v1.0.0/go.mod h1:7C7ebIkxtkmOX8w1iiLrKM1j4hkZs/Guzm3WdePk/yg= -github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= -github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= +github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= +github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= +github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= +github.com/prometheus/common v0.45.0 h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM= +github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY= +github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= +github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= +github.com/ray-project/kuberay/ray-operator v1.1.0-rc.0 h1:QZIFpSxxoTFyC64Z6NK+TUCbQqs6PKLj2lCKHsZpLOc= +github.com/ray-project/kuberay/ray-operator v1.1.0-rc.0/go.mod h1:ZqyKKvMP5nKDldQoKmur+Wcx7wVlV9Q98phFqHzr+KY= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= +go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e h1:+WEEuIdZHnUeJJmEUjyYC2gfUMj69yZXw17EnHg/otA= +golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e/go.mod h1:Kr81I6Kryrl9sr8s2FK3vxD90NdsKWRuOIl2O4CvYbA= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= -golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/oauth2 v0.12.0 h1:smVPGxink+n1ZI5pkQa8y6fZT0RW0MgCO5bFpepy4B4= +golang.org/x/oauth2 v0.12.0/go.mod h1:A74bZ3aGXgCY0qaIC9Ahg6Lglin4AMAco8cIv9baba4= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= +golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.17.0 h1:FvmRgNOcs3kOa+T20R1uhfP9F6HgG2mfxDv1vrx1Htc= +golang.org/x/tools v0.17.0/go.mod h1:xsh6VxdV005rRVaS6SSAf9oiAqljS7UZUacMZ8Bnsps= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.32.0 h1:pPC6BG5ex8PDFnkbrGU3EixyhKcQ2aDuBS36lqK/C7I= +google.golang.org/protobuf v1.32.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= -gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.28.3 h1:Gj1HtbSdB4P08C8rs9AR94MfSGpRhJgsS+GF9V26xMM= -k8s.io/api v0.28.3/go.mod h1:MRCV/jr1dW87/qJnZ57U5Pak65LGmQVkKTzf3AtKFHc= -k8s.io/apimachinery v0.28.3 h1:B1wYx8txOaCQG0HmYF6nbpU8dg6HvA06x5tEffvOe7A= -k8s.io/apimachinery v0.28.3/go.mod h1:uQTKmIqs+rAYaq+DFaoD2X7pcjLOqbQX2AOiO0nIpb8= -k8s.io/klog/v2 v2.100.1 h1:7WCHKK6K8fNhTqfBhISHQ97KrnJNFZMcQvKp7gP/tmg= -k8s.io/klog/v2 v2.100.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= -k8s.io/utils v0.0.0-20230406110748-d93618cff8a2 h1:qY1Ad8PODbnymg2pRbkyMT/ylpTrCM8P2RJ0yroCyIk= -k8s.io/utils v0.0.0-20230406110748-d93618cff8a2/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.11.1 h1:7YIHT2QnHJArj/dk9aUkYhfqfK5cIxPOX5gPECfdZLU= -sigs.k8s.io/controller-runtime v0.11.1/go.mod h1:KKwLiTooNGu+JmLZGn9Sl3Gjmfj66eMbCQznLP5zcqA= +k8s.io/api v0.29.1 h1:DAjwWX/9YT7NQD4INu49ROJuZAAAP/Ijki48GUPzxqw= +k8s.io/api v0.29.1/go.mod h1:7Kl10vBRUXhnQQI8YR/R327zXC8eJ7887/+Ybta+RoQ= +k8s.io/apiextensions-apiserver v0.29.0 h1:0VuspFG7Hj+SxyF/Z/2T0uFbI5gb5LRgEyUVE3Q4lV0= +k8s.io/apiextensions-apiserver v0.29.0/go.mod h1:TKmpy3bTS0mr9pylH0nOt/QzQRrW7/h7yLdRForMZwc= +k8s.io/apimachinery v0.29.1 h1:KY4/E6km/wLBguvCZv8cKTeOwwOBqFNjwJIdMkMbbRc= +k8s.io/apimachinery v0.29.1/go.mod h1:6HVkd1FwxIagpYrHSwJlQqZI3G9LfYWRPAkUvLnXTKU= +k8s.io/client-go v0.29.0 h1:KmlDtFcrdUzOYrBhXHgKw5ycWzc3ryPX5mQe0SkG3y8= +k8s.io/client-go v0.29.0/go.mod h1:yLkXH4HKMAywcrD82KMSmfYg2DlE8mepPR4JGSo5n38= +k8s.io/component-base v0.29.0 h1:T7rjd5wvLnPBV1vC4zWd/iWRbV8Mdxs+nGaoaFzGw3s= +k8s.io/component-base v0.29.0/go.mod h1:sADonFTQ9Zc9yFLghpDpmNXEdHyQmFIGbiuZbqAXQ1M= +k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= +k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= +k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= +k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= +k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +sigs.k8s.io/controller-runtime v0.17.0 h1:fjJQf8Ukya+VjogLO6/bNX9HE6Y2xpsO5+fyS26ur/s= +sigs.k8s.io/controller-runtime v0.17.0/go.mod h1:+MngTvIQQQhfXtwfdGw/UOQ/aIaqsYywfCINOtwMO/s= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= -sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= -sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= -sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= +sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/applications/ray/kuberay-tpu-webhook/main.go b/applications/ray/kuberay-tpu-webhook/main.go index 03e429e24..ecaa57e7f 100755 --- a/applications/ray/kuberay-tpu-webhook/main.go +++ b/applications/ray/kuberay-tpu-webhook/main.go @@ -6,6 +6,7 @@ import ( "errors" "flag" "fmt" + "math" "net/http" "os" "path/filepath" @@ -20,10 +21,18 @@ import ( ) // our representation of a pod slice -// not necessarily true that worker group scheduled on 1 slice type slice struct { - rayClusterName string - groupName string + clusterName string + groupName string + replicaIndex int + numOfHosts int32 +} + +// our representation of a worker pod +type worker struct { + workerIndex int // TPU_WORKER_ID + replicaIndex int // index of replica worker belongs to + isCreated bool // true = pod has been created, false = pod deleted / hasn't been created yet } // JSON patch describing mutate operation(s) for incoming object @@ -34,12 +43,15 @@ var ( keyPath = "/etc/kuberay-tpu-webhook/tls/tls.key" tpuResourceName = corev1.ResourceName("google.com/tpu") - // headless svc will be of the form: {kuberay-cluster-name}-tpu-worker-svc - headlessServiceSuffix = "tpu-worker-svc" + // headless svc will be of the form: {kuberay-cluster-name}-headless-worker-svc + headlessServiceSuffix = "headless-worker-svc" headlessServiceName string - // map of ray cluster names to # of workers created in the slice - sliceToWorkers map[slice]int + // map of pod slices to workers in the slice + sliceToWorkers map[slice][]worker + + // map of pod slices to TPU_WORKER_HOSTNAMES in that pod slice + sliceToHostnames map[slice]string // Flag arguments. BindAddr string @@ -65,7 +77,7 @@ func containerRequestingTPUs(containers ...corev1.Container) bool { return false } -func getNumTPUHosts(topology string) (int, error) { +func getNumTPUHostsFromTopology(topology string, acceleratorType string) (int32, error) { if topology == "" { return 0, errors.New("TPU topology not specified") } @@ -79,13 +91,23 @@ func getNumTPUHosts(topology string) (int, error) { } chips *= dim } - // number VMs = number chips / 4 - return max(chips/4, 1), nil + // calculate the # of VMs using # of chips per host + acceleratorTypeValues := strings.Split(acceleratorType, "-") + chipsPerHost := 4 // default to 4 chips per VM + if acceleratorTypeValues[0] == "v5litepod" { + // v5e TPU VMs can have 1, 4 or 8 chips + chipsPerHost, err := strconv.Atoi(acceleratorTypeValues[1]) + if err != nil { + klog.Errorf("Unexpected acceleratorType: %s", acceleratorType) + } + chipsPerHost = min(chipsPerHost, 8) // max of 8 chips per host + } + return int32(max(chips/chipsPerHost, 1)), nil } // check if request is for TPU multi-host -func isTPUMultiHost(topology string) (bool, error) { - vms, err := getNumTPUHosts(topology) +func isTPUMultiHost(topology string, acceleratorType string) (bool, error) { + vms, err := getNumTPUHostsFromTopology(topology, acceleratorType) if err != nil { return false, err } @@ -107,100 +129,101 @@ func extractRayCluster(admissionReview *admissionv1.AdmissionReview) (*ray.RayCl return &rayCluster, nil } -func genDNSHostnames(workerGroupSpec ray.WorkerGroupSpec) (string, error) { - replicas := workerGroupSpec.Replicas - if replicas == nil { - return "", errors.New("workerGroupSpec replicas not set") +func genDNSHostnames(workerGroupSpec ray.WorkerGroupSpec, replicaIndex int) (string, error) { + numHosts := workerGroupSpec.NumOfHosts + if numHosts == 0 { + return "", errors.New("workerGroupSpec NumOfHosts not set") } - numWorkers := int(*replicas) workerGroupName := workerGroupSpec.GroupName - hostNames := make([]string, numWorkers) - for j := 0; j < numWorkers; j++ { - hostNames[j] = fmt.Sprintf("%s-%d.%s", workerGroupName, j, headlessServiceName) + hostNames := make([]string, numHosts) + // Host names will be of the form {WORKER_GROUP_NAME}-{REPLICA_INDEX}-{HOST_INDEX}.headless-worker-svc + for j := 0; j < int(numHosts); j++ { + hostNames[j] = fmt.Sprintf("%s-%d-%d.%s", workerGroupName, replicaIndex, j, headlessServiceName) } return strings.Join(hostNames, ","), nil } -func injectHostnames(hostNames string, workerGroupSpec ray.WorkerGroupSpec, workerGroupIndex int, patches *[]patch) { - containers := workerGroupSpec.Template.Spec.Containers - if containers == nil { - klog.Fatalf("Container path not specified") - } - // inject subdomain and TPU_WORKER_HOSTNAMES into pods for TPU multi-host initialization - for j := 0; j < len(containers); j++ { - container := containers[j] - if containerRequestingTPUs(container) { - subdomainPatch, hostNamesPatch := patch{"op": "add"}, patch{"op": "add"} - subdomainPath := fmt.Sprintf("/spec/workerGroupSpecs/%d/template/spec/subdomain", workerGroupIndex) - envPath := fmt.Sprintf("/spec/workerGroupSpecs/%d/template/spec/containers/%d/env", workerGroupIndex, j) - tpuWorkerHostNames := corev1.EnvVar{ - Name: "TPU_WORKER_HOSTNAMES", - Value: hostNames, - } - subdomainPatch["path"] = subdomainPath - subdomainPatch["value"] = headlessServiceName - // create new EnvVar array if container.Env is empty, and append hostnames if not - if len(container.Env) == 0 { - hostNamesPatch["path"] = envPath - hostNamesPatch["value"] = []corev1.EnvVar{tpuWorkerHostNames} - } else { - hostNamesPatch["path"] = fmt.Sprintf("%s/-", envPath) - hostNamesPatch["value"] = tpuWorkerHostNames - } - *patches = append(*patches, subdomainPatch, hostNamesPatch) - } - } +// inject subdomain and TPU_WORKER_HOSTNAMES into pods for TPU multi-host initialization +func injectHostnames(hostNames string, envPath string, container corev1.Container, patches *[]patch) { + subdomainPatch, hostNamesPatch := patch{"op": "add"}, patch{"op": "add"} + subdomainPath := "/spec/subdomain" + tpuWorkerHostNames := corev1.EnvVar{ + Name: "TPU_WORKER_HOSTNAMES", + Value: hostNames, + } + subdomainPatch["path"] = subdomainPath + subdomainPatch["value"] = headlessServiceName + // create new EnvVar array if container.Env is empty, and append hostnames if not + if len(container.Env) == 0 { + hostNamesPatch["path"] = envPath + hostNamesPatch["value"] = []corev1.EnvVar{tpuWorkerHostNames} + } else { + hostNamesPatch["path"] = fmt.Sprintf("%s/-", envPath) + hostNamesPatch["value"] = tpuWorkerHostNames + } + *patches = append(*patches, subdomainPatch, hostNamesPatch) +} + +func injectMultiHostReplicaLabel(replicaIndex int, workerGroupName string, patches *[]patch) { + labelPatch := patch{"op": "replace"} + labelPath := "/metadata/labels/multiHostReplica" + multiHostReplicaValue := workerGroupName + "-" + strconv.Itoa(replicaIndex) + + labelPatch["path"] = labelPath + labelPatch["value"] = multiHostReplicaValue + + *patches = append(*patches, labelPatch) +} + +// inject pod affinity and anti-affinity scheduling constraints using multiHostReplica label +func injectPodAffinity(replicaIndex int, workerGroupName string, patches *[]patch) { + key := "multiHostReplica" + value := workerGroupName + "-" + strconv.Itoa(replicaIndex) + topologyKey := "cloud.google.com/gke-nodepool" + + // construct affinity value to inject - schedule pods with the same multiHostReplica together + podAffinityPatch := patch{"op": "add"} + podAffinityPatch["path"] = "/spec/affinity/podAffinity" + affinitySelectorRequirement := metav1.LabelSelectorRequirement{key, metav1.LabelSelectorOpIn, []string{value}} + affinityMatchExpressions := []metav1.LabelSelectorRequirement{affinitySelectorRequirement} + affinityLabelSelector := metav1.LabelSelector{MatchExpressions: affinityMatchExpressions} + podAffinityTerms := []corev1.PodAffinityTerm{corev1.PodAffinityTerm{LabelSelector: &affinityLabelSelector, TopologyKey: topologyKey}} + podAffinityPatch["value"] = corev1.PodAffinity{RequiredDuringSchedulingIgnoredDuringExecution: podAffinityTerms} + + *patches = append(*patches, podAffinityPatch) } +// check that the # of Ray TPU worker pods equals the # of hosts defined in the topology key func checkWorkersMatchTopology(workerGroupSpec ray.WorkerGroupSpec) (bool, error) { - replicas := workerGroupSpec.Replicas - if replicas == nil { - return false, errors.New("workerGroupSpec replicas not set") + numHosts := workerGroupSpec.NumOfHosts // 1 TPU VM host -> 1 Ray worker pod + if numHosts == 0 { + return false, errors.New("workerGroupSpec NumOfHosts not set") } - numWorkers := int(*replicas) containers := workerGroupSpec.Template.Spec.Containers if containers == nil { return false, errors.New("Container path not specified") } if containerRequestingTPUs(containers...) { topology := workerGroupSpec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-topology"] + acceleratorType := workerGroupSpec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-accelerator"] if topology == "" { klog.Error("TPU topology not specified") } - hosts, err := getNumTPUHosts(topology) + if acceleratorType == "" { + klog.Error("TPU accelerator type not specified") + } + expectedHosts, err := getNumTPUHostsFromTopology(topology, acceleratorType) if err != nil { return false, err } - if hosts != numWorkers { + if expectedHosts != numHosts { return false, nil } } return true, nil } -func isScheduledWithAffinity(workerGroupSpec ray.WorkerGroupSpec) (bool, error) { - containers := workerGroupSpec.Template.Spec.Containers - if containers == nil { - return false, errors.New("Container path not specified") - } - if containerRequestingTPUs(containers...) { - topology := workerGroupSpec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-topology"] - isMultiHost, err := isTPUMultiHost(topology) - if err != nil { - return false, err - } - if isMultiHost { - placementGroup := workerGroupSpec.Template.Spec.NodeSelector["cloud.google.com/gke-placement-group"] - if placementGroup == "" { - return false, nil - } - } - } - - return true, nil -} - func validateRayCluster(admissionReview *admissionv1.AdmissionReview) (*admissionv1.AdmissionResponse, error) { raycluster, err := extractRayCluster(admissionReview) if err != nil { @@ -210,33 +233,44 @@ func validateRayCluster(admissionReview *admissionv1.AdmissionReview) (*admissio admit := true status := "Success" message := "" + clusterName := raycluster.Name + headlessServiceName = fmt.Sprintf("%s-%s", clusterName, headlessServiceSuffix) workerGroupSpecs := raycluster.Spec.WorkerGroupSpecs if workerGroupSpecs == nil { return nil, errors.New("WorkerGroupSpecs not specified") } for i := 0; i < len(workerGroupSpecs); i++ { workerGroupSpec := workerGroupSpecs[i] - workersMatchTopology, err := checkWorkersMatchTopology(workerGroupSpec) - if err != nil { - return nil, err + // create mapping for pod slices -> TPU_WORKER_HOSTNAMES in cluster + replicas := int(*workerGroupSpec.Replicas) + numOfHosts := workerGroupSpec.NumOfHosts + if numOfHosts > 1 { + for replicaIndex := 0; replicaIndex < replicas; replicaIndex++ { + // reset past sliceToWorkers and sliceToHostnames entries for slice in ray cluster + groupName := workerGroupSpec.GroupName + podSlice := slice{clusterName, groupName, replicaIndex, numOfHosts} + sliceToWorkers[podSlice] = nil + sliceToHostnames[podSlice] = "" + // generate TPU_WORKER_HOSTNAMES + joinedHostNames, err := genDNSHostnames(workerGroupSpec, replicaIndex) + if err != nil { + klog.Error("Failed to generate DNS Hostnames") + } + sliceToHostnames[podSlice] = joinedHostNames + } } - hasCorrectAffinity, err := isScheduledWithAffinity(workerGroupSpec) + // validate NumOfHosts for worker group matches topology nodeSelector + workersMatchTopology, err := checkWorkersMatchTopology(workerGroupSpec) if err != nil { return nil, err } - if !(workersMatchTopology && hasCorrectAffinity) { + if !workersMatchTopology { admit = false status = "Failure" - if !workersMatchTopology && !hasCorrectAffinity { - message = "Missing gke-placement-group nodeSelector and workers not equal to specified topology" - } else if !workersMatchTopology { - message = "Number of workers in worker group not equal to specified topology" - } else if !hasCorrectAffinity { - message = "TPU worker group requested without gke-placement-group nodeSelector" - } - break + message = "Number of workers in worker group not equal to specified topology" } + break } // Create AdmissionResponse @@ -251,85 +285,75 @@ func validateRayCluster(admissionReview *admissionv1.AdmissionReview) (*admissio return admissionResponse, nil } -// add TPU_WORKER_HOSTNAMES to containers in a ray cluster -func mutateRayCluster(admissionReview *admissionv1.AdmissionReview) (*admissionv1.AdmissionResponse, error) { - raycluster, err := extractRayCluster(admissionReview) - if err != nil { - return nil, err - } - - clusterName := raycluster.Name - headlessServiceName = fmt.Sprintf("%s-%s", clusterName, headlessServiceSuffix) - var patches []patch - workerGroupSpecs := raycluster.Spec.WorkerGroupSpecs - if workerGroupSpecs == nil { - return nil, errors.New("WorkerGroupSpecs not specified") +func getEnvironmentVariable(varName string, container corev1.Container) string { + if container.Env != nil && len(container.Env) > 0 { + for _, envVar := range container.Env { + if envVar.Name == varName { + return envVar.Value + } + } } - for i := 0; i < len(workerGroupSpecs); i++ { - workerGroupSpec := workerGroupSpecs[i] - // reset past sliceToWorkers entries for ray cluster - groupName := workerGroupSpec.GroupName - podSlice := slice{clusterName, groupName} - sliceToWorkers[podSlice] = 0 + return "" +} - containers := workerGroupSpec.Template.Spec.Containers - if containers == nil { - return nil, errors.New("Container path not specified") - } - if containerRequestingTPUs(containers...) { - topology := workerGroupSpec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-topology"] - isMultiHost, err := isTPUMultiHost(topology) - if err != nil { - return nil, err +// get next lowest-index pod slice to assign a pod to in the RayCluster +// this will be the first pod slice with # created pods < NumOfHosts +func getReplicaIndex(clusterName string) int { + if sliceToWorkers == nil { + return 0 + } + next_lowest_id := math.MaxInt32 + for slice, workerList := range sliceToWorkers { + if slice.clusterName == clusterName { + createdPods := 0 + for _, worker := range workerList { + if worker.isCreated { + createdPods++ + } } - if isMultiHost { - joinedHostNames, err := genDNSHostnames(workerGroupSpec) - if err != nil { - return nil, err + if createdPods < int(slice.numOfHosts) { + if slice.replicaIndex < next_lowest_id { + next_lowest_id = slice.replicaIndex } - injectHostnames(joinedHostNames, workerGroupSpec, i, &patches) } } } - - patchBytes, err := json.Marshal(patches) - if err != nil { - return nil, err - } - - // Create AdmissionResponse - admissionResponse := &admissionv1.AdmissionResponse{ - UID: admissionReview.Request.UID, - Allowed: true, - Patch: patchBytes, - PatchType: func() *admissionv1.PatchType { - pt := admissionv1.PatchTypeJSONPatch - return &pt - }(), - } - return admissionResponse, nil + return next_lowest_id } -func hasWorkerID(container corev1.Container) bool { - if container.Env != nil && len(container.Env) > 0 { - for _, envVar := range container.Env { - if envVar.Name == "TPU_WORKER_ID" { - return true +// returns next lowest TPU_WORKER_ID in pod slice and updates mappings +func getNextWorkerID(podSlice slice, replicaIndex int) int { + tpuWorkerID := 0 + if sliceToWorkers[podSlice] == nil { + newWorker := worker{tpuWorkerID, replicaIndex, true} + sliceToWorkers[podSlice] = []worker{newWorker} + } else { + nextLowestID := math.MaxInt32 + replacePod := false + // iterate through existing workers and check if any have been deleted + for _, worker := range sliceToWorkers[podSlice] { + if worker.isCreated == false && worker.workerIndex < nextLowestID { + replacePod = true + nextLowestID = worker.workerIndex } } - } - return false -} - -func hasWorkerName(container corev1.Container) bool { - if container.Env != nil && len(container.Env) > 0 { - for _, envVar := range container.Env { - if envVar.Name == "TPU_NAME" { - return true + // reassign next lowest TPU_WORKER_ID if pod has been deleted + if replacePod == true { + for _, worker := range sliceToWorkers[podSlice] { + // set worker.isCreated to true now that pod is being re-created + if worker.workerIndex == nextLowestID { + worker.isCreated = true + } } + } else { + // all pods are running -> create new worker with next TPU_WORKER_ID + nextLowestID = len(sliceToWorkers[podSlice]) + newWorker := worker{nextLowestID, replicaIndex, true} + sliceToWorkers[podSlice] = append(sliceToWorkers[podSlice], newWorker) } + tpuWorkerID = nextLowestID } - return false + return tpuWorkerID } // unmarshal pod from admission request @@ -339,8 +363,14 @@ func extractPod(admissionReview *admissionv1.AdmissionReview) (*corev1.Pod, erro } pod := corev1.Pod{} - if err := json.Unmarshal(admissionReview.Request.Object.Raw, &pod); err != nil { - return nil, err + if admissionReview.Request.Operation == "CREATE" { + if err := json.Unmarshal(admissionReview.Request.Object.Raw, &pod); err != nil { + return nil, err + } + } else if admissionReview.Request.Operation == "DELETE" { + if err := json.Unmarshal(admissionReview.Request.OldObject.Raw, &pod); err != nil { + return nil, err + } } return &pod, nil @@ -355,56 +385,70 @@ func mutatePod(admissionReview *admissionv1.AdmissionReview) (*admissionv1.Admis var patches []patch // ray operator only sets GenerateName field - doesn't include random suffix until after admission request - // use mapping of {cluster name, group name} -> # workers created to set TPU_WORKER_IDs + // use mapping of {cluster name, group name, replicaIndex} -> workers to extract next TPU_WORKER_ID clusterName := pod.Labels["ray.io/cluster"] groupName := pod.Labels["ray.io/group"] - podSlice := slice{clusterName, groupName} topology := pod.Spec.NodeSelector["cloud.google.com/gke-tpu-topology"] + acceleratorType := pod.Spec.NodeSelector["cloud.google.com/gke-tpu-accelerator"] if topology == "" { klog.Error("TPU topology not specified") } + if acceleratorType == "" { + klog.Error("TPU accelerator type not specified") + } containers := pod.Spec.Containers if containers == nil { return nil, errors.New("Container path not specified") } if containerRequestingTPUs(containers...) { - // assign to the next unique ID in the pod slice - tpu_worker_id := sliceToWorkers[podSlice] + // assign worker to the next unique ID in the pod slice and update map + numOfHosts, _ := getNumTPUHostsFromTopology(topology, acceleratorType) // ignore error here because topology may not be set yet + replicaIndex := getReplicaIndex(clusterName) + podSlice := slice{clusterName, groupName, replicaIndex, numOfHosts} + tpuWorkerID := getNextWorkerID(podSlice, replicaIndex) - // if multihost -> inject hostname into pod spec for DNS records - isMultiHost, _ := isTPUMultiHost(topology) // ignore error here because topology may not be set yet + isMultiHost, _ := isTPUMultiHost(topology, acceleratorType) // ignore error here because topology may not be set yet if isMultiHost { - hostname := fmt.Sprintf(groupName+"-%d", tpu_worker_id) + // inject hostname into pod spec for DNS records + hostname := fmt.Sprintf(groupName+"-%d-%d", replicaIndex, tpuWorkerID) hostnamePatch := patch{"op": "add"} hostnamePatch["path"] = "/spec/hostname" hostnamePatch["value"] = hostname patches = append(patches, hostnamePatch) + + // inject multi-host replica label + injectMultiHostReplicaLabel(replicaIndex, groupName, &patches) + + // inject pod affinity/anti-affinity for scheduling + injectPodAffinity(replicaIndex, groupName, &patches) } - // inject the TPU_WORKER_ID environment variable into the container requesting TPUs - increment_worker_id := false + // inject all environment variables into the container requesting TPUs for i := 0; i < len(containers); i++ { container := containers[i] if containerRequestingTPUs(container) { path := fmt.Sprintf("/spec/containers/%d/env", i) - if !hasWorkerID(container) { - increment_worker_id = true - tpuWorkerID := corev1.EnvVar{ + // inject TPU_WORKER_HOSTNAMES set during RayCluster interception + injectHostnames(sliceToHostnames[podSlice], path, container, &patches) + // inject TPU_WORKER_ID + if getEnvironmentVariable("TPU_WORKER_ID", container) == "" { + workerID := corev1.EnvVar{ Name: "TPU_WORKER_ID", - Value: fmt.Sprint(tpu_worker_id), + Value: fmt.Sprint(tpuWorkerID), } idPatch := patch{"op": "add"} // create new EnvVar array if container.Env is empty, and append new EnvVars if not if len(container.Env) == 0 { idPatch["path"] = path - idPatch["value"] = []corev1.EnvVar{tpuWorkerID} + idPatch["value"] = []corev1.EnvVar{workerID} } else { idPatch["path"] = fmt.Sprintf("%s/-", path) - idPatch["value"] = tpuWorkerID + idPatch["value"] = workerID } patches = append(patches, idPatch) } - if !hasWorkerName(container) { + // inject TPU_NAME + if getEnvironmentVariable("TPU_WORKER_NAME", container) == "" { tpuName := corev1.EnvVar{ Name: "TPU_NAME", Value: fmt.Sprint(groupName), @@ -422,9 +466,6 @@ func mutatePod(admissionReview *admissionv1.AdmissionReview) (*admissionv1.Admis } } } - if increment_worker_id { - sliceToWorkers[podSlice] += 1 - } } patchBytes, err := json.Marshal(patches) @@ -444,6 +485,60 @@ func mutatePod(admissionReview *admissionv1.AdmissionReview) (*admissionv1.Admis return admissionResponse, nil } +// update sliceToWorkers map on pod deletion +func deletePod(admissionReview *admissionv1.AdmissionReview) (*admissionv1.AdmissionResponse, error) { + pod, err := extractPod(admissionReview) + if err != nil { + klog.Fatalf("Pod extraction failed: %s", err) + } + + multiHostReplicaLabel := pod.Labels["multiHostReplica"] + + if multiHostReplicaLabel != "" { + multiHostReplicaLabelValues := strings.Split(multiHostReplicaLabel, "-") + groupName := multiHostReplicaLabelValues[0] + replicaIndex, _ := strconv.Atoi(multiHostReplicaLabelValues[1]) // ignore error here since must be set + + containers := pod.Spec.Containers + if containers == nil { + return nil, errors.New("Pod spec missing containers") + } + tpuWorkerID := -1 + for _, container := range pod.Spec.Containers { + if containerRequestingTPUs(container) { + tpuWorkerID, err = strconv.Atoi(getEnvironmentVariable("TPU_WORKER_ID", container)) + if err != nil { + return nil, errors.New("Unable to extract TPU_WORKER_ID") + } + } + } + // pod belongs to a multi-host replica -> update the map + for slice, _ := range sliceToWorkers { + if slice.groupName == groupName && slice.replicaIndex == replicaIndex { + // set the pod state to indicate it is not running + for _, worker := range sliceToWorkers[slice] { + if worker.workerIndex == tpuWorkerID { + worker.isCreated = false + break + } + } + break + } + } + } + + // Create AdmissionResponse - we never deny the deletion request + admissionResponse := &admissionv1.AdmissionResponse{ + UID: admissionReview.Request.UID, + Allowed: true, + Result: &metav1.Status{ + Status: "Success", + Message: "", + }, + } + return admissionResponse, nil +} + func writeCertfile(filename string, encodedData string) error { data, err := base64.StdEncoding.DecodeString(encodedData) if err != nil { @@ -454,7 +549,8 @@ func writeCertfile(filename string, encodedData string) error { } func init() { - sliceToWorkers = make(map[slice]int) + sliceToWorkers = make(map[slice][]worker) + sliceToHostnames = make(map[slice]string) flag.StringVar(&BindAddr, "bind-address", ":443", "Address to bind HTTPS service to") flag.StringVar(&CACert, "ca-cert", "", "base64-encoded root certificate for TLS") @@ -477,11 +573,11 @@ func main() { return } - if admissionReview.Request.Kind.Kind == "RayCluster" { - klog.Info("Received review for RayCluster") - response, err := mutateRayCluster(admissionReview) + if admissionReview.Request.Kind.Kind == "Pod" { + klog.Info("Received review for Pod") + response, err := mutatePod(admissionReview) if err != nil { - klog.Errorf("Failed to mutate ray cluster: %s", err) + klog.Errorf("Failed to mutate pod: %s", err) return } admissionReview.Response = response @@ -492,12 +588,20 @@ func main() { } fmt.Fprint(w, string(responseBytes)) } + }) + + mux.HandleFunc("/validate", func(w http.ResponseWriter, r *http.Request) { + admissionReview := &admissionv1.AdmissionReview{} + if err := json.NewDecoder(r.Body).Decode(admissionReview); err != nil { + http.Error(w, "Error decoding request body", http.StatusBadRequest) + return + } if admissionReview.Request.Kind.Kind == "Pod" { - klog.Info("Received review for Pod") - response, err := mutatePod(admissionReview) + klog.Info("Received review for Pod deletion") + response, err := deletePod(admissionReview) if err != nil { - klog.Errorf("Failed to mutate pod: %s", err) + klog.Errorf("Failed to validate pod deletion: %s", err) return } admissionReview.Response = response @@ -508,14 +612,6 @@ func main() { } fmt.Fprint(w, string(responseBytes)) } - }) - - mux.HandleFunc("/validate", func(w http.ResponseWriter, r *http.Request) { - admissionReview := &admissionv1.AdmissionReview{} - if err := json.NewDecoder(r.Body).Decode(admissionReview); err != nil { - http.Error(w, "Error decoding request body", http.StatusBadRequest) - return - } if admissionReview.Request.Kind.Kind == "RayCluster" { klog.Info("Received review for RayCluster") diff --git a/modules/kuberay-cluster/kuberay-tpu-values.yaml b/modules/kuberay-cluster/kuberay-tpu-values.yaml index a52130938..637f6104b 100644 --- a/modules/kuberay-cluster/kuberay-tpu-values.yaml +++ b/modules/kuberay-cluster/kuberay-tpu-values.yaml @@ -153,6 +153,7 @@ worker: replicas: 1 minReplicas: 1 maxReplicas: 1 + numOfHosts: 1 type: worker labels: cloud.google.com/gke-ray-node-type: worker diff --git a/modules/kuberay-cluster/main.tf b/modules/kuberay-cluster/main.tf index 42dac1c5e..8952c8c46 100644 --- a/modules/kuberay-cluster/main.tf +++ b/modules/kuberay-cluster/main.tf @@ -66,24 +66,10 @@ resource "helm_release" "ray-cluster" { ] } -resource "kubernetes_service" "tpu-worker-svc" { - count = var.enable_tpu ? 1 : 0 - metadata { - name = "${helm_release.ray-cluster.name}-kuberay-tpu-worker-svc" - namespace = var.namespace - } - spec { - selector = { - "cloud.google.com/gke-ray-node-type" = "worker" - } - cluster_ip = "None" - } -} - data "kubernetes_service" "head-svc" { metadata { name = "${helm_release.ray-cluster.name}-kuberay-head-svc" namespace = var.namespace } depends_on = [helm_release.ray-cluster] -} \ No newline at end of file +}