diff --git a/benchmark/perf-tests/100-raycluster/config.yaml b/benchmark/perf-tests/100-raycluster/config.yaml
index 5518a6b0a1d..b5c30659bd6 100644
--- a/benchmark/perf-tests/100-raycluster/config.yaml
+++ b/benchmark/perf-tests/100-raycluster/config.yaml
@@ -42,7 +42,8 @@ steps:
timeout: 30m
command:
- "bash"
- - "100-raycluster/wait-for-rayclusters.sh"
+ - "common/wait-for-rayclusters.sh"
+ - "100"
- name: Wait for pods to be running
measurements:
- Identifier: WaitForControlledPodsRunning
diff --git a/benchmark/perf-tests/100-rayjob/config.yaml b/benchmark/perf-tests/100-rayjob/config.yaml
index 7c7e123d51c..69b5198fb85 100644
--- a/benchmark/perf-tests/100-rayjob/config.yaml
+++ b/benchmark/perf-tests/100-rayjob/config.yaml
@@ -60,7 +60,8 @@ steps:
timeout: 30m
command:
- "bash"
- - "100-rayjob/wait-for-rayjobs.sh"
+ - "common/wait-for-rayjobs.sh"
+ - "100"
- name: Wait for pods to be running
measurements:
- Identifier: WaitForControlledPodsRunning
diff --git a/benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh b/benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh
deleted file mode 100644
index 8b56dbea671..00000000000
--- a/benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-expect_succeeded=100
-echo "waiting for $expect_succeeded RayJobs to be completed successfully"
-
-while true; do
- num_succeeded=$(kubectl get rayjob -A -l perf-test=rayjob-pytorch-mnist -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}' | grep -c SUCCEEDED)
- echo "$num_succeeded RayJobs completed..."
-
- if [[ "$num_succeeded" == "$expect_succeeded" ]]; then
- break;
- fi
-
- sleep 5
-done
-
-while true; do
- num_succeeded=$(kubectl get rayjob -A -l perf-test=ray-data-image-resize -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}' | grep -c SUCCEEDED)
- echo "$num_succeeded RayJobs completed..."
-
- if [[ "$num_succeeded" == "$expect_succeeded" ]]; then
- break;
- fi
-
- sleep 5
-done
-
-echo "$num_succeeded RayJobs completed!"
diff --git a/benchmark/perf-tests/1000-raycluster/config.yaml b/benchmark/perf-tests/1000-raycluster/config.yaml
new file mode 100644
index 00000000000..1649ae2c1d8
--- /dev/null
+++ b/benchmark/perf-tests/1000-raycluster/config.yaml
@@ -0,0 +1,68 @@
+name: kuberay
+namespace:
+ number: 100
+tuningSets:
+- name: Uniform100qps
+ qpsLoad:
+ qps: 100
+steps:
+- name: Start measurements
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 30m
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ operationTimeout: 120s
+- name: Preload Images
+ measurements:
+ - Identifier: PreloadImages
+ Method: Exec
+ Params:
+ timeout: 30m
+ command:
+ - "bash"
+ - "common/preload-image.sh"
+- name: Creating Ray clusters
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
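+    # 100 namespaces x 10 RayClusters each = 1000 RayClusters in total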
+ replicasPerNamespace: 10
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: raycluster
+ objectTemplatePath: raycluster.yaml
+ templateFillMap:
+ Replicas: 3
+ Image: "rayproject/ray:2.9.3"
+- name: Wait for RayClusters ready
+ measurements:
+ - Identifier: WaitForRayCluster
+ Method: Exec
+ Params:
+ timeout: 30m
+ command:
+ - "bash"
+ - "common/wait-for-rayclusters.sh"
+ - "1000"
+- name: Measure wait for pods to be running
+ measurements:
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
+- name: Measure pod startup latency
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: gather
diff --git a/benchmark/perf-tests/1000-raycluster/raycluster.yaml b/benchmark/perf-tests/1000-raycluster/raycluster.yaml
new file mode 100644
index 00000000000..742891fdd25
--- /dev/null
+++ b/benchmark/perf-tests/1000-raycluster/raycluster.yaml
@@ -0,0 +1,50 @@
+apiVersion: ray.io/v1
+kind: RayCluster
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: ray-cluster
+spec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ serviceType: ClusterIP
+ rayStartParams:
+ dashboard-host: '0.0.0.0'
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ limits:
+ cpu: "1"
+ requests:
+ cpu: "10m"
+ volumes:
+ - name: ray-logs
+ emptyDir: {}
+ workerGroupSpecs:
+ - replicas: {{.Replicas}}
+ minReplicas: 1
+ maxReplicas: 10
+    # logical group name; for this example it is called small-group, but it can be any name
+ groupName: small-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ limits:
+ cpu: "1"
+ requests:
+ cpu: "10m"
diff --git a/benchmark/perf-tests/1000-raycluster/results/junit.xml b/benchmark/perf-tests/1000-raycluster/results/junit.xml
new file mode 100644
index 00000000000..2a945ee9cc6
--- /dev/null
+++ b/benchmark/perf-tests/1000-raycluster/results/junit.xml
@@ -0,0 +1,10 @@
+<!-- junit.xml: ClusterLoader2 step results for the 1000 RayCluster run (original XML content not preserved) -->
diff --git a/benchmark/perf-tests/1000-rayjob/config.yaml b/benchmark/perf-tests/1000-rayjob/config.yaml
new file mode 100644
index 00000000000..095b164a6f5
--- /dev/null
+++ b/benchmark/perf-tests/1000-rayjob/config.yaml
@@ -0,0 +1,83 @@
+name: kuberay
+namespace:
+ number: 100
+tuningSets:
+- name: Uniform100qps
+ qpsLoad:
+ qps: 100
+steps:
+- name: Start measurements
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 30m
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ operationTimeout: 120s
+ - Identifier: JobLifecycleLatency
+ Method: JobLifecycleLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 10m
+- name: Creating RayJobs for PyTorch MNIST fine-tuning
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
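+    # 100 namespaces x 5 RayJobs each = 500 RayJobs per type, 1000 in total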
+ replicasPerNamespace: 5
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: pytorch-mnist
+ objectTemplatePath: pytorch-mnist-rayjob.yaml
+ templateFillMap:
+ Image: "rayproject/ray:2.9.3"
+- name: Creating RayJobs for Ray Data Image Resizing
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
+ replicasPerNamespace: 5
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: ray-data-image-resize
+ objectTemplatePath: ray-data-image-resize.yaml
+ templateFillMap:
+ Image: "rayproject/ray:2.9.3"
+- name: Wait for RayJobs to complete
+ measurements:
+ - Identifier: WaitForRayJob
+ Method: Exec
+ Params:
+ timeout: 30m
+ command:
+ - "bash"
+ - "common/wait-for-rayjobs.sh"
+ - "500" # 1000 since we deploy two RayJobs with 500 instances each
+- name: Measure wait for pods to be running
+ measurements:
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
+ operationTimeout: 10m
+- name: Measure pod startup latency
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: gather
+- name: Measure job lifecycle latency
+ measurements:
+ - Identifier: JobLifecycleLatency
+ Method: JobLifecycleLatency
+ Params:
+ action: gather
diff --git a/benchmark/perf-tests/1000-rayjob/pytorch-mnist-rayjob.yaml b/benchmark/perf-tests/1000-rayjob/pytorch-mnist-rayjob.yaml
new file mode 100644
index 00000000000..413e6816a7e
--- /dev/null
+++ b/benchmark/perf-tests/1000-rayjob/pytorch-mnist-rayjob.yaml
@@ -0,0 +1,63 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: rayjob-pytorch-mnist
+spec:
+ shutdownAfterJobFinishes: true
+ entrypoint: python ray_train_pytorch_mnist.py
+ submitterPodTemplate:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: submitter-job
+ image: {{.Image}}
+ command:
+ - "sh"
+ - "-c"
+ args:
+ - |
+ #!/bin/sh
+
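+          # If this job was already submitted (e.g. the submitter pod restarted),
+          # follow its logs; otherwise fall back to submitting it.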
+ ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
+ ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"NUM_WORKERS":"2","CPUS_PER_WORKER":"1","OMP_NUM_THREADS":"1"}}' -- python ray_train_pytorch_mnist.py
+ resources:
+ requests:
+ cpu: "10m"
+ rayClusterSpec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ rayStartParams:
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs-server
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "4Gi"
+ workerGroupSpecs:
+ - replicas: 2
+ minReplicas: 1
+ maxReplicas: 5
+ groupName: worker-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "4Gi"
diff --git a/benchmark/perf-tests/1000-rayjob/ray-data-image-resize.yaml b/benchmark/perf-tests/1000-rayjob/ray-data-image-resize.yaml
new file mode 100644
index 00000000000..8c9e2bdc3f3
--- /dev/null
+++ b/benchmark/perf-tests/1000-rayjob/ray-data-image-resize.yaml
@@ -0,0 +1,63 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: ray-data-image-resize
+spec:
+ shutdownAfterJobFinishes: true
+ entrypoint: python ray_data_image_resize.py
+ submitterPodTemplate:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: submitter-job
+ image: {{.Image}}
+ command:
+ - "sh"
+ - "-c"
+ args:
+ - |
+ #!/bin/sh
+
+ ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
+ ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"BUCKET_NAME":"ray-images","BUCKET_PREFIX":"images"}}' -- python ray_data_image_resize.py
+ resources:
+ requests:
+ cpu: "10m"
+ rayClusterSpec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ rayStartParams:
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs-server
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
+ workerGroupSpecs:
+ - replicas: 2
+ minReplicas: 1
+ maxReplicas: 5
+ groupName: worker-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
diff --git a/benchmark/perf-tests/1000-rayjob/results/junit.xml b/benchmark/perf-tests/1000-rayjob/results/junit.xml
new file mode 100644
index 00000000000..2066c09568d
--- /dev/null
+++ b/benchmark/perf-tests/1000-rayjob/results/junit.xml
@@ -0,0 +1,12 @@
+<!-- junit.xml: ClusterLoader2 step results for the 1000 RayJob run (original XML content not preserved) -->
diff --git a/benchmark/perf-tests/10000-raycluster/config.yaml b/benchmark/perf-tests/10000-raycluster/config.yaml
new file mode 100644
index 00000000000..f435dfd7bac
--- /dev/null
+++ b/benchmark/perf-tests/10000-raycluster/config.yaml
@@ -0,0 +1,68 @@
+name: kuberay
+namespace:
+ number: 100
+tuningSets:
+- name: Uniform100qps
+ qpsLoad:
+ qps: 100
+steps:
+- name: Start measurements
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 30m
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ operationTimeout: 120s
+- name: Preload Images
+ measurements:
+ - Identifier: PreloadImages
+ Method: Exec
+ Params:
+ timeout: 30m
+ command:
+ - "bash"
+ - "common/preload-image.sh"
+- name: Creating Ray clusters
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
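+    # 100 namespaces x 100 RayClusters each = 10000 RayClusters in total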
+ replicasPerNamespace: 100
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: raycluster
+ objectTemplatePath: raycluster.yaml
+ templateFillMap:
+ Replicas: 3
+ Image: "rayproject/ray:2.9.3"
+- name: Wait for RayClusters ready
+ measurements:
+ - Identifier: WaitForRayCluster
+ Method: Exec
+ Params:
+ timeout: 30m
+ command:
+ - "bash"
+ - "10000-raycluster/wait-for-rayclusters.sh"
+ - "10000"
+- name: Measure wait for pods to be running
+ measurements:
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
+- name: Measure pod startup latency
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: gather
diff --git a/benchmark/perf-tests/10000-raycluster/raycluster.yaml b/benchmark/perf-tests/10000-raycluster/raycluster.yaml
new file mode 100644
index 00000000000..4a4c37bebd2
--- /dev/null
+++ b/benchmark/perf-tests/10000-raycluster/raycluster.yaml
@@ -0,0 +1,49 @@
+apiVersion: ray.io/v1
+kind: RayCluster
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: ray-cluster
+spec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ serviceType: ClusterIP
+ rayStartParams:
+ dashboard-host: '0.0.0.0'
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ limits:
+ cpu: "1"
+ requests:
+ cpu: "10m"
+ volumes:
+ - name: ray-logs
+ emptyDir: {}
+ workerGroupSpecs:
+ - replicas: {{.Replicas}}
+ minReplicas: 1
+ maxReplicas: 10
+ groupName: small-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ limits:
+ cpu: "1"
+ requests:
+ cpu: "10m"
diff --git a/benchmark/perf-tests/10000-raycluster/results/junit.xml b/benchmark/perf-tests/10000-raycluster/results/junit.xml
new file mode 100644
index 00000000000..718cf9603a0
--- /dev/null
+++ b/benchmark/perf-tests/10000-raycluster/results/junit.xml
@@ -0,0 +1,10 @@
+<!-- junit.xml: ClusterLoader2 step results for the 10000 RayCluster run (original XML content not preserved) -->
diff --git a/benchmark/perf-tests/10000-rayjob/config.yaml b/benchmark/perf-tests/10000-rayjob/config.yaml
new file mode 100644
index 00000000000..f2b6d6894e1
--- /dev/null
+++ b/benchmark/perf-tests/10000-rayjob/config.yaml
@@ -0,0 +1,81 @@
+name: kuberay
+namespace:
+ number: 100
+tuningSets:
+- name: Uniform100qps
+ qpsLoad:
+ qps: 100
+steps:
+- name: Start measurements
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 30m
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ operationTimeout: 120s
+ - Identifier: JobLifecycleLatency
+ Method: JobLifecycleLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 10m
+- name: Creating RayJobs for PyTorch MNIST fine-tuning
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
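+    # 100 namespaces x 50 RayJobs each = 5000 RayJobs per type, 10000 in total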
+ replicasPerNamespace: 50
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: pytorch-mnist
+ objectTemplatePath: pytorch-mnist-rayjob.yaml
+      templateFillMap:
+        Image: "rayproject/ray:2.9.3"
+- name: Creating RayJobs for Ray Data Image Resizing
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
+ replicasPerNamespace: 50
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: ray-data-image-resize
+ objectTemplatePath: ray-data-image-resize.yaml
+      templateFillMap:
+        Image: "rayproject/ray:2.9.3"
+- name: Wait for RayJobs to complete
+ measurements:
+ - Identifier: WaitForRayJob
+ Method: Exec
+ Params:
+ timeout: 60m
+ command:
+ - "bash"
+ - "common/wait-for-rayjobs.sh"
+ - "5000" # 10000 total since we deploy 2 RayJobs with 5000 replicas each
+- name: Measure wait for pods to be running
+ measurements:
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
+ operationTimeout: 10m
+- name: Measure pod startup latency
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: gather
+- name: Measure job lifecycle latency
+ measurements:
+ - Identifier: JobLifecycleLatency
+ Method: JobLifecycleLatency
+ Params:
+ action: gather
diff --git a/benchmark/perf-tests/10000-rayjob/pytorch-mnist-rayjob.yaml b/benchmark/perf-tests/10000-rayjob/pytorch-mnist-rayjob.yaml
new file mode 100644
index 00000000000..526ca1106ee
--- /dev/null
+++ b/benchmark/perf-tests/10000-rayjob/pytorch-mnist-rayjob.yaml
@@ -0,0 +1,63 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: rayjob-pytorch-mnist
+spec:
+ shutdownAfterJobFinishes: true
+ entrypoint: python ray_train_pytorch_mnist.py
+ submitterPodTemplate:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: submitter-job
+ image: {{.Image}}
+ command:
+ - "sh"
+ - "-c"
+ args:
+ - |
+ #!/bin/sh
+
+ ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
+ ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"NUM_WORKERS":"2","CPUS_PER_WORKER":"1","OMP_NUM_THREADS":"1"}}' -- python ray_train_pytorch_mnist.py
+ resources:
+ requests:
+ cpu: "10m"
+ rayClusterSpec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ rayStartParams:
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs-server
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
+ workerGroupSpecs:
+ - replicas: 2
+ minReplicas: 1
+ maxReplicas: 5
+ groupName: worker-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
diff --git a/benchmark/perf-tests/10000-rayjob/ray-data-image-resize.yaml b/benchmark/perf-tests/10000-rayjob/ray-data-image-resize.yaml
new file mode 100644
index 00000000000..8c9e2bdc3f3
--- /dev/null
+++ b/benchmark/perf-tests/10000-rayjob/ray-data-image-resize.yaml
@@ -0,0 +1,63 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: ray-data-image-resize
+spec:
+ shutdownAfterJobFinishes: true
+ entrypoint: python ray_data_image_resize.py
+ submitterPodTemplate:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: submitter-job
+ image: {{.Image}}
+ command:
+ - "sh"
+ - "-c"
+ args:
+ - |
+ #!/bin/sh
+
+ ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
+ ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"BUCKET_NAME":"ray-images","BUCKET_PREFIX":"images"}}' -- python ray_data_image_resize.py
+ resources:
+ requests:
+ cpu: "10m"
+ rayClusterSpec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ rayStartParams:
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs-server
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
+ workerGroupSpecs:
+ - replicas: 2
+ minReplicas: 1
+ maxReplicas: 5
+ groupName: worker-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
diff --git a/benchmark/perf-tests/10000-rayjob/results/junit.xml b/benchmark/perf-tests/10000-rayjob/results/junit.xml
new file mode 100644
index 00000000000..6a8386d5d73
--- /dev/null
+++ b/benchmark/perf-tests/10000-rayjob/results/junit.xml
@@ -0,0 +1,12 @@
+<!-- junit.xml: ClusterLoader2 step results for the 10000 RayJob run (original XML content not preserved) -->
diff --git a/benchmark/perf-tests/5000-raycluster/config.yaml b/benchmark/perf-tests/5000-raycluster/config.yaml
new file mode 100644
index 00000000000..b34379ab911
--- /dev/null
+++ b/benchmark/perf-tests/5000-raycluster/config.yaml
@@ -0,0 +1,68 @@
+name: kuberay
+namespace:
+ number: 100
+tuningSets:
+- name: Uniform100qps
+ qpsLoad:
+ qps: 100
+steps:
+- name: Start measurements
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 30m
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ operationTimeout: 120s
+- name: Preload Images
+ measurements:
+ - Identifier: PreloadImages
+ Method: Exec
+ Params:
+ timeout: 30m
+ command:
+ - "bash"
+ - "common/preload-image.sh"
+- name: Creating Ray clusters
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
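+    # 100 namespaces x 50 RayClusters each = 5000 RayClusters in total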
+ replicasPerNamespace: 50
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: raycluster
+ objectTemplatePath: raycluster.yaml
+ templateFillMap:
+ Replicas: 3
+ Image: "rayproject/ray:2.9.3"
+- name: Wait for RayClusters ready
+ measurements:
+ - Identifier: WaitForRayCluster
+ Method: Exec
+ Params:
+ timeout: 30m
+ command:
+ - "bash"
+ - "common/wait-for-rayclusters.sh"
+ - "5000"
+- name: Measure wait for pods to be running
+ measurements:
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
+- name: Measure pod startup latency
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: gather
diff --git a/benchmark/perf-tests/5000-raycluster/raycluster.yaml b/benchmark/perf-tests/5000-raycluster/raycluster.yaml
new file mode 100644
index 00000000000..4a4c37bebd2
--- /dev/null
+++ b/benchmark/perf-tests/5000-raycluster/raycluster.yaml
@@ -0,0 +1,49 @@
+apiVersion: ray.io/v1
+kind: RayCluster
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: ray-cluster
+spec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ serviceType: ClusterIP
+ rayStartParams:
+ dashboard-host: '0.0.0.0'
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ limits:
+ cpu: "1"
+ requests:
+ cpu: "10m"
+ volumes:
+ - name: ray-logs
+ emptyDir: {}
+ workerGroupSpecs:
+ - replicas: {{.Replicas}}
+ minReplicas: 1
+ maxReplicas: 10
+ groupName: small-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ limits:
+ cpu: "1"
+ requests:
+ cpu: "10m"
diff --git a/benchmark/perf-tests/5000-raycluster/results/junit.xml b/benchmark/perf-tests/5000-raycluster/results/junit.xml
new file mode 100644
index 00000000000..500bc3b386f
--- /dev/null
+++ b/benchmark/perf-tests/5000-raycluster/results/junit.xml
@@ -0,0 +1,10 @@
+<!-- junit.xml: ClusterLoader2 step results for the 5000 RayCluster run (original XML content not preserved) -->
diff --git a/benchmark/perf-tests/5000-rayjob/config.yaml b/benchmark/perf-tests/5000-rayjob/config.yaml
new file mode 100644
index 00000000000..1c900641f14
--- /dev/null
+++ b/benchmark/perf-tests/5000-rayjob/config.yaml
@@ -0,0 +1,81 @@
+name: kuberay
+namespace:
+ number: 100
+tuningSets:
+- name: Uniform100qps
+ qpsLoad:
+ qps: 100
+steps:
+- name: Start measurements
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 30m
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: start
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ operationTimeout: 120s
+ - Identifier: JobLifecycleLatency
+ Method: JobLifecycleLatency
+ Params:
+ action: start
+ labelSelector: app.kubernetes.io/created-by = kuberay-operator
+ threshold: 10m
+- name: Creating RayJobs for PyTorch MNIST fine-tuning
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
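+    # 100 namespaces x 25 RayJobs each = 2500 RayJobs per type, 5000 in total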
+ replicasPerNamespace: 25
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: pytorch-mnist
+ objectTemplatePath: pytorch-mnist-rayjob.yaml
+      templateFillMap:
+        Image: "rayproject/ray:2.9.3"
+- name: Creating RayJobs for Ray Data Image Resizing
+ phases:
+ - namespaceRange:
+ min: 1
+ max: 100
+ replicasPerNamespace: 25
+ tuningSet: Uniform100qps
+ objectBundle:
+ - basename: ray-data-image-resize
+ objectTemplatePath: ray-data-image-resize.yaml
+      templateFillMap:
+        Image: "rayproject/ray:2.9.3"
+- name: Wait for RayJobs to complete
+ measurements:
+ - Identifier: WaitForRayJob
+ Method: Exec
+ Params:
+ timeout: 60m
+ command:
+ - "bash"
+ - "common/wait-for-rayjobs.sh"
+ - "2500" # total 5000 since we deploy 2 RayJobs with 2500 instances each
+- name: Measure wait for pods to be running
+ measurements:
+ - Identifier: WaitForControlledPodsRunning
+ Method: WaitForControlledPodsRunning
+ Params:
+ action: gather
+ operationTimeout: 10m
+- name: Measure pod startup latency
+ measurements:
+ - Identifier: PodStartupLatency
+ Method: PodStartupLatency
+ Params:
+ action: gather
+- name: Measure job lifecycle latency
+ measurements:
+ - Identifier: JobLifecycleLatency
+ Method: JobLifecycleLatency
+ Params:
+ action: gather
diff --git a/benchmark/perf-tests/5000-rayjob/pytorch-mnist-rayjob.yaml b/benchmark/perf-tests/5000-rayjob/pytorch-mnist-rayjob.yaml
new file mode 100644
index 00000000000..526ca1106ee
--- /dev/null
+++ b/benchmark/perf-tests/5000-rayjob/pytorch-mnist-rayjob.yaml
@@ -0,0 +1,63 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: rayjob-pytorch-mnist
+spec:
+ shutdownAfterJobFinishes: true
+ entrypoint: python ray_train_pytorch_mnist.py
+ submitterPodTemplate:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: submitter-job
+ image: {{.Image}}
+ command:
+ - "sh"
+ - "-c"
+ args:
+ - |
+ #!/bin/sh
+
+ ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
+ ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"NUM_WORKERS":"2","CPUS_PER_WORKER":"1","OMP_NUM_THREADS":"1"}}' -- python ray_train_pytorch_mnist.py
+ resources:
+ requests:
+ cpu: "10m"
+ rayClusterSpec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ rayStartParams:
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs-server
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
+ workerGroupSpecs:
+ - replicas: 2
+ minReplicas: 1
+ maxReplicas: 5
+ groupName: worker-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
diff --git a/benchmark/perf-tests/5000-rayjob/ray-data-image-resize.yaml b/benchmark/perf-tests/5000-rayjob/ray-data-image-resize.yaml
new file mode 100644
index 00000000000..8c9e2bdc3f3
--- /dev/null
+++ b/benchmark/perf-tests/5000-rayjob/ray-data-image-resize.yaml
@@ -0,0 +1,63 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+ name: {{.Name}}
+ labels:
+ perf-test: ray-data-image-resize
+spec:
+ shutdownAfterJobFinishes: true
+ entrypoint: python ray_data_image_resize.py
+ submitterPodTemplate:
+ spec:
+ restartPolicy: Never
+ containers:
+ - name: submitter-job
+ image: {{.Image}}
+ command:
+ - "sh"
+ - "-c"
+ args:
+ - |
+ #!/bin/sh
+
+ ray job logs $RAY_JOB_SUBMISSION_ID --address=http://$RAY_DASHBOARD_ADDRESS --follow || \
+ ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID --runtime-env-json '{"env_vars":{"BUCKET_NAME":"ray-images","BUCKET_PREFIX":"images"}}' -- python ray_data_image_resize.py
+ resources:
+ requests:
+ cpu: "10m"
+ rayClusterSpec:
+ rayVersion: '2.9.3'
+ headGroupSpec:
+ rayStartParams:
+ disable-usage-stats: 'true'
+ template:
+ spec:
+ containers:
+ - name: ray-head
+ image: {{.Image}}
+ ports:
+ - containerPort: 6379
+ name: gcs-server
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
+ workerGroupSpecs:
+ - replicas: 2
+ minReplicas: 1
+ maxReplicas: 5
+ groupName: worker-group
+ rayStartParams: {}
+ template:
+ spec:
+ containers:
+ - name: ray-worker
+ image: {{.Image}}
+ resources:
+ requests:
+ cpu: "100m"
+ memory: "2Gi"
diff --git a/benchmark/perf-tests/5000-rayjob/results/junit.xml b/benchmark/perf-tests/5000-rayjob/results/junit.xml
new file mode 100644
index 00000000000..03f3a8ef745
--- /dev/null
+++ b/benchmark/perf-tests/5000-rayjob/results/junit.xml
@@ -0,0 +1,12 @@
+<!-- junit.xml: ClusterLoader2 step results for the 5000 RayJob run (original XML content not preserved) -->
diff --git a/benchmark/perf-tests/README.md b/benchmark/perf-tests/README.md
index 6b736eb3f54..8a824bf7a97 100644
--- a/benchmark/perf-tests/README.md
+++ b/benchmark/perf-tests/README.md
@@ -27,7 +27,16 @@ for previously executed runs of the tests.
The current lists of tests are:
* [100 RayCluster test](./100-raycluster/)
* [100 RayJob test](./100-rayjob/)
-
+* [1000 RayCluster test](./1000-raycluster/)
+* [1000 RayJob test](./1000-rayjob/)
+* [5000 RayCluster test](./5000-raycluster/)
+* [5000 RayJob test](./5000-rayjob/)
+* [10000 RayCluster test](./10000-raycluster/)
+* [10000 RayJob test](./10000-rayjob/)
+
+All published results are based on tests that ran on GKE clusters using KubeRay v1.1.1. Each test directory contains a
+`results/junit.xml` file listing the ClusterLoader2 steps that completed successfully.
+To learn more about the benchmark measurements, see [ClusterLoader2 Measurements](https://github.com/kubernetes/perf-tests/tree/master/clusterloader2#measurement).
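+
+For example, to pretty-print the recorded steps from one of these runs (assuming `xmllint` is installed locally):
+
+```bash
+xmllint --format 1000-raycluster/results/junit.xml
+```
+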
## Run a performance test with Kind
diff --git a/benchmark/perf-tests/common/image-preload-daemonset.yaml b/benchmark/perf-tests/common/image-preload-daemonset.yaml
new file mode 100644
index 00000000000..35fc470948d
--- /dev/null
+++ b/benchmark/perf-tests/common/image-preload-daemonset.yaml
@@ -0,0 +1,20 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: ray-image-preloader
+ labels:
+ k8s-app: ray-image-preloader
+spec:
+ selector:
+ matchLabels:
+ k8s-app: ray-image-preloader
+ template:
+ metadata:
+ labels:
+ name: ray-image-preloader
+ k8s-app: ray-image-preloader
+ spec:
+ containers:
+      - image: rayproject/ray:2.9.3 # hard-coded because this manifest is applied directly with kubectl and is not templated by ClusterLoader2
+ name: ray-image-preloader
+ command: [ "sleep", "inf" ]
diff --git a/benchmark/perf-tests/common/preload-image.sh b/benchmark/perf-tests/common/preload-image.sh
new file mode 100644
index 00000000000..319283ebfa1
--- /dev/null
+++ b/benchmark/perf-tests/common/preload-image.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
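+# Preload the Ray container image onto every node with a DaemonSet so that
+# image pulls do not skew the pod startup measurements.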
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+kubectl apply -f "${SCRIPT_DIR}"/image-preload-daemonset.yaml
+
+kubectl rollout status daemonset ray-image-preloader --timeout 25m
diff --git a/benchmark/perf-tests/100-raycluster/wait-for-rayclusters.sh b/benchmark/perf-tests/common/wait-for-rayclusters.sh
similarity index 95%
rename from benchmark/perf-tests/100-raycluster/wait-for-rayclusters.sh
rename to benchmark/perf-tests/common/wait-for-rayclusters.sh
index ea927cea5ed..d8d48c7d9a8 100644
--- a/benchmark/perf-tests/100-raycluster/wait-for-rayclusters.sh
+++ b/benchmark/perf-tests/common/wait-for-rayclusters.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-expect_succeeded=100
+expect_succeeded=$1
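+# Expected number of RayClusters, passed as the first script argument.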
echo "waiting for $expect_succeeded RayClusters to be completed successfully"
while true; do
diff --git a/benchmark/perf-tests/common/wait-for-rayjobs.sh b/benchmark/perf-tests/common/wait-for-rayjobs.sh
new file mode 100644
index 00000000000..eb71818b030
--- /dev/null
+++ b/benchmark/perf-tests/common/wait-for-rayjobs.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+expect_succeeded=$1
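+# Expected number of successful RayJobs per type, passed as the first argument.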
+echo "waiting for $expect_succeeded PyTorch RayJobs to be completed successfully"
+
+while true; do
+  num_succeeded=$(kubectl get rayjob -A -l perf-test=rayjob-pytorch-mnist -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}{end}' | grep -c SUCCEEDED)
+ echo "$num_succeeded RayJobs completed..."
+
+ if [[ "$num_succeeded" == "$expect_succeeded" ]]; then
+ break;
+ fi
+
+ echo "printing RayJobs with Failed deployment status"
+  kubectl get rayjob -A -l perf-test=rayjob-pytorch-mnist -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobDeploymentStatus}{"\n"}{end}' | grep Failed
+
+ echo "printing RayJobs with FAILED job status"
+  kubectl get rayjob -A -l perf-test=rayjob-pytorch-mnist -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}{end}' | grep FAILED
+
+ sleep 30
+done
+
+echo "waiting for $expect_succeeded Ray Data RayJobs to be completed successfully"
+
+while true; do
+  num_succeeded=$(kubectl get rayjob -A -l perf-test=ray-data-image-resize -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}{end}' | grep -c SUCCEEDED)
+ echo "$num_succeeded RayJobs completed..."
+
+ if [[ "$num_succeeded" == "$expect_succeeded" ]]; then
+ break;
+ fi
+
+ echo "printing RayJobs with Failed deployment status"
+  kubectl get rayjob -A -l perf-test=ray-data-image-resize -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobDeploymentStatus}{"\n"}{end}' | grep Failed
+
+ echo "printing RayJobs with FAILED job status"
+  kubectl get rayjob -A -l perf-test=ray-data-image-resize -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}{end}' | grep FAILED
+
+ sleep 30
+done
+
+echo "$num_succeeded RayJobs completed!"