diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml
index c742e5066..9ef733159 100644
--- a/.github/workflows/e2e_tests.yaml
+++ b/.github/workflows/e2e_tests.yaml
@@ -114,6 +114,8 @@ jobs:
kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
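+ # allow the sdk-user to manage the new v1beta2 AppWrapper resources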
+ kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers
+ kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user
kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
diff --git a/docs/cluster-configuration.md b/docs/cluster-configuration.md
index c131b20cd..ae6cd2ead 100644
--- a/docs/cluster-configuration.md
+++ b/docs/cluster-configuration.md
@@ -18,17 +18,13 @@ cluster = Cluster(ClusterConfiguration(
max_cpus=1, # Default 1
min_memory=2, # Default 2
max_memory=2, # Default 2
- mcad=True, # Default True
+ num_gpus=0, # Default 0
image="quay.io/project-codeflare/ray:latest-py39-cu118", # Mandatory Field
machine_types=["m5.xlarge", "g4dn.xlarge"],
labels={"exampleLabel": "example", "secondLabel": "example"},
))
```
-Upon creating a cluster configuration with `mcad=True` an appwrapper will be created featuring the Ray Cluster and any Routes, Ingresses or Secrets that are needed to be created along side it.
-From there a user can call `cluster.up()` and `cluster.down()` to create and remove the appwrapper thus creating and removing the Ray Cluster.
-
-In cases where `mcad=False` a yaml file will be created with the individual Ray Cluster, Route/Ingress and Secret included.
-The Ray Cluster and service will be created by KubeRay directly and the other components will be individually created.
-
The `labels={"exampleLabel": "example"}` parameter can be used to apply additional labels to the RayCluster resource.
+
+After creating their `cluster` object, a user can call `cluster.up()` and `cluster.down()` to create and remove the Ray Cluster, respectively.
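+
+For example, a minimal lifecycle sketch using the `cluster` object created above:
+
+```python
+cluster.up()          # create the Ray Cluster (wrapped in an AppWrapper when appwrapper=True)
+cluster.wait_ready()  # block until the cluster reports ready
+cluster.details()     # print cluster and dashboard details
+cluster.down()        # remove the Ray Cluster again
+```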
diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py
index 97d138d5a..c622f8772 100644
--- a/src/codeflare_sdk/cluster/awload.py
+++ b/src/codeflare_sdk/cluster/awload.py
@@ -62,7 +62,7 @@ def submit(self) -> None:
api_instance = client.CustomObjectsApi(api_config_handler())
api_instance.create_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=self.namespace,
plural="appwrappers",
body=self.awyaml,
@@ -87,7 +87,7 @@ def remove(self) -> None:
api_instance = client.CustomObjectsApi(api_config_handler())
api_instance.delete_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=self.namespace,
plural="appwrappers",
name=self.name,
diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index 6ddd778c0..35c26b0a9 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -103,26 +103,6 @@ def job_client(self):
)
return self._job_submission_client
- def evaluate_dispatch_priority(self):
- priority_class = self.config.dispatch_priority
-
- try:
- config_check()
- api_instance = client.CustomObjectsApi(api_config_handler())
- priority_classes = api_instance.list_cluster_custom_object(
- group="scheduling.k8s.io",
- version="v1",
- plural="priorityclasses",
- )
- except Exception as e: # pragma: no cover
- return _kube_api_error_handling(e)
-
- for pc in priority_classes["items"]:
- if pc["metadata"]["name"] == priority_class:
- return pc["value"]
- print(f"Priority class {priority_class} is not available in the cluster")
- return None
-
def validate_image_config(self):
"""
Validates that the image configuration is not empty.
@@ -152,18 +132,6 @@ def create_app_wrapper(self):
self.validate_image_config()
# Before attempting to create the cluster AW, let's evaluate the ClusterConfig
- if self.config.dispatch_priority:
- if not self.config.mcad:
- raise ValueError(
- "Invalid Cluster Configuration, cannot have dispatch priority without MCAD"
- )
- priority_val = self.evaluate_dispatch_priority()
- if priority_val == None:
- raise ValueError(
- "Invalid Cluster Configuration, AppWrapper not generated"
- )
- else:
- priority_val = None
name = self.config.name
namespace = self.config.namespace
@@ -178,12 +146,10 @@ def create_app_wrapper(self):
workers = self.config.num_workers
template = self.config.template
image = self.config.image
- instascale = self.config.instascale
- mcad = self.config.mcad
+ appwrapper = self.config.appwrapper
instance_types = self.config.machine_types
env = self.config.envs
image_pull_secrets = self.config.image_pull_secrets
- dispatch_priority = self.config.dispatch_priority
write_to_file = self.config.write_to_file
verify_tls = self.config.verify_tls
local_queue = self.config.local_queue
@@ -202,13 +168,10 @@ def create_app_wrapper(self):
workers=workers,
template=template,
image=image,
- instascale=instascale,
- mcad=mcad,
+ appwrapper=appwrapper,
instance_types=instance_types,
env=env,
image_pull_secrets=image_pull_secrets,
- dispatch_priority=dispatch_priority,
- priority_val=priority_val,
write_to_file=write_to_file,
verify_tls=verify_tls,
local_queue=local_queue,
@@ -230,13 +193,13 @@ def up(self):
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
- if self.config.mcad:
+ if self.config.appwrapper:
if self.config.write_to_file:
with open(self.app_wrapper_yaml) as f:
aw = yaml.load(f, Loader=yaml.FullLoader)
api_instance.create_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=namespace,
plural="appwrappers",
body=aw,
@@ -245,7 +208,7 @@ def up(self):
aw = yaml.safe_load(self.app_wrapper_yaml)
api_instance.create_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=namespace,
plural="appwrappers",
body=aw,
@@ -284,10 +247,10 @@ def down(self):
try:
config_check()
api_instance = client.CustomObjectsApi(api_config_handler())
- if self.config.mcad:
+ if self.config.appwrapper:
api_instance.delete_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=namespace,
plural="appwrappers",
name=self.app_wrapper_name,
@@ -306,30 +269,28 @@ def status(
"""
ready = False
status = CodeFlareClusterStatus.UNKNOWN
- if self.config.mcad:
+ if self.config.appwrapper:
# check the app wrapper status
appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
if appwrapper:
if appwrapper.status in [
- AppWrapperStatus.RUNNING,
- AppWrapperStatus.COMPLETED,
- AppWrapperStatus.RUNNING_HOLD_COMPLETION,
+ AppWrapperStatus.RESUMING,
+ AppWrapperStatus.RESETTING,
]:
ready = False
status = CodeFlareClusterStatus.STARTING
elif appwrapper.status in [
AppWrapperStatus.FAILED,
- AppWrapperStatus.DELETED,
]:
ready = False
status = CodeFlareClusterStatus.FAILED # should deleted be separate
return status, ready # exit early, no need to check ray status
elif appwrapper.status in [
- AppWrapperStatus.PENDING,
- AppWrapperStatus.QUEUEING,
+ AppWrapperStatus.SUSPENDED,
+ AppWrapperStatus.SUSPENDING,
]:
ready = False
- if appwrapper.status == AppWrapperStatus.PENDING:
+ if appwrapper.status == AppWrapperStatus.SUSPENDED:
status = CodeFlareClusterStatus.QUEUED
else:
status = CodeFlareClusterStatus.QUEUEING
@@ -501,7 +462,7 @@ def job_logs(self, job_id: str) -> str:
def from_k8_cluster_object(
rc,
- mcad=True,
+ appwrapper=True,
write_to_file=False,
verify_tls=True,
):
@@ -534,11 +495,10 @@ def from_k8_cluster_object(
"resources"
]["limits"]["nvidia.com/gpu"]
),
- instascale=True if machine_types else False,
image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["image"],
- mcad=mcad,
+ appwrapper=appwrapper,
write_to_file=write_to_file,
verify_tls=verify_tls,
local_queue=rc["metadata"]
@@ -597,15 +557,15 @@ def list_all_clusters(namespace: str, print_to_console: bool = True):
return clusters
-def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False):
+def list_all_queued(
+ namespace: str, print_to_console: bool = True, appwrapper: bool = False
+):
"""
Returns (and prints by default) a list of all currently queued-up Ray Clusters
in a given namespace.
"""
- if mcad:
- resources = _get_app_wrappers(
- namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]
- )
+ if appwrapper:
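+        # In v1beta2, a queued AppWrapper reports the Suspended phase.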
+ resources = _get_app_wrappers(namespace, filter=[AppWrapperStatus.SUSPENDED])
if print_to_console:
pretty_print.print_app_wrappers_status(resources)
else:
@@ -675,10 +635,10 @@ def get_cluster(
for rc in rcs["items"]:
if rc["metadata"]["name"] == cluster_name:
- mcad = _check_aw_exists(cluster_name, namespace)
+ appwrapper = _check_aw_exists(cluster_name, namespace)
return Cluster.from_k8_cluster_object(
rc,
- mcad=mcad,
+ appwrapper=appwrapper,
write_to_file=write_to_file,
verify_tls=verify_tls,
)
@@ -721,7 +681,7 @@ def _check_aw_exists(name: str, namespace: str) -> bool:
api_instance = client.CustomObjectsApi(api_config_handler())
aws = api_instance.list_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=namespace,
plural="appwrappers",
)
@@ -781,7 +741,7 @@ def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
api_instance = client.CustomObjectsApi(api_config_handler())
aws = api_instance.list_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=namespace,
plural="appwrappers",
)
@@ -851,7 +811,7 @@ def _get_app_wrappers(
api_instance = client.CustomObjectsApi(api_config_handler())
aws = api_instance.list_namespaced_custom_object(
group="workload.codeflare.dev",
- version="v1beta1",
+ version="v1beta2",
namespace=namespace,
plural="appwrappers",
)
@@ -945,18 +905,14 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
def _map_to_app_wrapper(aw) -> AppWrapper:
- if "status" in aw and "canrun" in aw["status"]:
+ if "status" in aw:
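+        # v1beta2 surfaces lifecycle in status.phase rather than status.state/canrun.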
return AppWrapper(
name=aw["metadata"]["name"],
- status=AppWrapperStatus(aw["status"]["state"].lower()),
- can_run=aw["status"]["canrun"],
- job_state=aw["status"]["queuejobstate"],
+ status=AppWrapperStatus(aw["status"]["phase"].lower()),
)
return AppWrapper(
name=aw["metadata"]["name"],
- status=AppWrapperStatus("queueing"),
- can_run=False,
- job_state="Still adding to queue",
+ status=AppWrapperStatus("suspended"),
)
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index f8010ea92..9e069c376 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -46,12 +46,10 @@ class ClusterConfiguration:
max_memory: typing.Union[int, str] = 2
num_gpus: int = 0
template: str = f"{dir}/templates/base-template.yaml"
- instascale: bool = False
- mcad: bool = False
+ appwrapper: bool = False
envs: dict = field(default_factory=dict)
image: str = ""
image_pull_secrets: list = field(default_factory=list)
- dispatch_priority: str = None
write_to_file: bool = False
verify_tls: bool = True
labels: dict = field(default_factory=dict)
diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py
index e2dcb6522..2547de254 100644
--- a/src/codeflare_sdk/cluster/model.py
+++ b/src/codeflare_sdk/cluster/model.py
@@ -37,16 +37,17 @@ class RayClusterStatus(Enum):
class AppWrapperStatus(Enum):
"""
- Defines the possible reportable states of an AppWrapper.
+ Defines the possible reportable phases of an AppWrapper.
"""
- QUEUEING = "queueing"
- PENDING = "pending"
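+    # Values mirror the lowercased v1beta2 AppWrapper status.phase strings.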
+ SUSPENDED = "suspended"
+ RESUMING = "resuming"
RUNNING = "running"
+ RESETTING = "resetting"
+ SUSPENDING = "suspending"
+ SUCCEEDED = "succeeded"
FAILED = "failed"
- DELETED = "deleted"
- COMPLETED = "completed"
- RUNNING_HOLD_COMPLETION = "runningholdcompletion"
+ TERMINATING = "terminating"
class CodeFlareClusterStatus(Enum):
@@ -91,5 +92,3 @@ class AppWrapper:
name: str
status: AppWrapperStatus
- can_run: bool
- job_state: str
diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml
index 356e3494e..b6a70b2b6 100644
--- a/src/codeflare_sdk/templates/base-template.yaml
+++ b/src/codeflare_sdk/templates/base-template.yaml
@@ -1,254 +1,207 @@
-apiVersion: workload.codeflare.dev/v1beta1
+apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
name: aw-kuberay
namespace: default
- #new addition
- labels:
- orderedinstance: "m4.xlarge_g4dn.xlarge"
spec:
- priority: 9
- resources:
- Items: []
- GenericItems:
- - replicas: 1
- #new addition
- custompodresources:
- - replicas: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- - replicas: 3
- requests:
- cpu: 2
- memory: 12G
- nvidia.com/gpu: 1
- limits:
- cpu: 2
- memory: 12G
- nvidia.com/gpu: 1
- generictemplate:
- # This config demonstrates KubeRay's Ray autoscaler integration.
- # The resource requests and limits in this config are too small for production!
- # For an example with more realistic resource configuration, see
- # ray-cluster.autoscaler.large.yaml.
- apiVersion: ray.io/v1
- kind: RayCluster
- metadata:
- labels:
- workload.codeflare.dev/appwrapper: "aw-kuberay"
- controller-tools.k8s.io: "1.0"
- # A unique identifier for the head node and workers of this cluster.
- name: kuberay-cluster
- # finalizers:
- # - kubernetes
- spec:
- # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
- rayVersion: '2.7.0'
- # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
- # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
- # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
- enableInTreeAutoscaling: false
- # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
- # The example configuration shown below below represents the DEFAULT values.
- # (You may delete autoscalerOptions if the defaults are suitable.)
- autoscalerOptions:
- # upscalingMode is "Default" or "Aggressive."
- # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
- # Default: Upscaling is not rate-limited.
- # Aggressive: An alias for Default; upscaling is not rate-limited.
- upscalingMode: Default
- # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
- idleTimeoutSeconds: 60
- # image optionally overrides the autoscaler's container image.
- # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
- # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
- ## image: "my-repo/my-custom-autoscaler-image:tag"
- # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
- imagePullPolicy: Always
- # resources specifies optional resource request and limit overrides for the autoscaler container.
- # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
- resources:
- limits:
- cpu: "500m"
- memory: "512Mi"
- requests:
- cpu: "500m"
- memory: "512Mi"
- ######################headGroupSpec#################################
- # head group template and specs, (perhaps 'group' is not needed in the name)
- headGroupSpec:
- # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
- serviceType: ClusterIP
- enableIngress: false
- # logical group name, for this called head-group, also can be functional
- # pod type head or worker
- # rayNodeType: head # Not needed since it is under the headgroup
- # the following params are used to complete the ray start: ray start --head --block ...
- rayStartParams:
- # Flag "no-monitor" will be automatically set when autoscaling is enabled.
- dashboard-host: '0.0.0.0'
- block: 'true'
- # num-cpus: '1' # can be auto-completed from the limits
- # Use `resources` to optionally specify custom resource annotations for the Ray node.
- # The value of `resources` is a string-integer mapping.
- # Currently, `resources` must be provided in the specific format demonstrated below:
- # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
- num-gpus: '0'
- #pod template
- template:
- spec:
- #new addition
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: aw-kuberay
- operator: In
- values:
- - "aw-kuberay"
- containers:
- # The Ray head pod
- - name: ray-head
- image: quay.io/project-codeflare/ray:latest-py39-cu118
- imagePullPolicy: Always
- ports:
- - containerPort: 6379
- name: gcs
- - containerPort: 8265
- name: dashboard
- - containerPort: 10001
- name: client
- lifecycle:
- preStop:
- exec:
- command: ["/bin/sh","-c","ray stop"]
- resources:
- limits:
- cpu: 2
- memory: "8G"
- nvidia.com/gpu: 0
- requests:
- cpu: 2
- memory: "8G"
- nvidia.com/gpu: 0
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- volumes:
- - name: odh-trusted-ca-cert
- configMap:
- name: odh-trusted-ca-bundle
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- optional: true
- - name: odh-ca-cert
- configMap:
- name: odh-trusted-ca-bundle
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- optional: true
- workerGroupSpecs:
- # the pod replicas in this group typed worker
- - replicas: 3
- minReplicas: 3
- maxReplicas: 3
- # logical group name, for this called small-group, also can be functional
- groupName: small-group
- # if worker pods need to be added, we can simply increment the replicas
- # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
- # the operator will remove pods from the list until the number of replicas is satisfied
- # when a pod is confirmed to be deleted, its name will be removed from the list below
- #scaleStrategy:
- # workersToDelete:
- # - raycluster-complete-worker-small-group-bdtwh
- # - raycluster-complete-worker-small-group-hv457
- # - raycluster-complete-worker-small-group-k8tj7
- # the following params are used to complete the ray start: ray start --block ...
- rayStartParams:
- block: 'true'
- num-gpus: 1
- #pod template
- template:
- metadata:
- labels:
- key: value
- # annotations for pod
- annotations:
- key: value
- # finalizers:
- # - kubernetes
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: aw-kuberay
- operator: In
- values:
- - "aw-kuberay"
- containers:
- - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
- image: quay.io/project-codeflare/ray:latest-py39-cu118
- # environment variables to set in the container.Optional.
- # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
- lifecycle:
- preStop:
- exec:
- command: ["/bin/sh","-c","ray stop"]
- resources:
- limits:
- cpu: "2"
- memory: "12G"
- nvidia.com/gpu: "1"
- requests:
- cpu: "2"
- memory: "12G"
- nvidia.com/gpu: "1"
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- volumes:
- - name: odh-trusted-ca-cert
- configMap:
- name: odh-trusted-ca-bundle
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- optional: true
- - name: odh-ca-cert
- configMap:
- name: odh-trusted-ca-bundle
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- optional: true
+ components:
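+  # Each component wraps one resource template (here, a RayCluster) that the AppWrapper controller will create.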
+ - template:
+ # This config demonstrates KubeRay's Ray autoscaler integration.
+ # The resource requests and limits in this config are too small for production!
+ # For an example with more realistic resource configuration, see
+ # ray-cluster.autoscaler.large.yaml.
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ metadata:
+ labels:
+ controller-tools.k8s.io: "1.0"
+ # A unique identifier for the head node and workers of this cluster.
+ name: kuberay-cluster
+ # finalizers:
+ # - kubernetes
+ spec:
+ # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
+ rayVersion: '2.7.0'
+ # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
+ # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
+ # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
+ enableInTreeAutoscaling: false
+ # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
+      # The example configuration shown below represents the DEFAULT values.
+ # (You may delete autoscalerOptions if the defaults are suitable.)
+ autoscalerOptions:
+ # upscalingMode is "Default" or "Aggressive."
+ # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
+ # Default: Upscaling is not rate-limited.
+ # Aggressive: An alias for Default; upscaling is not rate-limited.
+ upscalingMode: Default
+ # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
+ idleTimeoutSeconds: 60
+ # image optionally overrides the autoscaler's container image.
+ # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
+ # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
+ ## image: "my-repo/my-custom-autoscaler-image:tag"
+ # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
+ imagePullPolicy: Always
+ # resources specifies optional resource request and limit overrides for the autoscaler container.
+ # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
+ resources:
+ limits:
+ cpu: "500m"
+ memory: "512Mi"
+ requests:
+ cpu: "500m"
+ memory: "512Mi"
+ ######################headGroupSpec#################################
+ # head group template and specs, (perhaps 'group' is not needed in the name)
+ headGroupSpec:
+ # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
+ serviceType: ClusterIP
+ enableIngress: false
+ # logical group name, for this called head-group, also can be functional
+ # pod type head or worker
+ # rayNodeType: head # Not needed since it is under the headgroup
+ # the following params are used to complete the ray start: ray start --head --block ...
+ rayStartParams:
+ # Flag "no-monitor" will be automatically set when autoscaling is enabled.
+ dashboard-host: '0.0.0.0'
+ block: 'true'
+ # num-cpus: '1' # can be auto-completed from the limits
+ # Use `resources` to optionally specify custom resource annotations for the Ray node.
+ # The value of `resources` is a string-integer mapping.
+ # Currently, `resources` must be provided in the specific format demonstrated below:
+ # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
+ num-gpus: '0'
+ #pod template
+ template:
+ spec:
+ containers:
+ # The Ray head pod
+ - name: ray-head
+ image: quay.io/project-codeflare/ray:latest-py39-cu118
+ imagePullPolicy: Always
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ lifecycle:
+ preStop:
+ exec:
+ command: ["/bin/sh","-c","ray stop"]
+ resources:
+ limits:
+ cpu: 2
+ memory: "8G"
+ nvidia.com/gpu: 0
+ requests:
+ cpu: 2
+ memory: "8G"
+ nvidia.com/gpu: 0
+ volumeMounts:
+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ volumes:
+ - name: odh-trusted-ca-cert
+ configMap:
+ name: odh-trusted-ca-bundle
+ items:
+ - key: ca-bundle.crt
+ path: odh-trusted-ca-bundle.crt
+ optional: true
+ - name: odh-ca-cert
+ configMap:
+ name: odh-trusted-ca-bundle
+ items:
+ - key: odh-ca-bundle.crt
+ path: odh-ca-bundle.crt
+ optional: true
+ workerGroupSpecs:
+ # the pod replicas in this group typed worker
+ - replicas: 3
+ minReplicas: 3
+ maxReplicas: 3
+ # logical group name, for this called small-group, also can be functional
+ groupName: small-group
+ # if worker pods need to be added, we can simply increment the replicas
+ # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
+ # the operator will remove pods from the list until the number of replicas is satisfied
+ # when a pod is confirmed to be deleted, its name will be removed from the list below
+ #scaleStrategy:
+ # workersToDelete:
+ # - raycluster-complete-worker-small-group-bdtwh
+ # - raycluster-complete-worker-small-group-hv457
+ # - raycluster-complete-worker-small-group-k8tj7
+ # the following params are used to complete the ray start: ray start --block ...
+ rayStartParams:
+ block: 'true'
+ num-gpus: 1
+ #pod template
+ template:
+ metadata:
+ labels:
+ key: value
+ # annotations for pod
+ annotations:
+ key: value
+ # finalizers:
+ # - kubernetes
+ spec:
+ containers:
+          - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
+ image: quay.io/project-codeflare/ray:latest-py39-cu118
+            # environment variables to set in the container. Optional.
+ # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
+ lifecycle:
+ preStop:
+ exec:
+ command: ["/bin/sh","-c","ray stop"]
+ resources:
+ limits:
+ cpu: "2"
+ memory: "12G"
+ nvidia.com/gpu: "1"
+ requests:
+ cpu: "2"
+ memory: "12G"
+ nvidia.com/gpu: "1"
+ volumeMounts:
+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ volumes:
+ - name: odh-trusted-ca-cert
+ configMap:
+ name: odh-trusted-ca-bundle
+ items:
+ - key: ca-bundle.crt
+ path: odh-trusted-ca-bundle.crt
+ optional: true
+ - name: odh-ca-cert
+ configMap:
+ name: odh-trusted-ca-bundle
+ items:
+ - key: odh-ca-bundle.crt
+ path: odh-ca-bundle.crt
+ optional: true
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index 2ea6dd78d..dcd4a42c4 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -81,109 +81,11 @@ def update_names(yaml, item, appwrapper_name, cluster_name, namespace):
metadata = yaml.get("metadata")
metadata["name"] = appwrapper_name
metadata["namespace"] = namespace
- lower_meta = item.get("generictemplate", {}).get("metadata")
- lower_meta["labels"]["workload.codeflare.dev/appwrapper"] = appwrapper_name
+ lower_meta = item.get("template", {}).get("metadata")
lower_meta["name"] = cluster_name
lower_meta["namespace"] = namespace
-def update_labels(yaml, instascale, instance_types):
- metadata = yaml.get("metadata")
- if instascale:
- if not len(instance_types) > 0:
- sys.exit(
- "If instascale is set to true, must provide at least one instance type"
- )
- type_str = ""
- for type in instance_types:
- type_str += type + "_"
- type_str = type_str[:-1]
- metadata["labels"]["orderedinstance"] = type_str
- else:
- metadata.pop("labels")
-
-
-def update_priority(yaml, item, dispatch_priority, priority_val):
- spec = yaml.get("spec")
- if dispatch_priority is not None:
- if priority_val:
- spec["priority"] = priority_val
- else:
- raise ValueError(
- "AW generation error: Priority value is None, while dispatch_priority is defined"
- )
- head = item.get("generictemplate").get("spec").get("headGroupSpec")
- worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
- head["template"]["spec"]["priorityClassName"] = dispatch_priority
- worker["template"]["spec"]["priorityClassName"] = dispatch_priority
- else:
- spec.pop("priority")
-
-
-def update_custompodresources(
- item,
- min_cpu,
- max_cpu,
- min_memory,
- max_memory,
- gpu,
- workers,
- head_cpus,
- head_memory,
- head_gpus,
-):
- if "custompodresources" in item.keys():
- custompodresources = item.get("custompodresources")
- for i in range(len(custompodresources)):
- resource = custompodresources[i]
- if i == 0:
- # Leave head node resources as template default
- resource["requests"]["cpu"] = head_cpus
- resource["limits"]["cpu"] = head_cpus
- resource["requests"]["memory"] = head_memory
- resource["limits"]["memory"] = head_memory
- resource["requests"]["nvidia.com/gpu"] = head_gpus
- resource["limits"]["nvidia.com/gpu"] = head_gpus
-
- else:
- for k, v in resource.items():
- if k == "replicas" and i == 1:
- resource[k] = workers
- if k == "requests" or k == "limits":
- for spec, _ in v.items():
- if spec == "cpu":
- if k == "limits":
- resource[k][spec] = max_cpu
- else:
- resource[k][spec] = min_cpu
- if spec == "memory":
- if k == "limits":
- resource[k][spec] = max_memory
- else:
- resource[k][spec] = min_memory
- if spec == "nvidia.com/gpu":
- if i == 0:
- resource[k][spec] = 0
- else:
- resource[k][spec] = gpu
- else:
- sys.exit("Error: malformed template")
-
-
-def update_affinity(spec, appwrapper_name, instascale):
- if instascale:
- node_selector_terms = (
- spec.get("affinity")
- .get("nodeAffinity")
- .get("requiredDuringSchedulingIgnoredDuringExecution")
- .get("nodeSelectorTerms")
- )
- node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name
- node_selector_terms[0]["matchExpressions"][0]["key"] = appwrapper_name
- else:
- spec.pop("affinity")
-
-
def update_image(spec, image):
containers = spec.get("containers")
for container in containers:
@@ -232,18 +134,17 @@ def update_nodes(
gpu,
workers,
image,
- instascale,
env,
image_pull_secrets,
head_cpus,
head_memory,
head_gpus,
):
- if "generictemplate" in item.keys():
- head = item.get("generictemplate").get("spec").get("headGroupSpec")
+ if "template" in item.keys():
+ head = item.get("template").get("spec").get("headGroupSpec")
head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
- worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
+ worker = item.get("template").get("spec").get("workerGroupSpecs")[0]
# Head counts as first worker
worker["replicas"] = workers
worker["minReplicas"] = workers
@@ -253,7 +154,6 @@ def update_nodes(
for comp in [head, worker]:
spec = comp.get("template").get("spec")
- update_affinity(spec, appwrapper_name, instascale)
update_image_pull_secrets(spec, image_pull_secrets)
update_image(spec, image)
update_env(spec, env)
@@ -328,74 +228,52 @@ def local_queue_exists(namespace: str, local_queue_name: str):
return False
+def add_queue_label(item: dict, namespace: str, local_queue: Optional[str]):
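+    # Label the resource for Kueue, using the given LocalQueue name or the namespace default.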
+ lq_name = local_queue or get_default_kueue_name(namespace)
+ if not local_queue_exists(namespace, lq_name):
+ raise ValueError(
+ "local_queue provided does not exist or is not in this namespace. Please provide the correct local_queue name in Cluster Configuration"
+ )
+    if "labels" not in item["metadata"]:
+ item["metadata"]["labels"] = {}
+ item["metadata"]["labels"].update({"kueue.x-k8s.io/queue-name": lq_name})
+
+
+def augment_labels(item: dict, labels: dict):
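+    # Merge user-supplied cluster labels into the wrapped template's metadata.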
+ if "template" in item:
+        if "labels" not in item["template"]["metadata"]:
+ item["template"]["metadata"]["labels"] = {}
+ item["template"]["metadata"]["labels"].update(labels)
+
+
def write_components(
user_yaml: dict,
output_file_name: str,
- namespace: str,
- local_queue: Optional[str],
- labels: dict,
):
# Create the directory if it doesn't exist
directory_path = os.path.dirname(output_file_name)
if not os.path.exists(directory_path):
os.makedirs(directory_path)
- components = user_yaml.get("spec", "resources")["resources"].get("GenericItems")
+ components = user_yaml.get("spec", "resources").get("components")
open(output_file_name, "w").close()
- lq_name = local_queue or get_default_kueue_name(namespace)
- cluster_labels = labels
- if not local_queue_exists(namespace, lq_name):
- raise ValueError(
- "local_queue provided does not exist or is not in this namespace. Please provide the correct local_queue name in Cluster Configuration"
- )
with open(output_file_name, "a") as outfile:
for component in components:
- if "generictemplate" in component:
- if (
- "workload.codeflare.dev/appwrapper"
- in component["generictemplate"]["metadata"]["labels"]
- ):
- del component["generictemplate"]["metadata"]["labels"][
- "workload.codeflare.dev/appwrapper"
- ]
- labels = component["generictemplate"]["metadata"]["labels"]
- labels.update({"kueue.x-k8s.io/queue-name": lq_name})
- labels.update(cluster_labels)
+ if "template" in component:
outfile.write("---\n")
- yaml.dump(
- component["generictemplate"], outfile, default_flow_style=False
- )
+ yaml.dump(component["template"], outfile, default_flow_style=False)
print(f"Written to: {output_file_name}")
def load_components(
user_yaml: dict,
name: str,
- namespace: str,
- local_queue: Optional[str],
- labels: dict,
):
component_list = []
- components = user_yaml.get("spec", "resources")["resources"].get("GenericItems")
- lq_name = local_queue or get_default_kueue_name(namespace)
- cluster_labels = labels
- if not local_queue_exists(namespace, lq_name):
- raise ValueError(
- "local_queue provided does not exist or is not in this namespace. Please provide the correct local_queue name in Cluster Configuration"
- )
+ components = user_yaml.get("spec", "resources").get("components")
for component in components:
- if "generictemplate" in component:
- if (
- "workload.codeflare.dev/appwrapper"
- in component["generictemplate"]["metadata"]["labels"]
- ):
- del component["generictemplate"]["metadata"]["labels"][
- "workload.codeflare.dev/appwrapper"
- ]
- labels = component["generictemplate"]["metadata"]["labels"]
- labels.update({"kueue.x-k8s.io/queue-name": lq_name})
- labels.update(cluster_labels)
- component_list.append(component["generictemplate"])
+ if "template" in component:
+ component_list.append(component["template"])
resources = "---\n" + "---\n".join(
[yaml.dump(component) for component in component_list]
@@ -425,13 +303,10 @@ def generate_appwrapper(
workers: int,
template: str,
image: str,
- instascale: bool,
- mcad: bool,
+ appwrapper: bool,
instance_types: list,
env,
image_pull_secrets: list,
- dispatch_priority: str,
- priority_val: int,
write_to_file: bool,
verify_tls: bool,
local_queue: Optional[str],
@@ -440,7 +315,7 @@ def generate_appwrapper(
user_yaml = read_template(template)
appwrapper_name, cluster_name = gen_names(name)
resources = user_yaml.get("spec", "resources")
- item = resources["resources"].get("GenericItems")[0]
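+    # v1beta2 nests wrapped resources under spec.components[].template instead of resources.GenericItems.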
+ item = resources.get("components")[0]
update_names(
user_yaml,
item,
@@ -448,20 +323,6 @@ def generate_appwrapper(
cluster_name,
namespace,
)
- update_labels(user_yaml, instascale, instance_types)
- update_priority(user_yaml, item, dispatch_priority, priority_val)
- update_custompodresources(
- item,
- min_cpu,
- max_cpu,
- min_memory,
- max_memory,
- gpu,
- workers,
- head_cpus,
- head_memory,
- head_gpus,
- )
update_nodes(
item,
appwrapper_name,
@@ -472,7 +333,6 @@ def generate_appwrapper(
gpu,
workers,
image,
- instascale,
env,
image_pull_secrets,
head_cpus,
@@ -480,18 +340,25 @@ def generate_appwrapper(
head_gpus,
)
+ augment_labels(item, labels)
+
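+    # With an AppWrapper, Kueue queues the wrapper itself; otherwise the queue label goes on the RayCluster template.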
+ if appwrapper:
+ add_queue_label(user_yaml, namespace, local_queue)
+ else:
+ add_queue_label(item["template"], namespace, local_queue)
+
directory_path = os.path.expanduser("~/.codeflare/resources/")
outfile = os.path.join(directory_path, appwrapper_name + ".yaml")
if write_to_file:
- if mcad:
+ if appwrapper:
write_user_appwrapper(user_yaml, outfile)
else:
- write_components(user_yaml, outfile, namespace, local_queue, labels)
+ write_components(user_yaml, outfile)
return outfile
else:
- if mcad:
+ if appwrapper:
user_yaml = load_appwrapper(user_yaml, name)
else:
- user_yaml = load_components(user_yaml, name, namespace, local_queue, labels)
+ user_yaml = load_components(user_yaml, name)
return user_yaml
diff --git a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
new file mode 100644
index 000000000..2aa5da16d
--- /dev/null
+++ b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py
@@ -0,0 +1,106 @@
+import requests
+
+from time import sleep
+
+from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
+from codeflare_sdk.job import RayJobClient
+
+import pytest
+
+from support import *
+
+# This test creates an AppWrapper containing a Ray Cluster and covers the Ray Job submission functionality on a Kind cluster
+
+
+@pytest.mark.kind
+class TestRayClusterSDKAppWrapperKind:
+ def setup_method(self):
+ initialize_kubernetes_client(self)
+
+ def teardown_method(self):
+ delete_namespace(self)
+
+ def test_mnist_ray_cluster_sdk_kind(self):
+ self.setup_method()
+ create_namespace(self)
+ create_kueue_resources(self)
+ self.run_mnist_raycluster_sdk_kind()
+
+ def run_mnist_raycluster_sdk_kind(self):
+ ray_image = get_ray_image()
+
+ cluster = Cluster(
+ ClusterConfiguration(
+ name="mnist",
+ namespace=self.namespace,
+ num_workers=1,
+ head_cpus="500m",
+ head_memory=2,
+ min_cpus="500m",
+ max_cpus=1,
+ min_memory=1,
+ max_memory=2,
+ num_gpus=0,
+ image=ray_image,
+ write_to_file=True,
+ verify_tls=False,
+ appwrapper=True,
+ )
+ )
+
+ cluster.up()
+
+ cluster.status()
+
+ cluster.wait_ready()
+
+ cluster.status()
+
+ cluster.details()
+
+ self.assert_jobsubmit_withoutlogin_kind(cluster)
+
+ # Assertions
+
+ def assert_jobsubmit_withoutlogin_kind(self, cluster):
+ ray_dashboard = cluster.cluster_dashboard_uri()
+ client = RayJobClient(address=ray_dashboard, verify=False)
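+        # verify=False mirrors verify_tls=False in the ClusterConfiguration above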
+
+ submission_id = client.submit_job(
+ entrypoint="python mnist.py",
+ runtime_env={
+ "working_dir": "./tests/e2e/",
+ "pip": "./tests/e2e/mnist_pip_requirements.txt",
+ },
+ )
+ print(f"Submitted job with ID: {submission_id}")
+        time = 0
+        timeout = 900
+        while True:
+            status = client.get_job_status(submission_id)
+            if status.is_terminal():
+                break
+            print(status)
+            if timeout and time >= timeout:
+                raise TimeoutError(f"job has timed out after waiting {timeout}s")
+            sleep(5)
+            time += 5
+
+ logs = client.get_job_logs(submission_id)
+ print(logs)
+
+ self.assert_job_completion(status)
+
+ client.delete_job(submission_id)
+
+ cluster.down()
+
+ def assert_job_completion(self, status):
+ if status == "SUCCEEDED":
+ print(f"Job has completed: '{status}'")
+ assert True
+ else:
+            print(f"Job has failed: '{status}'")
+ assert False
diff --git a/tests/e2e/start_ray_cluster.py b/tests/e2e/start_ray_cluster.py
index 8bb185808..957d0c25e 100644
--- a/tests/e2e/start_ray_cluster.py
+++ b/tests/e2e/start_ray_cluster.py
@@ -20,9 +20,8 @@
min_memory=1,
max_memory=2,
num_gpus=0,
- instascale=False,
image=ray_image,
- mcad=True,
+ appwrapper=True,
)
)
diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml
index 6e969e01b..3c5bf076d 100644
--- a/tests/test-case-bad.yaml
+++ b/tests/test-case-bad.yaml
@@ -1,4 +1,4 @@
-apiVersion: workload.codeflare.dev/v1beta1
+apiVersion: workload.codeflare.dev/v1beta2
kind: AppsWrapper
metadata:
labels:
@@ -6,162 +6,105 @@ metadata:
nam: unit-test-cluster
namspace: ns
spec:
- priority: 9
- resources:
- GenericItems:
- - custompodresources:
- - limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- replicas: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- - limits:
- cpu: 4
- memory: 6G
- nvidia.com/gpu: 7
- replicas: 2
- requests:
- cpu: 3
- memory: 5G
- nvidia.com/gpu: 7
- generictemplate:
- apiVersion: ray.io/v1
- kind: RayCluster
- metadata:
- labels:
- workload.codeflare.dev/appwrapper: unit-test-cluster
- controller-tools.k8s.io: '1.0'
- name: unit-test-cluster
- namespace: ns
- spec:
- autoscalerOptions:
- idleTimeoutSeconds: 60
- imagePullPolicy: Always
- resources:
- limits:
- cpu: 500m
- memory: 512Mi
- requests:
- cpu: 500m
- memory: 512Mi
- upscalingMode: Default
- enableInTreeAutoscaling: false
- headGroupSpec:
- rayStartParams:
- block: 'true'
- dashboard-host: 0.0.0.0
- num-gpus: '0'
- serviceType: ClusterIP
- template:
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: unit-test-cluster
- operator: In
- values:
- - unit-test-cluster
- containers:
- - env:
- - name: MY_POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- image: quay.io/project-codeflare/ray:latest-py39-cu118
- imagePullPolicy: Always
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: ray-head
- ports:
- - containerPort: 6379
- name: gcs
- - containerPort: 8265
- name: dashboard
- - containerPort: 10001
- name: client
- resources:
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- rayVersion: 1.12.0
- workerGroupSpecs:
- - groupName: small-group-unit-test-cluster
- maxReplicas: 2
- minReplicas: 2
- rayStartParams:
- block: 'true'
- num-gpus: '7'
- replicas: 2
- template:
- metadata:
- annotations:
- key: value
- labels:
- key: value
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: unit-test-cluster
- operator: In
- values:
- - unit-test-cluster
- containers:
- - env:
- - name: MY_POD_IP
- valueFrom:
- fieldRef:
- fieldPath: status.podIP
- image: quay.io/project-codeflare/ray:latest-py39-cu118
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: machine-learning
- resources:
- limits:
- cpu: 4
- memory: 6G
- nvidia.com/gpu: 7
- requests:
- cpu: 3
- memory: 5G
- nvidia.com/gpu: 7
- replicas: 1
- - generictemplate:
- apiVersion: route.openshift.io/v1
- kind: Route
- metadata:
- labels:
- odh-ray-cluster-service: unit-test-cluster-head-svc
- name: ray-dashboard-unit-test-cluster
- namespace: ns
- spec:
- port:
- targetPort: dashboard
- to:
- kind: Service
- name: unit-test-cluster-head-svc
- replicas: 1
- Items: []
+ components:
+ - template:
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ metadata:
+ labels:
+ controller-tools.k8s.io: '1.0'
+ name: unit-test-cluster
+ namespace: ns
+ spec:
+ autoscalerOptions:
+ idleTimeoutSeconds: 60
+ imagePullPolicy: Always
+ resources:
+ limits:
+ cpu: 500m
+ memory: 512Mi
+ requests:
+ cpu: 500m
+ memory: 512Mi
+ upscalingMode: Default
+ enableInTreeAutoscaling: false
+ headGroupSpec:
+ rayStartParams:
+ block: 'true'
+ dashboard-host: 0.0.0.0
+ num-gpus: '0'
+ serviceType: ClusterIP
+ template:
+ spec:
+ containers:
+ - env:
+ - name: MY_POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.podIP
+ image: quay.io/project-codeflare/ray:latest-py39-cu118
+ imagePullPolicy: Always
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: ray-head
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ limits:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ requests:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ rayVersion: 1.12.0
+ workerGroupSpecs:
+ - groupName: small-group-unit-test-cluster
+ maxReplicas: 2
+ minReplicas: 2
+ rayStartParams:
+ block: 'true'
+ num-gpus: '7'
+ replicas: 2
+ template:
+ metadata:
+ annotations:
+ key: value
+ labels:
+ key: value
+ spec:
+ containers:
+ - env:
+ - name: MY_POD_IP
+ valueFrom:
+ fieldRef:
+ fieldPath: status.podIP
+ image: quay.io/project-codeflare/ray:latest-py39-cu118
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: machine-learning
+ resources:
+ limits:
+ cpu: 4
+ memory: 6G
+ nvidia.com/gpu: 7
+ requests:
+ cpu: 3
+ memory: 5G
+ nvidia.com/gpu: 7
diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls
index 7fcf1fdc4..367703d67 100644
--- a/tests/test-case-no-mcad.yamls
+++ b/tests/test-case-no-mcad.yamls
@@ -31,15 +31,6 @@ spec:
serviceType: ClusterIP
template:
spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: unit-test-cluster-ray
- operator: In
- values:
- - unit-test-cluster-ray
containers:
- image: quay.io/project-codeflare/ray:latest-py39-cu118
imagePullPolicy: Always
@@ -113,15 +104,6 @@ spec:
labels:
key: value
spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: unit-test-cluster-ray
- operator: In
- values:
- - unit-test-cluster-ray
containers:
- image: quay.io/project-codeflare/ray:latest-py39-cu118
lifecycle:
diff --git a/tests/test-case-prio.yaml b/tests/test-case-prio.yaml
deleted file mode 100644
index a4d6e68f2..000000000
--- a/tests/test-case-prio.yaml
+++ /dev/null
@@ -1,205 +0,0 @@
-apiVersion: workload.codeflare.dev/v1beta1
-kind: AppWrapper
-metadata:
- labels:
- orderedinstance: cpu.small_gpu.large
- name: prio-test-cluster
- namespace: ns
-spec:
- priority: 10
- resources:
- GenericItems:
- - custompodresources:
- - limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- replicas: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- - limits:
- cpu: 4
- memory: 6G
- nvidia.com/gpu: 7
- replicas: 2
- requests:
- cpu: 3
- memory: 5G
- nvidia.com/gpu: 7
- generictemplate:
- apiVersion: ray.io/v1
- kind: RayCluster
- metadata:
- labels:
- controller-tools.k8s.io: '1.0'
- workload.codeflare.dev/appwrapper: prio-test-cluster
- name: prio-test-cluster
- namespace: ns
- spec:
- autoscalerOptions:
- idleTimeoutSeconds: 60
- imagePullPolicy: Always
- resources:
- limits:
- cpu: 500m
- memory: 512Mi
- requests:
- cpu: 500m
- memory: 512Mi
- upscalingMode: Default
- enableInTreeAutoscaling: false
- headGroupSpec:
- enableIngress: false
- rayStartParams:
- block: 'true'
- dashboard-host: 0.0.0.0
- num-gpus: '0'
- serviceType: ClusterIP
- template:
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: prio-test-cluster
- operator: In
- values:
- - prio-test-cluster
- containers:
- - image: quay.io/project-codeflare/ray:latest-py39-cu118
- imagePullPolicy: Always
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: ray-head
- ports:
- - containerPort: 6379
- name: gcs
- - containerPort: 8265
- name: dashboard
- - containerPort: 10001
- name: client
- resources:
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- imagePullSecrets:
- - name: unit-test-pull-secret
- priorityClassName: default
- volumes:
- - configMap:
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
- name: odh-trusted-ca-cert
- - configMap:
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
- name: odh-ca-cert
- rayVersion: 2.7.0
- workerGroupSpecs:
- - groupName: small-group-prio-test-cluster
- maxReplicas: 2
- minReplicas: 2
- rayStartParams:
- block: 'true'
- num-gpus: '7'
- replicas: 2
- template:
- metadata:
- annotations:
- key: value
- labels:
- key: value
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: prio-test-cluster
- operator: In
- values:
- - prio-test-cluster
- containers:
- - image: quay.io/project-codeflare/ray:latest-py39-cu118
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: machine-learning
- resources:
- limits:
- cpu: 4
- memory: 6G
- nvidia.com/gpu: 7
- requests:
- cpu: 3
- memory: 5G
- nvidia.com/gpu: 7
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- imagePullSecrets:
- - name: unit-test-pull-secret
- priorityClassName: default
- volumes:
- - configMap:
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
- name: odh-trusted-ca-cert
- - configMap:
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
- name: odh-ca-cert
- replicas: 1
- Items: []
diff --git a/tests/test-case.yaml b/tests/test-case.yaml
index b97d12a49..98166b372 100644
--- a/tests/test-case.yaml
+++ b/tests/test-case.yaml
@@ -1,202 +1,161 @@
-apiVersion: workload.codeflare.dev/v1beta1
+apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
labels:
- orderedinstance: cpu.small_gpu.large
+ kueue.x-k8s.io/queue-name: local-queue-default
name: unit-test-cluster
namespace: ns
spec:
- resources:
- GenericItems:
- - custompodresources:
- - limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- replicas: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- - limits:
- cpu: 4
- memory: 6G
- nvidia.com/gpu: 7
- replicas: 2
- requests:
- cpu: 3
- memory: 5G
- nvidia.com/gpu: 7
- generictemplate:
- apiVersion: ray.io/v1
- kind: RayCluster
- metadata:
- labels:
- controller-tools.k8s.io: '1.0'
- workload.codeflare.dev/appwrapper: unit-test-cluster
- name: unit-test-cluster
- namespace: ns
- spec:
- autoscalerOptions:
- idleTimeoutSeconds: 60
- imagePullPolicy: Always
- resources:
- limits:
- cpu: 500m
- memory: 512Mi
- requests:
- cpu: 500m
- memory: 512Mi
- upscalingMode: Default
- enableInTreeAutoscaling: false
- headGroupSpec:
- enableIngress: false
- rayStartParams:
- block: 'true'
- dashboard-host: 0.0.0.0
- num-gpus: '0'
- serviceType: ClusterIP
- template:
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: unit-test-cluster
- operator: In
- values:
- - unit-test-cluster
- containers:
- - image: quay.io/project-codeflare/ray:latest-py39-cu118
- imagePullPolicy: Always
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: ray-head
- ports:
- - containerPort: 6379
- name: gcs
- - containerPort: 8265
- name: dashboard
- - containerPort: 10001
- name: client
- resources:
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- imagePullSecrets:
- - name: unit-test-pull-secret
- volumes:
- - configMap:
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ components:
+ - template:
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ metadata:
+ labels:
+ controller-tools.k8s.io: '1.0'
+ name: unit-test-cluster
+ namespace: ns
+ spec:
+ autoscalerOptions:
+ idleTimeoutSeconds: 60
+ imagePullPolicy: Always
+ resources:
+ limits:
+ cpu: 500m
+ memory: 512Mi
+ requests:
+ cpu: 500m
+ memory: 512Mi
+ upscalingMode: Default
+ enableInTreeAutoscaling: false
+ headGroupSpec:
+ enableIngress: false
+ rayStartParams:
+ block: 'true'
+ dashboard-host: 0.0.0.0
+ num-gpus: '0'
+ serviceType: ClusterIP
+ template:
+ spec:
+ containers:
+ - image: quay.io/project-codeflare/ray:latest-py39-cu118
+ imagePullPolicy: Always
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: ray-head
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ limits:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ requests:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ volumeMounts:
+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
- - configMap:
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
- rayVersion: 2.7.0
- workerGroupSpecs:
- - groupName: small-group-unit-test-cluster
- maxReplicas: 2
- minReplicas: 2
- rayStartParams:
- block: 'true'
- num-gpus: '7'
- replicas: 2
- template:
- metadata:
- annotations:
- key: value
- labels:
- key: value
- spec:
- affinity:
- nodeAffinity:
- requiredDuringSchedulingIgnoredDuringExecution:
- nodeSelectorTerms:
- - matchExpressions:
- - key: unit-test-cluster
- operator: In
- values:
- - unit-test-cluster
- containers:
- - image: quay.io/project-codeflare/ray:latest-py39-cu118
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: machine-learning
- resources:
- limits:
- cpu: 4
- memory: 6G
- nvidia.com/gpu: 7
- requests:
- cpu: 3
- memory: 5G
- nvidia.com/gpu: 7
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- imagePullSecrets:
- - name: unit-test-pull-secret
- volumes:
- - configMap:
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ subPath: odh-ca-bundle.crt
+ imagePullSecrets:
+ - name: unit-test-pull-secret
+ volumes:
+ - configMap:
+ items:
+ - key: ca-bundle.crt
+ path: odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-trusted-ca-cert
+ - configMap:
+ items:
+ - key: odh-ca-bundle.crt
+ path: odh-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-ca-cert
+ rayVersion: 2.7.0
+ workerGroupSpecs:
+ - groupName: small-group-unit-test-cluster
+ maxReplicas: 2
+ minReplicas: 2
+ rayStartParams:
+ block: 'true'
+ num-gpus: '7'
+ replicas: 2
+ template:
+ metadata:
+ annotations:
+ key: value
+ labels:
+ key: value
+ spec:
+ containers:
+ - image: quay.io/project-codeflare/ray:latest-py39-cu118
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: machine-learning
+ resources:
+ limits:
+ cpu: 4
+ memory: 6G
+ nvidia.com/gpu: 7
+ requests:
+ cpu: 3
+ memory: 5G
+ nvidia.com/gpu: 7
+ volumeMounts:
+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
- - configMap:
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
- replicas: 1
- Items: []
+ subPath: odh-ca-bundle.crt
+ imagePullSecrets:
+ - name: unit-test-pull-secret
+ volumes:
+ - configMap:
+ items:
+ - key: ca-bundle.crt
+ path: odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-trusted-ca-cert
+ - configMap:
+ items:
+ - key: odh-ca-bundle.crt
+ path: odh-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-ca-cert
diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml
index c390f619b..f754c1763 100644
--- a/tests/test-default-appwrapper.yaml
+++ b/tests/test-default-appwrapper.yaml
@@ -1,180 +1,159 @@
-apiVersion: workload.codeflare.dev/v1beta1
+apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
+ labels:
+ kueue.x-k8s.io/queue-name: local-queue-default
name: unit-test-default-cluster
namespace: opendatahub
spec:
- resources:
- GenericItems:
- - custompodresources:
- - limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- replicas: 1
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- - limits:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 0
- replicas: 1
- requests:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 0
- generictemplate:
- apiVersion: ray.io/v1
- kind: RayCluster
- metadata:
- labels:
- controller-tools.k8s.io: '1.0'
- workload.codeflare.dev/appwrapper: unit-test-default-cluster
- name: unit-test-default-cluster
- namespace: opendatahub
- spec:
- autoscalerOptions:
- idleTimeoutSeconds: 60
- imagePullPolicy: Always
- resources:
- limits:
- cpu: 500m
- memory: 512Mi
- requests:
- cpu: 500m
- memory: 512Mi
- upscalingMode: Default
- enableInTreeAutoscaling: false
- headGroupSpec:
- enableIngress: false
- rayStartParams:
- block: 'true'
- dashboard-host: 0.0.0.0
- num-gpus: '0'
- serviceType: ClusterIP
- template:
- spec:
- containers:
- - image: quay.io/project-codeflare/ray:latest-py39-cu118
- imagePullPolicy: Always
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: ray-head
- ports:
- - containerPort: 6379
- name: gcs
- - containerPort: 8265
- name: dashboard
- - containerPort: 10001
- name: client
- resources:
- limits:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- requests:
- cpu: 2
- memory: 8G
- nvidia.com/gpu: 0
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- imagePullSecrets: []
- volumes:
- - configMap:
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ components:
+ - template:
+ apiVersion: ray.io/v1
+ kind: RayCluster
+ metadata:
+ labels:
+ controller-tools.k8s.io: '1.0'
+ name: unit-test-default-cluster
+ namespace: opendatahub
+ spec:
+ autoscalerOptions:
+ idleTimeoutSeconds: 60
+ imagePullPolicy: Always
+ resources:
+ limits:
+ cpu: 500m
+ memory: 512Mi
+ requests:
+ cpu: 500m
+ memory: 512Mi
+ upscalingMode: Default
+ enableInTreeAutoscaling: false
+ headGroupSpec:
+ enableIngress: false
+ rayStartParams:
+ block: 'true'
+ dashboard-host: 0.0.0.0
+ num-gpus: '0'
+ serviceType: ClusterIP
+ template:
+ spec:
+ containers:
+ - image: quay.io/project-codeflare/ray:latest-py39-cu118
+ imagePullPolicy: Always
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: ray-head
+ ports:
+ - containerPort: 6379
+ name: gcs
+ - containerPort: 8265
+ name: dashboard
+ - containerPort: 10001
+ name: client
+ resources:
+ limits:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ requests:
+ cpu: 2
+ memory: 8G
+ nvidia.com/gpu: 0
+ volumeMounts:
+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
- - configMap:
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
- rayVersion: 2.7.0
- workerGroupSpecs:
- - groupName: small-group-unit-test-default-cluster
- maxReplicas: 1
- minReplicas: 1
- rayStartParams:
- block: 'true'
- num-gpus: '0'
- replicas: 1
- template:
- metadata:
- annotations:
- key: value
- labels:
- key: value
- spec:
- containers:
- - image: quay.io/project-codeflare/ray:latest-py39-cu118
- lifecycle:
- preStop:
- exec:
- command:
- - /bin/sh
- - -c
- - ray stop
- name: machine-learning
- resources:
- limits:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 0
- requests:
- cpu: 1
- memory: 2G
- nvidia.com/gpu: 0
- volumeMounts:
- - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-cert
- subPath: odh-trusted-ca-bundle.crt
- - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
- name: odh-ca-cert
- subPath: odh-ca-bundle.crt
- imagePullSecrets: []
- volumes:
- - configMap:
- items:
- - key: ca-bundle.crt
- path: odh-trusted-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ subPath: odh-ca-bundle.crt
+ imagePullSecrets: []
+ volumes:
+ - configMap:
+ items:
+ - key: ca-bundle.crt
+ path: odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-trusted-ca-cert
+ - configMap:
+ items:
+ - key: odh-ca-bundle.crt
+ path: odh-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-ca-cert
+ rayVersion: 2.7.0
+ workerGroupSpecs:
+ - groupName: small-group-unit-test-default-cluster
+ maxReplicas: 1
+ minReplicas: 1
+ rayStartParams:
+ block: 'true'
+ num-gpus: '0'
+ replicas: 1
+ template:
+ metadata:
+ annotations:
+ key: value
+ labels:
+ key: value
+ spec:
+ containers:
+ - image: quay.io/project-codeflare/ray:latest-py39-cu118
+ lifecycle:
+ preStop:
+ exec:
+ command:
+ - /bin/sh
+ - -c
+ - ray stop
+ name: machine-learning
+ resources:
+ limits:
+ cpu: 1
+ memory: 2G
+ nvidia.com/gpu: 0
+ requests:
+ cpu: 1
+ memory: 2G
+ nvidia.com/gpu: 0
+ volumeMounts:
+ - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt
name: odh-trusted-ca-cert
- - configMap:
- items:
- - key: odh-ca-bundle.crt
- path: odh-ca-bundle.crt
- name: odh-trusted-ca-bundle
- optional: true
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-cert
+ subPath: odh-trusted-ca-bundle.crt
+ - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt
+ name: odh-ca-cert
+ subPath: odh-ca-bundle.crt
+ - mountPath: /etc/ssl/certs/odh-ca-bundle.crt
name: odh-ca-cert
- replicas: 1
- Items: []
+ subPath: odh-ca-bundle.crt
+ imagePullSecrets: []
+ volumes:
+ - configMap:
+ items:
+ - key: ca-bundle.crt
+ path: odh-trusted-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-trusted-ca-cert
+ - configMap:
+ items:
+ - key: odh-ca-bundle.crt
+ path: odh-ca-bundle.crt
+ name: odh-trusted-ca-bundle
+ optional: true
+ name: odh-ca-cert
diff --git a/tests/unit_test.py b/tests/unit_test.py
index 1fe139de5..32d730c4d 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -265,15 +265,18 @@ def test_config_creation():
assert config.num_gpus == 7
assert config.image == "quay.io/project-codeflare/ray:latest-py39-cu118"
assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml"
- assert config.instascale
assert config.machine_types == ["cpu.small", "gpu.large"]
assert config.image_pull_secrets == ["unit-test-pull-secret"]
- assert config.dispatch_priority == None
- assert config.mcad == True
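+    # v1beta2: the appwrapper flag replaces mcad; instascale and dispatch_priority are gone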
+    assert config.appwrapper
def test_cluster_creation(mocker):
+    # Create an AppWrapper containing a Ray Cluster with no local queue specified
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
+ )
cluster = createClusterWithConfig(mocker)
assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster.yaml"
assert cluster.app_wrapper_name == "unit-test-cluster"
@@ -345,8 +348,8 @@ def test_cluster_creation_no_mcad(mocker):
config = createClusterConfig()
config.name = "unit-test-cluster-ray"
config.write_to_file = True
- config.mcad = False
config.labels = {"testlabel": "test", "testlabel2": "test"}
+ config.appwrapper = False
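+    # with appwrapper=False the SDK should emit a standalone RayCluster yaml rather than an AppWrapper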
cluster = Cluster(config)
assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml"
@@ -372,7 +375,7 @@ def test_cluster_creation_no_mcad_local_queue(mocker):
)
config = createClusterConfig()
config.name = "unit-test-cluster-ray"
- config.mcad = False
+ config.appwrapper = False
config.write_to_file = True
config.local_queue = "local-queue-default"
config.labels = {"testlabel": "test", "testlabel2": "test"}
@@ -394,12 +397,11 @@ def test_cluster_creation_no_mcad_local_queue(mocker):
min_memory=5,
max_memory=6,
num_gpus=7,
- instascale=True,
machine_types=["cpu.small", "gpu.large"],
image_pull_secrets=["unit-test-pull-secret"],
image="quay.io/project-codeflare/ray:latest-py39-cu118",
write_to_file=True,
- mcad=False,
+ appwrapper=False,
local_queue="local-queue-default",
labels={"testlabel": "test", "testlabel2": "test"},
)
@@ -413,40 +415,20 @@ def test_cluster_creation_no_mcad_local_queue(mocker):
)
-def test_cluster_creation_priority(mocker):
- mocker.patch("kubernetes.client.ApisApi.get_api_versions")
- mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
- mocker.patch(
- "kubernetes.client.CustomObjectsApi.list_cluster_custom_object",
- return_value={"items": [{"metadata": {"name": "default"}, "value": 10}]},
- )
- config = createClusterConfig()
- config.name = "prio-test-cluster"
- config.dispatch_priority = "default"
- mocker.patch(
- "kubernetes.client.CustomObjectsApi.get_cluster_custom_object",
- return_value={"spec": {"domain": "apps.cluster.awsroute.org"}},
- )
- cluster = Cluster(config)
- assert cluster.app_wrapper_yaml == f"{aw_dir}prio-test-cluster.yaml"
- assert cluster.app_wrapper_name == "prio-test-cluster"
- assert filecmp.cmp(
- f"{aw_dir}prio-test-cluster.yaml",
- f"{parent}/tests/test-case-prio.yaml",
- shallow=True,
- )
-
-
def test_default_cluster_creation(mocker):
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
mocker.patch(
"codeflare_sdk.cluster.cluster.get_current_namespace",
return_value="opendatahub",
)
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
+ )
default_config = ClusterConfiguration(
name="unit-test-default-cluster",
image="quay.io/project-codeflare/ray:latest-py39-cu118",
- mcad=True,
+ appwrapper=True,
)
cluster = Cluster(default_config)
test_aw = yaml.load(cluster.app_wrapper_yaml, Loader=yaml.FullLoader)
@@ -485,7 +467,7 @@ def arg_check_apply_effect(group, version, namespace, plural, body, *args):
assert args == tuple()
if plural == "appwrappers":
assert group == "workload.codeflare.dev"
- assert version == "v1beta1"
+ assert version == "v1beta2"
with open(f"{aw_dir}unit-test-cluster.yaml") as f:
aw = yaml.load(f, Loader=yaml.FullLoader)
assert body == aw
@@ -522,7 +504,7 @@ def arg_check_del_effect(group, version, namespace, plural, name, *args):
assert args == tuple()
if plural == "appwrappers":
assert group == "workload.codeflare.dev"
- assert version == "v1beta1"
+ assert version == "v1beta2"
assert name == "unit-test-cluster"
elif plural == "rayclusters":
assert group == "ray.io"
@@ -554,6 +536,10 @@ def test_cluster_up_down(mocker):
"kubernetes.client.CustomObjectsApi.list_cluster_custom_object",
return_value={"items": []},
)
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
+ )
    cluster = createClusterWithConfig(mocker)
cluster.up()
cluster.down()
@@ -587,7 +573,7 @@ def test_cluster_up_down_no_mcad(mocker):
)
config = createClusterConfig()
config.name = "unit-test-cluster-ray"
- config.mcad = False
+ config.appwrapper = False
cluster = Cluster(config)
cluster.up()
cluster.down()
@@ -616,7 +602,7 @@ def test_get_ingress_domain(mocker):
def aw_status_fields(group, version, namespace, plural, *args):
assert group == "workload.codeflare.dev"
- assert version == "v1beta1"
+ assert version == "v1beta2"
assert namespace == "test-ns"
assert plural == "appwrappers"
assert args == tuple()
@@ -659,6 +645,10 @@ def test_cluster_uris(mocker):
"codeflare_sdk.cluster.cluster._get_ingress_domain",
return_value="apps.cluster.awsroute.org",
)
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
+ )
    cluster = createClusterWithConfig(mocker)
mocker.patch(
"kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
@@ -781,6 +771,10 @@ def ingress_retrieval(
def test_ray_job_wrapping(mocker):
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
+ mocker.patch(
+ "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
+ return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
+ )
    cluster = createClusterWithConfig(mocker)
cluster.config.image = "quay.io/project-codeflare/ray:latest-py39-cu118"
mocker.patch(
@@ -843,15 +837,11 @@ def test_print_no_cluster(capsys):
def test_print_appwrappers(capsys):
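+    # v1beta2 AppWrappers carry a single status phase; can_run and job_state no longer exist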
aw1 = AppWrapper(
name="awtest1",
- status=AppWrapperStatus.PENDING,
- can_run=False,
- job_state="queue-state",
+ status=AppWrapperStatus.SUSPENDED,
)
aw2 = AppWrapper(
name="awtest2",
status=AppWrapperStatus.RUNNING,
- can_run=False,
- job_state="queue-state",
)
try:
print_app_wrappers_status([aw1, aw2])
@@ -859,18 +849,18 @@ def test_print_appwrappers(capsys):
assert 1 == 0
captured = capsys.readouterr()
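+    # the expected box widens by two columns because "suspended" is longer than "pending"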
assert captured.out == (
- "╭───────────────────────╮\n"
- "│ 🚀 Cluster Queue │\n"
- "│ Status 🚀 │\n"
- "│ +---------+---------+ │\n"
- "│ | Name | Status | │\n"
- "│ +=========+=========+ │\n"
- "│ | awtest1 | pending | │\n"
- "│ | | | │\n"
- "│ | awtest2 | running | │\n"
- "│ | | | │\n"
- "│ +---------+---------+ │\n"
- "╰───────────────────────╯\n"
+ "╭─────────────────────────╮\n"
+ "│ 🚀 Cluster Queue │\n"
+ "│ Status 🚀 │\n"
+ "│ +---------+-----------+ │\n"
+ "│ | Name | Status | │\n"
+ "│ +=========+===========+ │\n"
+ "│ | awtest1 | suspended | │\n"
+ "│ | | | │\n"
+ "│ | awtest2 | running | │\n"
+ "│ | | | │\n"
+ "│ +---------+-----------+ │\n"
+ "╰─────────────────────────╯\n"
)
@@ -898,13 +888,18 @@ def test_ray_details(mocker, capsys):
"codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri",
return_value="",
)
+ mocker.patch(
+ "codeflare_sdk.utils.generate_yaml.local_queue_exists",
+ return_value="true",
+ )
cf = Cluster(
ClusterConfiguration(
name="raytest2",
namespace="ns",
image="quay.io/project-codeflare/ray:latest-py39-cu118",
write_to_file=True,
- mcad=True,
+ appwrapper=True,
+ local_queue="local_default_queue",
)
)
captured = capsys.readouterr()
@@ -1023,118 +1018,16 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
"creationTimestamp": "2024-03-05T09:55:37Z",
"generation": 1,
"labels": {
- "appwrapper.mcad.ibm.com": "quicktest",
"controller-tools.k8s.io": "1.0",
"resourceName": "quicktest",
- "workload.codeflare.dev/appwrapper": "quicktest",
"orderedinstance": "m4.xlarge_g4dn.xlarge",
"kueue.x-k8s.io/queue-name": "team-a-queue",
},
- "managedFields": [
- {
- "apiVersion": "ray.io/v1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:metadata": {
- "f:labels": {
- ".": {},
- "f:appwrapper.mcad.ibm.com": {},
- "f:controller-tools.k8s.io": {},
- "f:resourceName": {},
- "f:workload.codeflare.dev/appwrapper": {},
- },
- "f:ownerReferences": {
- ".": {},
- 'k:{"uid":"a29b1a7a-0992-4860-a8d5-a689a751a3e8"}': {},
- },
- },
- "f:spec": {
- ".": {},
- "f:autoscalerOptions": {
- ".": {},
- "f:idleTimeoutSeconds": {},
- "f:imagePullPolicy": {},
- "f:resources": {
- ".": {},
- "f:limits": {
- ".": {},
- "f:cpu": {},
- "f:memory": {},
- },
- "f:requests": {
- ".": {},
- "f:cpu": {},
- "f:memory": {},
- },
- },
- "f:upscalingMode": {},
- },
- "f:enableInTreeAutoscaling": {},
- "f:headGroupSpec": {
- ".": {},
- "f:rayStartParams": {
- ".": {},
- "f:block": {},
- "f:dashboard-host": {},
- "f:num-gpus": {},
- },
- "f:serviceType": {},
- "f:template": {
- ".": {},
- "f:spec": {
- ".": {},
- "f:affinity": {
- ".": {},
- "f:nodeAffinity": {
- ".": {},
- "f:requiredDuringSchedulingIgnoredDuringExecution": {},
- },
- },
- "f:imagePullSecrets": {},
- "f:volumes": {},
- },
- },
- },
- "f:rayVersion": {},
- "f:workerGroupSpecs": {},
- },
- },
- "manager": "codeflare-operator",
- "operation": "Update",
- "time": "2024-03-05T09:55:37Z",
- },
- {
- "apiVersion": "ray.io/v1alpha1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:status": {
- ".": {},
- "f:desiredWorkerReplicas": {},
- "f:endpoints": {
- ".": {},
- "f:client": {},
- "f:dashboard": {},
- "f:gcs": {},
- "f:metrics": {},
- },
- "f:head": {".": {}, "f:serviceIP": {}},
- "f:lastUpdateTime": {},
- "f:maxWorkerReplicas": {},
- "f:minWorkerReplicas": {},
- "f:observedGeneration": {},
- }
- },
- "manager": "manager",
- "operation": "Update",
- "subresource": "status",
- "time": "2024-03-05T09:55:37Z",
- },
- ],
"name": "quicktest",
"namespace": "ns",
"ownerReferences": [
{
- "apiVersion": "workload.codeflare.dev/v1beta1",
+ "apiVersion": "workload.codeflare.dev/v1beta2",
"blockOwnerDeletion": True,
"controller": True,
"kind": "AppWrapper",
@@ -1166,23 +1059,6 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
"template": {
"metadata": {},
"spec": {
- "affinity": {
- "nodeAffinity": {
- "requiredDuringSchedulingIgnoredDuringExecution": {
- "nodeSelectorTerms": [
- {
- "matchExpressions": [
- {
- "key": "quicktest",
- "operator": "In",
- "values": ["quicktest"],
- }
- ]
- }
- ]
- }
- }
- },
"containers": [
{
"env": [
@@ -1321,23 +1197,6 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
"labels": {"key": "value"},
},
"spec": {
- "affinity": {
- "nodeAffinity": {
- "requiredDuringSchedulingIgnoredDuringExecution": {
- "nodeSelectorTerms": [
- {
- "matchExpressions": [
- {
- "key": "quicktest",
- "operator": "In",
- "values": ["quicktest"],
- }
- ]
- }
- ]
- }
- }
- },
"containers": [
{
"env": [
@@ -1468,103 +1327,15 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
"creationTimestamp": "2023-02-22T16:26:07Z",
"generation": 1,
"labels": {
- "workload.codeflare.dev/appwrapper": "quicktest2",
"controller-tools.k8s.io": "1.0",
"resourceName": "quicktest2",
"orderedinstance": "m4.xlarge_g4dn.xlarge",
},
- "managedFields": [
- {
- "apiVersion": "ray.io/v1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:metadata": {
- "f:labels": {
- ".": {},
- "f:workload.codeflare.dev/appwrapper": {},
- "f:controller-tools.k8s.io": {},
- "f:resourceName": {},
- },
- "f:ownerReferences": {
- ".": {},
- 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {},
- },
- },
- "f:spec": {
- ".": {},
- "f:autoscalerOptions": {
- ".": {},
- "f:idleTimeoutSeconds": {},
- "f:imagePullPolicy": {},
- "f:resources": {
- ".": {},
- "f:limits": {
- ".": {},
- "f:cpu": {},
- "f:memory": {},
- },
- "f:requests": {
- ".": {},
- "f:cpu": {},
- "f:memory": {},
- },
- },
- "f:upscalingMode": {},
- },
- "f:enableInTreeAutoscaling": {},
- "f:headGroupSpec": {
- ".": {},
- "f:rayStartParams": {
- ".": {},
- "f:block": {},
- "f:dashboard-host": {},
- "f:num-gpus": {},
- },
- "f:serviceType": {},
- "f:template": {
- ".": {},
- "f:spec": {".": {}, "f:containers": {}},
- },
- },
- "f:rayVersion": {},
- "f:workerGroupSpecs": {},
- },
- },
- "manager": "mcad-controller",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
- },
- {
- "apiVersion": "ray.io/v1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:status": {
- ".": {},
- "f:availableWorkerReplicas": {},
- "f:desiredWorkerReplicas": {},
- "f:endpoints": {
- ".": {},
- "f:client": {},
- "f:dashboard": {},
- "f:gcs": {},
- },
- "f:lastUpdateTime": {},
- "f:maxWorkerReplicas": {},
- "f:minWorkerReplicas": {},
- "f:state": {},
- }
- },
- "manager": "manager",
- "operation": "Update",
- "subresource": "status",
- "time": "2023-02-22T16:26:16Z",
- },
- ],
"name": "quicktest2",
"namespace": "ns",
"ownerReferences": [
{
- "apiVersion": "workload.codeflare.dev/v1beta1",
+ "apiVersion": "workload.codeflare.dev/v1beta2",
"blockOwnerDeletion": True,
"controller": True,
"kind": "AppWrapper",
@@ -1723,7 +1494,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
def get_named_aw(group, version, namespace, plural, name):
- aws = get_aw_obj("workload.codeflare.dev", "v1beta1", "ns", "appwrappers")
+ aws = get_aw_obj("workload.codeflare.dev", "v1beta2", "ns", "appwrappers")
return aws["items"][0]
@@ -1731,144 +1502,128 @@ def get_aw_obj(group, version, namespace, plural):
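+    # v1beta2 shape: spec.components[*].template replaces spec.resources.GenericItems[*].generictemplate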
api_obj1 = {
"items": [
{
- "apiVersion": "workload.codeflare.dev/v1beta1",
+ "apiVersion": "workload.codeflare.dev/v1beta2",
"kind": "AppWrapper",
"metadata": {
- "annotations": {
- "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"codeflare.dev/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1","kind":"RayCluster","metadata":{"labels":{"appwrapper.codeflare.dev":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n'
- },
- "creationTimestamp": "2023-02-22T16:26:07Z",
- "generation": 4,
- "managedFields": [
- {
- "apiVersion": "workload.codeflare.dev/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:spec": {
- "f:resources": {
- "f:GenericItems": {},
- "f:metadata": {},
- },
- "f:schedulingSpec": {},
- "f:service": {".": {}, "f:spec": {}},
- },
- "f:status": {
- ".": {},
- "f:canrun": {},
- "f:conditions": {},
- "f:controllerfirsttimestamp": {},
- "f:filterignore": {},
- "f:queuejobstate": {},
- "f:sender": {},
- "f:state": {},
- "f:systempriority": {},
- },
- },
- "manager": "Go-http-client",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
- },
- {
- "apiVersion": "workload.codeflare.dev/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:metadata": {
- "f:annotations": {
- ".": {},
- "f:kubectl.kubernetes.io/last-applied-configuration": {},
- }
- },
- "f:spec": {
- ".": {},
- "f:priority": {},
- "f:resources": {".": {}, "f:Items": {}},
- },
- },
- "manager": "kubectl-client-side-apply",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
- },
- ],
"name": "quicktest1",
"namespace": "ns",
- "resourceVersion": "9482384",
- "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
},
"spec": {
- "priority": 9,
- "resources": {
- "GenericItems": [
- {
- "allocated": 0,
- "custompodresources": [
- {
- "limits": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
- },
+ "components": [
+ {
+ "template": {
+ "apiVersion": "ray.io/v1",
+ "kind": "RayCluster",
+ "metadata": {
+ "labels": {
+ "controller-tools.k8s.io": "1.0",
},
- {
- "limits": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
+ "name": "quicktest1",
+ "namespace": "ns",
+ },
+ "spec": {
+ "autoscalerOptions": {
+ "idleTimeoutSeconds": 60,
+ "imagePullPolicy": "Always",
+ "resources": {
+ "limits": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
+ "requests": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
},
+ "upscalingMode": "Default",
},
- ],
- "generictemplate": {
- "apiVersion": "ray.io/v1",
- "kind": "RayCluster",
- "metadata": {
- "labels": {
- "workload.codeflare.dev/appwrapper": "quicktest1",
- "controller-tools.k8s.io": "1.0",
+ "enableInTreeAutoscaling": False,
+ "headGroupSpec": {
+ "rayStartParams": {
+ "block": "true",
+ "dashboard-host": "0.0.0.0",
+ "num-gpus": "0",
},
- "name": "quicktest1",
- "namespace": "ns",
- },
- "spec": {
- "autoscalerOptions": {
- "idleTimeoutSeconds": 60,
- "imagePullPolicy": "Always",
- "resources": {
- "limits": {
- "cpu": "500m",
- "memory": "512Mi",
- },
- "requests": {
- "cpu": "500m",
- "memory": "512Mi",
- },
- },
- "upscalingMode": "Default",
+ "serviceType": "ClusterIP",
+ "template": {
+ "spec": {
+ "containers": [
+ {
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "imagePullPolicy": "Always",
+ "lifecycle": {
+ "preStop": {
+ "exec": {
+ "command": [
+ "/bin/sh",
+ "-c",
+ "ray stop",
+ ]
+ }
+ }
+ },
+ "name": "ray-head",
+ "ports": [
+ {
+ "containerPort": 6379,
+ "name": "gcs",
+ },
+ {
+ "containerPort": 8265,
+ "name": "dashboard",
+ },
+ {
+ "containerPort": 10001,
+ "name": "client",
+ },
+ ],
+ "resources": {
+ "limits": {
+ "cpu": 2,
+ "memory": "8G",
+ "nvidia.com/gpu": 0,
+ },
+ "requests": {
+ "cpu": 2,
+ "memory": "8G",
+ "nvidia.com/gpu": 0,
+ },
+ },
+ }
+ ]
+ }
},
- "enableInTreeAutoscaling": False,
- "headGroupSpec": {
+ },
+ "rayVersion": "1.12.0",
+ "workerGroupSpecs": [
+ {
+ "groupName": "small-group-quicktest",
+ "maxReplicas": 1,
+ "minReplicas": 1,
"rayStartParams": {
"block": "true",
- "dashboard-host": "0.0.0.0",
"num-gpus": "0",
},
- "serviceType": "ClusterIP",
+ "replicas": 1,
"template": {
+ "metadata": {
+ "annotations": {"key": "value"},
+ "labels": {"key": "value"},
+ },
"spec": {
"containers": [
{
+ "env": [
+ {
+ "name": "MY_POD_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ },
+ }
+ ],
"image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
- "imagePullPolicy": "Always",
"lifecycle": {
"preStop": {
"exec": {
@@ -1880,322 +1635,190 @@ def get_aw_obj(group, version, namespace, plural):
}
}
},
- "name": "ray-head",
- "ports": [
- {
- "containerPort": 6379,
- "name": "gcs",
- },
- {
- "containerPort": 8265,
- "name": "dashboard",
- },
- {
- "containerPort": 10001,
- "name": "client",
- },
- ],
+ "name": "machine-learning",
"resources": {
"limits": {
- "cpu": 2,
- "memory": "8G",
+ "cpu": 1,
+ "memory": "2G",
"nvidia.com/gpu": 0,
},
"requests": {
- "cpu": 2,
- "memory": "8G",
+ "cpu": 1,
+ "memory": "2G",
"nvidia.com/gpu": 0,
},
},
}
- ]
- }
- },
- },
- "rayVersion": "1.12.0",
- "workerGroupSpecs": [
- {
- "groupName": "small-group-quicktest",
- "maxReplicas": 1,
- "minReplicas": 1,
- "rayStartParams": {
- "block": "true",
- "num-gpus": "0",
- },
- "replicas": 1,
- "template": {
- "metadata": {
- "annotations": {"key": "value"},
- "labels": {"key": "value"},
- },
- "spec": {
- "containers": [
- {
- "env": [
- {
- "name": "MY_POD_IP",
- "valueFrom": {
- "fieldRef": {
- "fieldPath": "status.podIP"
- }
- },
- }
- ],
- "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
- "lifecycle": {
- "preStop": {
- "exec": {
- "command": [
- "/bin/sh",
- "-c",
- "ray stop",
- ]
- }
- }
- },
- "name": "machine-learning",
- "resources": {
- "limits": {
- "cpu": 1,
- "memory": "2G",
- "nvidia.com/gpu": 0,
- },
- "requests": {
- "cpu": 1,
- "memory": "2G",
- "nvidia.com/gpu": 0,
- },
- },
- }
- ],
- },
+ ],
},
- }
- ],
- },
+ },
+ }
+ ],
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
- "replicas": 1,
},
- {
- "allocated": 0,
- "generictemplate": {
- "apiVersion": "networking.k8s.io/v1",
- "kind": "Ingress",
- "metadata": {
- "labels": {
- "ingress-owner": "appwrapper-name",
- },
- "name": "ray-dashboard-quicktest",
- "namespace": "default",
+ },
+ {
+ "template": {
+ "apiVersion": "networking.k8s.io/v1",
+ "kind": "Ingress",
+ "metadata": {
+ "labels": {
+ "ingress-owner": "appwrapper-name",
},
- "spec": {
- "ingressClassName": "nginx",
- "rules": [
- {
- "http": {
- "paths": {
- "backend": {
- "service": {
- "name": "quicktest-head-svc",
- "port": {
- "number": 8265
- },
- },
+ "name": "ray-dashboard-quicktest",
+ "namespace": "default",
+ },
+ "spec": {
+ "ingressClassName": "nginx",
+ "rules": [
+ {
+ "http": {
+ "paths": {
+ "backend": {
+ "service": {
+ "name": "quicktest-head-svc",
+ "port": {"number": 8265},
},
- "pathType": "Prefix",
- "path": "/",
},
+ "pathType": "Prefix",
+ "path": "/",
},
- "host": "quicktest.awsroute.com",
- }
- ],
- },
+ },
+ "host": "quicktest.awsroute.com",
+ }
+ ],
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
},
- ],
- "Items": [],
- "metadata": {},
- },
- "schedulingSpec": {},
- "service": {"spec": {}},
- },
- "status": {
- "canrun": True,
- "conditions": [
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z",
- "status": "True",
- "type": "Init",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z",
- "reason": "AwaitingHeadOfLine",
- "status": "True",
- "type": "Queueing",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z",
- "reason": "AppWrapperRunnable",
- "status": "True",
- "type": "Dispatched",
},
],
- "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z",
- "filterignore": True,
- "queuejobstate": "Dispatched",
- "sender": "before manageQueueJob - afterEtcdDispatching",
- "state": "Running",
- "systempriority": 9,
+ },
+ "status": {
+ "phase": "Running",
},
},
{
- "apiVersion": "workload.codeflare.dev/v1beta1",
+ "apiVersion": "workload.codeflare.dev/v1beta2",
"kind": "AppWrapper",
"metadata": {
- "annotations": {
- "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"codeflare.dev/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1","kind":"RayCluster","metadata":{"labels":{"appwrapper.codeflare.dev":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n'
- },
- "creationTimestamp": "2023-02-22T16:26:07Z",
- "generation": 4,
- "managedFields": [
- {
- "apiVersion": "workload.codeflare.dev/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:spec": {
- "f:resources": {
- "f:GenericItems": {},
- "f:metadata": {},
- },
- "f:schedulingSpec": {},
- "f:service": {".": {}, "f:spec": {}},
- },
- "f:status": {
- ".": {},
- "f:canrun": {},
- "f:conditions": {},
- "f:controllerfirsttimestamp": {},
- "f:filterignore": {},
- "f:queuejobstate": {},
- "f:sender": {},
- "f:state": {},
- "f:systempriority": {},
- },
- },
- "manager": "Go-http-client",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
- },
- {
- "apiVersion": "workload.codeflare.dev/v1beta1",
- "fieldsType": "FieldsV1",
- "fieldsV1": {
- "f:metadata": {
- "f:annotations": {
- ".": {},
- "f:kubectl.kubernetes.io/last-applied-configuration": {},
- }
- },
- "f:spec": {
- ".": {},
- "f:priority": {},
- "f:resources": {".": {}, "f:Items": {}},
- },
- },
- "manager": "kubectl-client-side-apply",
- "operation": "Update",
- "time": "2023-02-22T16:26:07Z",
- },
- ],
"name": "quicktest2",
"namespace": "ns",
- "resourceVersion": "9482384",
- "uid": "6334fc1b-471e-4876-8e7b-0b2277679235",
},
"spec": {
- "priority": 9,
- "resources": {
- "GenericItems": [
- {
- "allocated": 0,
- "custompodresources": [
- {
- "limits": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "2",
- "memory": "8G",
- "nvidia.com/gpu": "0",
- },
+ "components": [
+ {
+ "template": {
+ "apiVersion": "ray.io/v1",
+ "kind": "RayCluster",
+ "metadata": {
+ "labels": {
+ "controller-tools.k8s.io": "1.0",
},
- {
- "limits": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
- },
- "replicas": 1,
- "requests": {
- "cpu": "1",
- "memory": "2G",
- "nvidia.com/gpu": "0",
+ "name": "quicktest2",
+ "namespace": "ns",
+ },
+ "spec": {
+ "autoscalerOptions": {
+ "idleTimeoutSeconds": 60,
+ "imagePullPolicy": "Always",
+ "resources": {
+ "limits": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
+ "requests": {
+ "cpu": "500m",
+ "memory": "512Mi",
+ },
},
+ "upscalingMode": "Default",
},
- ],
- "generictemplate": {
- "apiVersion": "ray.io/v1",
- "kind": "RayCluster",
- "metadata": {
- "labels": {
- "workload.codeflare.dev/appwrapper": "quicktest2",
- "controller-tools.k8s.io": "1.0",
+ "enableInTreeAutoscaling": False,
+ "headGroupSpec": {
+ "rayStartParams": {
+ "block": "true",
+ "dashboard-host": "0.0.0.0",
+ "num-gpus": "0",
},
- "name": "quicktest2",
- "namespace": "ns",
- },
- "spec": {
- "autoscalerOptions": {
- "idleTimeoutSeconds": 60,
- "imagePullPolicy": "Always",
- "resources": {
- "limits": {
- "cpu": "500m",
- "memory": "512Mi",
- },
- "requests": {
- "cpu": "500m",
- "memory": "512Mi",
- },
- },
- "upscalingMode": "Default",
+ "serviceType": "ClusterIP",
+ "template": {
+ "spec": {
+ "containers": [
+ {
+ "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
+ "imagePullPolicy": "Always",
+ "lifecycle": {
+ "preStop": {
+ "exec": {
+ "command": [
+ "/bin/sh",
+ "-c",
+ "ray stop",
+ ]
+ }
+ }
+ },
+ "name": "ray-head",
+ "ports": [
+ {
+ "containerPort": 6379,
+ "name": "gcs",
+ },
+ {
+ "containerPort": 8265,
+ "name": "dashboard",
+ },
+ {
+ "containerPort": 10001,
+ "name": "client",
+ },
+ ],
+ "resources": {
+ "limits": {
+ "cpu": 2,
+ "memory": "8G",
+ "nvidia.com/gpu": 0,
+ },
+ "requests": {
+ "cpu": 2,
+ "memory": "8G",
+ "nvidia.com/gpu": 0,
+ },
+ },
+ }
+ ]
+ }
},
- "enableInTreeAutoscaling": False,
- "headGroupSpec": {
+ },
+ "rayVersion": "1.12.0",
+ "workerGroupSpecs": [
+ {
+ "groupName": "small-group-quicktest",
+ "maxReplicas": 1,
+ "minReplicas": 1,
"rayStartParams": {
"block": "true",
- "dashboard-host": "0.0.0.0",
"num-gpus": "0",
},
- "serviceType": "ClusterIP",
+ "replicas": 1,
"template": {
+ "metadata": {
+ "annotations": {"key": "value"},
+ "labels": {"key": "value"},
+ },
"spec": {
"containers": [
{
+ "env": [
+ {
+ "name": "MY_POD_IP",
+ "valueFrom": {
+ "fieldRef": {
+ "fieldPath": "status.podIP"
+ }
+ },
+ }
+ ],
"image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
- "imagePullPolicy": "Always",
"lifecycle": {
"preStop": {
"exec": {
@@ -2207,166 +1830,52 @@ def get_aw_obj(group, version, namespace, plural):
}
}
},
- "name": "ray-head",
- "ports": [
- {
- "containerPort": 6379,
- "name": "gcs",
- },
- {
- "containerPort": 8265,
- "name": "dashboard",
- },
- {
- "containerPort": 10001,
- "name": "client",
- },
- ],
+ "name": "machine-learning",
"resources": {
"limits": {
- "cpu": 2,
- "memory": "8G",
+ "cpu": 1,
+ "memory": "2G",
"nvidia.com/gpu": 0,
},
"requests": {
- "cpu": 2,
- "memory": "8G",
+ "cpu": 1,
+ "memory": "2G",
"nvidia.com/gpu": 0,
},
},
}
- ]
- }
- },
- },
- "rayVersion": "1.12.0",
- "workerGroupSpecs": [
- {
- "groupName": "small-group-quicktest",
- "maxReplicas": 1,
- "minReplicas": 1,
- "rayStartParams": {
- "block": "true",
- "num-gpus": "0",
+ ],
},
- "replicas": 1,
- "template": {
- "metadata": {
- "annotations": {"key": "value"},
- "labels": {"key": "value"},
- },
- "spec": {
- "containers": [
- {
- "env": [
- {
- "name": "MY_POD_IP",
- "valueFrom": {
- "fieldRef": {
- "fieldPath": "status.podIP"
- }
- },
- }
- ],
- "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103",
- "lifecycle": {
- "preStop": {
- "exec": {
- "command": [
- "/bin/sh",
- "-c",
- "ray stop",
- ]
- }
- }
- },
- "name": "machine-learning",
- "resources": {
- "limits": {
- "cpu": 1,
- "memory": "2G",
- "nvidia.com/gpu": 0,
- },
- "requests": {
- "cpu": 1,
- "memory": "2G",
- "nvidia.com/gpu": 0,
- },
- },
- }
- ],
- },
- },
- }
- ],
- },
+ },
+ }
+ ],
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
- "replicas": 1,
},
- {
- "allocated": 0,
- "generictemplate": {
- "apiVersion": "route.openshift.io/v1",
- "kind": "Route",
- "metadata": {
- "labels": {
- "odh-ray-cluster-service": "quicktest-head-svc"
- },
- "name": "ray-dashboard-quicktest",
- "namespace": "default",
+ },
+ {
+ "template": {
+ "apiVersion": "route.openshift.io/v1",
+ "kind": "Route",
+ "metadata": {
+ "labels": {
+ "odh-ray-cluster-service": "quicktest-head-svc"
},
- "spec": {
- "port": {"targetPort": "dashboard"},
- "to": {
- "kind": "Service",
- "name": "quicktest-head-svc",
- },
+ "name": "ray-dashboard-quicktest",
+ "namespace": "default",
+ },
+ "spec": {
+ "port": {"targetPort": "dashboard"},
+ "to": {
+ "kind": "Service",
+ "name": "quicktest-head-svc",
},
},
- "metadata": {},
- "priority": 0,
- "priorityslope": 0,
},
- ],
- "Items": [],
- "metadata": {},
- },
- "schedulingSpec": {},
- "service": {"spec": {}},
- },
- "status": {
- "canrun": True,
- "conditions": [
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z",
- "status": "True",
- "type": "Init",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z",
- "reason": "AwaitingHeadOfLine",
- "status": "True",
- "type": "Queueing",
- },
- {
- "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z",
- "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z",
- "reason": "AppWrapperRunnable",
- "status": "True",
- "type": "Dispatched",
},
],
- "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z",
- "filterignore": True,
- "queuejobstate": "Dispatched",
- "sender": "before manageQueueJob - afterEtcdDispatching",
- "state": "Pending",
- "systempriority": 9,
+ },
+ "status": {
+ "phase": "Suspended",
},
},
]
@@ -2428,6 +1937,10 @@ def test_get_cluster_openshift(mocker):
MagicMock(versions=[MagicMock(group_version="route.openshift.io/v1")])
]
mocker.patch("kubernetes.client.ApisApi", return_value=mock_api)
+ mocker.patch(
+ "codeflare_sdk.utils.generate_yaml.local_queue_exists",
+ return_value="true",
+ )
assert is_openshift_cluster()
@@ -2437,7 +1950,7 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):
elif plural == "rayclusters":
return get_ray_obj("ray.io", "v1", "ns", "rayclusters")
elif plural == "appwrappers":
- return get_aw_obj("workload.codeflare.dev", "v1beta1", "ns", "appwrappers")
+ return get_aw_obj("workload.codeflare.dev", "v1beta2", "ns", "appwrappers")
elif plural == "localqueues":
return get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues")
@@ -2492,6 +2005,10 @@ def test_get_cluster(mocker):
"kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
return_value=ingress_retrieval(cluster_name="quicktest", client_ing=True),
)
+ mocker.patch(
+ "codeflare_sdk.utils.generate_yaml.local_queue_exists",
+ return_value="true",
+ )
cluster = get_cluster("quicktest")
cluster_config = cluster.config
assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns"
@@ -2502,7 +2019,6 @@ def test_get_cluster(mocker):
assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
assert cluster_config.num_gpus == 0
- assert cluster_config.instascale
assert (
cluster_config.image
== "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
@@ -2521,6 +2037,10 @@ def test_get_cluster_no_mcad(mocker):
"kubernetes.client.NetworkingV1Api.list_namespaced_ingress",
return_value=ingress_retrieval(cluster_name="quicktest", client_ing=True),
)
+ mocker.patch(
+ "codeflare_sdk.utils.generate_yaml.local_queue_exists",
+ return_value="true",
+ )
cluster = get_cluster("quicktest")
cluster_config = cluster.config
assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns"
@@ -2531,7 +2051,6 @@ def test_get_cluster_no_mcad(mocker):
assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
assert cluster_config.num_gpus == 0
- assert cluster_config.instascale
assert (
cluster_config.image
== "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
@@ -2672,7 +2191,7 @@ def test_list_queue(mocker, capsys):
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
side_effect=get_obj_none,
)
- list_all_queued("ns", mcad=True)
+ list_all_queued("ns", appwrapper=True)
captured = capsys.readouterr()
assert captured.out == (
"╭──────────────────────────────────────────────────────────────────────────────╮\n"
@@ -2683,21 +2202,21 @@ def test_list_queue(mocker, capsys):
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
side_effect=get_aw_obj,
)
- list_all_queued("ns", mcad=True)
+ list_all_queued("ns", appwrapper=True)
captured = capsys.readouterr()
assert captured.out == (
- "╭──────────────────────────╮\n"
- "│ 🚀 Cluster Queue Status │\n"
- "│ 🚀 │\n"
- "│ +------------+---------+ │\n"
- "│ | Name | Status | │\n"
- "│ +============+=========+ │\n"
- "│ | quicktest1 | running | │\n"
- "│ | | | │\n"
- "│ | quicktest2 | pending | │\n"
- "│ | | | │\n"
- "│ +------------+---------+ │\n"
- "╰──────────────────────────╯\n"
+ "╭────────────────────────────╮\n"
+ "│ 🚀 Cluster Queue Status │\n"
+ "│ 🚀 │\n"
+ "│ +------------+-----------+ │\n"
+ "│ | Name | Status | │\n"
+ "│ +============+===========+ │\n"
+ "│ | quicktest1 | running | │\n"
+ "│ | | | │\n"
+ "│ | quicktest2 | suspended | │\n"
+ "│ | | | │\n"
+ "│ +------------+-----------+ │\n"
+ "╰────────────────────────────╯\n"
)
@@ -2747,9 +2266,11 @@ def test_list_queue_rayclusters(mocker, capsys):
def test_cluster_status(mocker):
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
- fake_aw = AppWrapper(
- "test", AppWrapperStatus.FAILED, can_run=True, job_state="unused"
+ mocker.patch(
+ "codeflare_sdk.utils.generate_yaml.local_queue_exists",
+ return_value="true",
)
+ fake_aw = AppWrapper("test", AppWrapperStatus.FAILED)
fake_ray = RayCluster(
name="test",
status=RayClusterStatus.UNKNOWN,
@@ -2770,7 +2291,8 @@ def test_cluster_status(mocker):
namespace="ns",
image="quay.io/project-codeflare/ray:latest-py39-cu118",
write_to_file=True,
- mcad=True,
+ appwrapper=True,
+ local_queue="local_default_queue",
)
)
mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
@@ -2786,29 +2308,24 @@ def test_cluster_status(mocker):
assert status == CodeFlareClusterStatus.FAILED
assert ready == False
- fake_aw.status = AppWrapperStatus.DELETED
- status, ready = cf.status()
- assert status == CodeFlareClusterStatus.FAILED
- assert ready == False
-
- fake_aw.status = AppWrapperStatus.PENDING
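+    # v1beta2 phase mapping asserted below: Suspended -> QUEUED, Resuming/Resetting -> STARTING, Running -> UNKNOWN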
+ fake_aw.status = AppWrapperStatus.SUSPENDED
status, ready = cf.status()
assert status == CodeFlareClusterStatus.QUEUED
assert ready == False
- fake_aw.status = AppWrapperStatus.COMPLETED
+ fake_aw.status = AppWrapperStatus.RESUMING
status, ready = cf.status()
assert status == CodeFlareClusterStatus.STARTING
assert ready == False
- fake_aw.status = AppWrapperStatus.RUNNING_HOLD_COMPLETION
+ fake_aw.status = AppWrapperStatus.RESETTING
status, ready = cf.status()
assert status == CodeFlareClusterStatus.STARTING
assert ready == False
fake_aw.status = AppWrapperStatus.RUNNING
status, ready = cf.status()
- assert status == CodeFlareClusterStatus.STARTING
+ assert status == CodeFlareClusterStatus.UNKNOWN
assert ready == False
mocker.patch(
@@ -2844,6 +2361,10 @@ def test_wait_ready(mocker, capsys):
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None)
+ mocker.patch(
+ "codeflare_sdk.utils.generate_yaml.local_queue_exists",
+ return_value="true",
+ )
mocker.patch.object(
client.CustomObjectsApi,
"list_namespaced_custom_object",
@@ -2865,7 +2386,8 @@ def test_wait_ready(mocker, capsys):
namespace="ns",
image="quay.io/project-codeflare/ray:latest-py39-cu118",
write_to_file=True,
- mcad=True,
+ appwrapper=True,
+ local_queue="local-queue-default",
)
)
try:
@@ -2937,7 +2459,7 @@ def test_AWManager_creation(mocker):
def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args):
assert group == "workload.codeflare.dev"
- assert version == "v1beta1"
+ assert version == "v1beta2"
assert namespace == "ns"
assert plural == "appwrappers"
with open(f"{aw_dir}test.yaml") as f:
@@ -2948,7 +2470,7 @@ def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args):
def arg_check_aw_del_effect(group, version, namespace, plural, name, *args):
assert group == "workload.codeflare.dev"
- assert version == "v1beta1"
+ assert version == "v1beta2"
assert namespace == "ns"
assert plural == "appwrappers"
assert name == "test"
@@ -3255,7 +2777,6 @@ def test_rjc_list_jobs(ray_job_client, mocker):
# Make sure to always keep this function last
def test_cleanup():
os.remove(f"{aw_dir}unit-test-cluster.yaml")
- os.remove(f"{aw_dir}prio-test-cluster.yaml")
os.remove(f"{aw_dir}test.yaml")
os.remove(f"{aw_dir}raytest2.yaml")
os.remove(f"{aw_dir}unit-test-cluster-ray.yaml")
diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py
index 329df45ed..baa14aaca 100644
--- a/tests/unit_test_support.py
+++ b/tests/unit_test_support.py
@@ -14,8 +14,7 @@ def createClusterConfig():
min_memory=5,
max_memory=6,
num_gpus=7,
- mcad=True,
- instascale=True,
+ appwrapper=True,
machine_types=["cpu.small", "gpu.large"],
image_pull_secrets=["unit-test-pull-secret"],
image="quay.io/project-codeflare/ray:latest-py39-cu118",