From c0f7d7fa36e5c03724aa2bfa62be12289b1108f6 Mon Sep 17 00:00:00 2001 From: David Grove Date: Wed, 5 Jun 2024 05:38:17 -0400 Subject: [PATCH] Migrate from MCAD to AppWrapper v1beta2 (#521) * rename mcad to appwrapper * remove dispatch_priority (not supported by v1beta2 AppWrapper) * remove instascale * remove priority/affinity from template -- not compatible with Kueue * make mocked objects easier to maintain by removing unnecessary metadata * port appwrapper status to v1beta2 names * prune mocked appwrappers * eliminate dependency on workload.codeflare.dev/appwrapper label * Finish converting AppWrappers to v1beta2 * fix incomplete rebase * rebase: remove instascale from new testcase * add e2e test for appwrapper containing a raycluster * Also must add local_queue label to appwrappers * user labels should also be added to ray cluster wrapped in appwrapper * fix more incorrect test cases that were assuming that appwrappers don't get a localqueue * sdk_user must have rbacs to create appwrappers for e2e test to succeed * elide AppWrappers from top-level documentation --- .github/workflows/e2e_tests.yaml | 2 + docs/cluster-configuration.md | 10 +- src/codeflare_sdk/cluster/awload.py | 4 +- src/codeflare_sdk/cluster/cluster.py | 100 +- src/codeflare_sdk/cluster/config.py | 4 +- src/codeflare_sdk/cluster/model.py | 15 +- .../templates/base-template.yaml | 451 +++--- src/codeflare_sdk/utils/generate_yaml.py | 215 +-- .../e2e/mnist_raycluster_sdk_aw_kind_test.py | 106 ++ tests/e2e/start_ray_cluster.py | 3 +- tests/test-case-bad.yaml | 263 ++-- tests/test-case-no-mcad.yamls | 18 - tests/test-case-prio.yaml | 205 --- tests/test-case.yaml | 343 ++--- tests/test-default-appwrapper.yaml | 321 ++--- tests/unit_test.py | 1215 +++++------------ tests/unit_test_support.py | 3 +- 17 files changed, 1166 insertions(+), 2112 deletions(-) create mode 100644 tests/e2e/mnist_raycluster_sdk_aw_kind_test.py delete mode 100644 tests/test-case-prio.yaml diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index c742e5066..9ef733159 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -114,6 +114,8 @@ jobs: kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user + kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers + kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues diff --git a/docs/cluster-configuration.md b/docs/cluster-configuration.md index c131b20cd..ae6cd2ead 100644 --- a/docs/cluster-configuration.md +++ b/docs/cluster-configuration.md @@ -18,17 +18,13 @@ cluster = Cluster(ClusterConfiguration( max_cpus=1, # Default 1 min_memory=2, # Default 2 max_memory=2, # Default 2 - mcad=True, # Default True + num_gpus=0, # Default 0 image="quay.io/project-codeflare/ray:latest-py39-cu118", # Mandatory Field machine_types=["m5.xlarge", "g4dn.xlarge"], 
labels={"exampleLabel": "example", "secondLabel": "example"}, )) ``` -Upon creating a cluster configuration with `mcad=True` an appwrapper will be created featuring the Ray Cluster and any Routes, Ingresses or Secrets that are needed to be created along side it.
-From there a user can call `cluster.up()` and `cluster.down()` to create and remove the appwrapper thus creating and removing the Ray Cluster. - -In cases where `mcad=False` a yaml file will be created with the individual Ray Cluster, Route/Ingress and Secret included.
-The Ray Cluster and service will be created by KubeRay directly and the other components will be individually created.
-
 The `labels={"exampleLabel": "example"}` parameter can be used to apply additional labels to the RayCluster resource.
+
+After creating their `cluster`, a user can call `cluster.up()` and `cluster.down()` to create and remove the Ray Cluster, respectively.
diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py
index 97d138d5a..c622f8772 100644
--- a/src/codeflare_sdk/cluster/awload.py
+++ b/src/codeflare_sdk/cluster/awload.py
@@ -62,7 +62,7 @@ def submit(self) -> None:
             api_instance = client.CustomObjectsApi(api_config_handler())
             api_instance.create_namespaced_custom_object(
                 group="workload.codeflare.dev",
-                version="v1beta1",
+                version="v1beta2",
                 namespace=self.namespace,
                 plural="appwrappers",
                 body=self.awyaml,
@@ -87,7 +87,7 @@ def remove(self) -> None:
             api_instance = client.CustomObjectsApi(api_config_handler())
             api_instance.delete_namespaced_custom_object(
                 group="workload.codeflare.dev",
-                version="v1beta1",
+                version="v1beta2",
                 namespace=self.namespace,
                 plural="appwrappers",
                 name=self.name,
diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index 6ddd778c0..35c26b0a9 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -103,26 +103,6 @@ def job_client(self):
         )
         return self._job_submission_client
 
-    def evaluate_dispatch_priority(self):
-        priority_class = self.config.dispatch_priority
-
-        try:
-            config_check()
-            api_instance = client.CustomObjectsApi(api_config_handler())
-            priority_classes = api_instance.list_cluster_custom_object(
-                group="scheduling.k8s.io",
-                version="v1",
-                plural="priorityclasses",
-            )
-        except Exception as e:  # pragma: no cover
-            return _kube_api_error_handling(e)
-
-        for pc in priority_classes["items"]:
-            if pc["metadata"]["name"] == priority_class:
-                return pc["value"]
-        print(f"Priority class {priority_class} is not available in the cluster")
-        return None
-
     def validate_image_config(self):
         """
        Validates that the image configuration is not empty.
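For context, the user-facing flow after this migration reduces to the calls below. A minimal sketch of the new `appwrapper=True` path — the cluster name, namespace, and sizing are illustrative, and it assumes a default Kueue LocalQueue exists in the target namespace (otherwise pass `local_queue` explicitly):

```python
from codeflare_sdk import Cluster, ClusterConfiguration

# appwrapper=True replaces the former mcad=True flag: the RayCluster is
# wrapped in a workload.codeflare.dev/v1beta2 AppWrapper and queued via Kueue.
cluster = Cluster(ClusterConfiguration(
    name="demo",          # illustrative
    namespace="default",  # illustrative
    num_workers=1,
    image="quay.io/project-codeflare/ray:latest-py39-cu118",  # mandatory field
    appwrapper=True,
))

cluster.up()          # create the AppWrapper (phase Suspended maps to QUEUED)
cluster.wait_ready()  # block until the wrapped RayCluster is up
cluster.down()        # delete the AppWrapper, removing the RayCluster with it
```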
@@ -152,18 +132,6 @@ def create_app_wrapper(self): self.validate_image_config() # Before attempting to create the cluster AW, let's evaluate the ClusterConfig - if self.config.dispatch_priority: - if not self.config.mcad: - raise ValueError( - "Invalid Cluster Configuration, cannot have dispatch priority without MCAD" - ) - priority_val = self.evaluate_dispatch_priority() - if priority_val == None: - raise ValueError( - "Invalid Cluster Configuration, AppWrapper not generated" - ) - else: - priority_val = None name = self.config.name namespace = self.config.namespace @@ -178,12 +146,10 @@ def create_app_wrapper(self): workers = self.config.num_workers template = self.config.template image = self.config.image - instascale = self.config.instascale - mcad = self.config.mcad + appwrapper = self.config.appwrapper instance_types = self.config.machine_types env = self.config.envs image_pull_secrets = self.config.image_pull_secrets - dispatch_priority = self.config.dispatch_priority write_to_file = self.config.write_to_file verify_tls = self.config.verify_tls local_queue = self.config.local_queue @@ -202,13 +168,10 @@ def create_app_wrapper(self): workers=workers, template=template, image=image, - instascale=instascale, - mcad=mcad, + appwrapper=appwrapper, instance_types=instance_types, env=env, image_pull_secrets=image_pull_secrets, - dispatch_priority=dispatch_priority, - priority_val=priority_val, write_to_file=write_to_file, verify_tls=verify_tls, local_queue=local_queue, @@ -230,13 +193,13 @@ def up(self): try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) - if self.config.mcad: + if self.config.appwrapper: if self.config.write_to_file: with open(self.app_wrapper_yaml) as f: aw = yaml.load(f, Loader=yaml.FullLoader) api_instance.create_namespaced_custom_object( group="workload.codeflare.dev", - version="v1beta1", + version="v1beta2", namespace=namespace, plural="appwrappers", body=aw, @@ -245,7 +208,7 @@ def up(self): aw = yaml.safe_load(self.app_wrapper_yaml) api_instance.create_namespaced_custom_object( group="workload.codeflare.dev", - version="v1beta1", + version="v1beta2", namespace=namespace, plural="appwrappers", body=aw, @@ -284,10 +247,10 @@ def down(self): try: config_check() api_instance = client.CustomObjectsApi(api_config_handler()) - if self.config.mcad: + if self.config.appwrapper: api_instance.delete_namespaced_custom_object( group="workload.codeflare.dev", - version="v1beta1", + version="v1beta2", namespace=namespace, plural="appwrappers", name=self.app_wrapper_name, @@ -306,30 +269,28 @@ def status( """ ready = False status = CodeFlareClusterStatus.UNKNOWN - if self.config.mcad: + if self.config.appwrapper: # check the app wrapper status appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) if appwrapper: if appwrapper.status in [ - AppWrapperStatus.RUNNING, - AppWrapperStatus.COMPLETED, - AppWrapperStatus.RUNNING_HOLD_COMPLETION, + AppWrapperStatus.RESUMING, + AppWrapperStatus.RESETTING, ]: ready = False status = CodeFlareClusterStatus.STARTING elif appwrapper.status in [ AppWrapperStatus.FAILED, - AppWrapperStatus.DELETED, ]: ready = False status = CodeFlareClusterStatus.FAILED # should deleted be separate return status, ready # exit early, no need to check ray status elif appwrapper.status in [ - AppWrapperStatus.PENDING, - AppWrapperStatus.QUEUEING, + AppWrapperStatus.SUSPENDED, + AppWrapperStatus.SUSPENDING, ]: ready = False - if appwrapper.status == AppWrapperStatus.PENDING: + if appwrapper.status == 
AppWrapperStatus.SUSPENDED: status = CodeFlareClusterStatus.QUEUED else: status = CodeFlareClusterStatus.QUEUEING @@ -501,7 +462,7 @@ def job_logs(self, job_id: str) -> str: def from_k8_cluster_object( rc, - mcad=True, + appwrapper=True, write_to_file=False, verify_tls=True, ): @@ -534,11 +495,10 @@ def from_k8_cluster_object( "resources" ]["limits"]["nvidia.com/gpu"] ), - instascale=True if machine_types else False, image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["image"], - mcad=mcad, + appwrapper=appwrapper, write_to_file=write_to_file, verify_tls=verify_tls, local_queue=rc["metadata"] @@ -597,15 +557,15 @@ def list_all_clusters(namespace: str, print_to_console: bool = True): return clusters -def list_all_queued(namespace: str, print_to_console: bool = True, mcad: bool = False): +def list_all_queued( + namespace: str, print_to_console: bool = True, appwrapper: bool = False +): """ Returns (and prints by default) a list of all currently queued-up Ray Clusters in a given namespace. """ - if mcad: - resources = _get_app_wrappers( - namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING] - ) + if appwrapper: + resources = _get_app_wrappers(namespace, filter=[AppWrapperStatus.SUSPENDED]) if print_to_console: pretty_print.print_app_wrappers_status(resources) else: @@ -675,10 +635,10 @@ def get_cluster( for rc in rcs["items"]: if rc["metadata"]["name"] == cluster_name: - mcad = _check_aw_exists(cluster_name, namespace) + appwrapper = _check_aw_exists(cluster_name, namespace) return Cluster.from_k8_cluster_object( rc, - mcad=mcad, + appwrapper=appwrapper, write_to_file=write_to_file, verify_tls=verify_tls, ) @@ -721,7 +681,7 @@ def _check_aw_exists(name: str, namespace: str) -> bool: api_instance = client.CustomObjectsApi(api_config_handler()) aws = api_instance.list_namespaced_custom_object( group="workload.codeflare.dev", - version="v1beta1", + version="v1beta2", namespace=namespace, plural="appwrappers", ) @@ -781,7 +741,7 @@ def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: api_instance = client.CustomObjectsApi(api_config_handler()) aws = api_instance.list_namespaced_custom_object( group="workload.codeflare.dev", - version="v1beta1", + version="v1beta2", namespace=namespace, plural="appwrappers", ) @@ -851,7 +811,7 @@ def _get_app_wrappers( api_instance = client.CustomObjectsApi(api_config_handler()) aws = api_instance.list_namespaced_custom_object( group="workload.codeflare.dev", - version="v1beta1", + version="v1beta2", namespace=namespace, plural="appwrappers", ) @@ -945,18 +905,14 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: def _map_to_app_wrapper(aw) -> AppWrapper: - if "status" in aw and "canrun" in aw["status"]: + if "status" in aw: return AppWrapper( name=aw["metadata"]["name"], - status=AppWrapperStatus(aw["status"]["state"].lower()), - can_run=aw["status"]["canrun"], - job_state=aw["status"]["queuejobstate"], + status=AppWrapperStatus(aw["status"]["phase"].lower()), ) return AppWrapper( name=aw["metadata"]["name"], - status=AppWrapperStatus("queueing"), - can_run=False, - job_state="Still adding to queue", + status=AppWrapperStatus("suspended"), ) diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index f8010ea92..9e069c376 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -46,12 +46,10 @@ class ClusterConfiguration: max_memory: typing.Union[int, str] = 2 num_gpus: int = 0 template: str = 
f"{dir}/templates/base-template.yaml" - instascale: bool = False - mcad: bool = False + appwrapper: bool = False envs: dict = field(default_factory=dict) image: str = "" image_pull_secrets: list = field(default_factory=list) - dispatch_priority: str = None write_to_file: bool = False verify_tls: bool = True labels: dict = field(default_factory=dict) diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index e2dcb6522..2547de254 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -37,16 +37,17 @@ class RayClusterStatus(Enum): class AppWrapperStatus(Enum): """ - Defines the possible reportable states of an AppWrapper. + Defines the possible reportable phases of an AppWrapper. """ - QUEUEING = "queueing" - PENDING = "pending" + SUSPENDED = "suspended" + RESUMING = "resuming" RUNNING = "running" + RESETTING = "resetting" + SUSPENDING = "suspending" + SUCCEEDED = "succeeded" FAILED = "failed" - DELETED = "deleted" - COMPLETED = "completed" - RUNNING_HOLD_COMPLETION = "runningholdcompletion" + TERMINATING = "terminating" class CodeFlareClusterStatus(Enum): @@ -91,5 +92,3 @@ class AppWrapper: name: str status: AppWrapperStatus - can_run: bool - job_state: str diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 356e3494e..b6a70b2b6 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -1,254 +1,207 @@ -apiVersion: workload.codeflare.dev/v1beta1 +apiVersion: workload.codeflare.dev/v1beta2 kind: AppWrapper metadata: name: aw-kuberay namespace: default - #new addition - labels: - orderedinstance: "m4.xlarge_g4dn.xlarge" spec: - priority: 9 - resources: - Items: [] - GenericItems: - - replicas: 1 - #new addition - custompodresources: - - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - replicas: 3 - requests: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - limits: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - generictemplate: - # This config demonstrates KubeRay's Ray autoscaler integration. - # The resource requests and limits in this config are too small for production! - # For an example with more realistic resource configuration, see - # ray-cluster.autoscaler.large.yaml. - apiVersion: ray.io/v1 - kind: RayCluster - metadata: - labels: - workload.codeflare.dev/appwrapper: "aw-kuberay" - controller-tools.k8s.io: "1.0" - # A unique identifier for the head node and workers of this cluster. - name: kuberay-cluster - # finalizers: - # - kubernetes - spec: - # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.7.0' - # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. - # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 - # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: false - # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # upscalingMode is "Default" or "Aggressive." - # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. - # Default: Upscaling is not rate-limited. 
- # Aggressive: An alias for Default; upscaling is not rate-limited. - upscalingMode: Default - # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. - idleTimeoutSeconds: 60 - # image optionally overrides the autoscaler's container image. - # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. - ## image: "my-repo/my-custom-autoscaler-image:tag" - # imagePullPolicy optionally overrides the autoscaler container's image pull policy. - imagePullPolicy: Always - # resources specifies optional resource request and limit overrides for the autoscaler container. - # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" - ######################headGroupSpec################################# - # head group template and specs, (perhaps 'group' is not needed in the name) - headGroupSpec: - # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' - serviceType: ClusterIP - enableIngress: false - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block ... - rayStartParams: - # Flag "no-monitor" will be automatically set when autoscaling is enabled. - dashboard-host: '0.0.0.0' - block: 'true' - # num-cpus: '1' # can be auto-completed from the limits - # Use `resources` to optionally specify custom resource annotations for the Ray node. - # The value of `resources` is a string-integer mapping. 
- # Currently, `resources` must be provided in the specific format demonstrated below: - # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' - num-gpus: '0' - #pod template - template: - spec: - #new addition - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay - operator: In - values: - - "aw-kuberay" - containers: - # The Ray head pod - - name: ray-head - image: quay.io/project-codeflare/ray:latest-py39-cu118 - imagePullPolicy: Always - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: 2 - memory: "8G" - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: "8G" - nvidia.com/gpu: 0 - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - volumes: - - name: odh-trusted-ca-cert - configMap: - name: odh-trusted-ca-bundle - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - optional: true - - name: odh-ca-cert - configMap: - name: odh-trusted-ca-bundle - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - optional: true - workerGroupSpecs: - # the pod replicas in this group typed worker - - replicas: 3 - minReplicas: 3 - maxReplicas: 3 - # logical group name, for this called small-group, also can be functional - groupName: small-group - # if worker pods need to be added, we can simply increment the replicas - # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list - # the operator will remove pods from the list until the number of replicas is satisfied - # when a pod is confirmed to be deleted, its name will be removed from the list below - #scaleStrategy: - # workersToDelete: - # - raycluster-complete-worker-small-group-bdtwh - # - raycluster-complete-worker-small-group-hv457 - # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block ... - rayStartParams: - block: 'true' - num-gpus: 1 - #pod template - template: - metadata: - labels: - key: value - # annotations for pod - annotations: - key: value - # finalizers: - # - kubernetes - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay - operator: In - values: - - "aw-kuberay" - containers: - - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: quay.io/project-codeflare/ray:latest-py39-cu118 - # environment variables to set in the container.Optional. 
- # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - volumes: - - name: odh-trusted-ca-cert - configMap: - name: odh-trusted-ca-bundle - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - optional: true - - name: odh-ca-cert - configMap: - name: odh-trusted-ca-bundle - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - optional: true + components: + - template: + # This config demonstrates KubeRay's Ray autoscaler integration. + # The resource requests and limits in this config are too small for production! + # For an example with more realistic resource configuration, see + # ray-cluster.autoscaler.large.yaml. + apiVersion: ray.io/v1 + kind: RayCluster + metadata: + labels: + controller-tools.k8s.io: "1.0" + # A unique identifier for the head node and workers of this cluster. + name: kuberay-cluster + # finalizers: + # - kubernetes + spec: + # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. + rayVersion: '2.7.0' + # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. + # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 + # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. + enableInTreeAutoscaling: false + # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. + # The example configuration shown below below represents the DEFAULT values. + # (You may delete autoscalerOptions if the defaults are suitable.) + autoscalerOptions: + # upscalingMode is "Default" or "Aggressive." + # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. + # Default: Upscaling is not rate-limited. + # Aggressive: An alias for Default; upscaling is not rate-limited. + upscalingMode: Default + # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. + idleTimeoutSeconds: 60 + # image optionally overrides the autoscaler's container image. + # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as + # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. + ## image: "my-repo/my-custom-autoscaler-image:tag" + # imagePullPolicy optionally overrides the autoscaler container's image pull policy. + imagePullPolicy: Always + # resources specifies optional resource request and limit overrides for the autoscaler container. + # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. 
+ resources: + limits: + cpu: "500m" + memory: "512Mi" + requests: + cpu: "500m" + memory: "512Mi" + ######################headGroupSpec################################# + # head group template and specs, (perhaps 'group' is not needed in the name) + headGroupSpec: + # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' + serviceType: ClusterIP + enableIngress: false + # logical group name, for this called head-group, also can be functional + # pod type head or worker + # rayNodeType: head # Not needed since it is under the headgroup + # the following params are used to complete the ray start: ray start --head --block ... + rayStartParams: + # Flag "no-monitor" will be automatically set when autoscaling is enabled. + dashboard-host: '0.0.0.0' + block: 'true' + # num-cpus: '1' # can be auto-completed from the limits + # Use `resources` to optionally specify custom resource annotations for the Ray node. + # The value of `resources` is a string-integer mapping. + # Currently, `resources` must be provided in the specific format demonstrated below: + # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' + num-gpus: '0' + #pod template + template: + spec: + containers: + # The Ray head pod + - name: ray-head + image: quay.io/project-codeflare/ray:latest-py39-cu118 + imagePullPolicy: Always + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: + cpu: 2 + memory: "8G" + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: "8G" + nvidia.com/gpu: 0 + volumeMounts: + - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + volumes: + - name: odh-trusted-ca-cert + configMap: + name: odh-trusted-ca-bundle + items: + - key: ca-bundle.crt + path: odh-trusted-ca-bundle.crt + optional: true + - name: odh-ca-cert + configMap: + name: odh-trusted-ca-bundle + items: + - key: odh-ca-bundle.crt + path: odh-ca-bundle.crt + optional: true + workerGroupSpecs: + # the pod replicas in this group typed worker + - replicas: 3 + minReplicas: 3 + maxReplicas: 3 + # logical group name, for this called small-group, also can be functional + groupName: small-group + # if worker pods need to be added, we can simply increment the replicas + # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list + # the operator will remove pods from the list until the number of replicas is satisfied + # when a pod is confirmed to be deleted, its name will be removed from the list below + #scaleStrategy: + # workersToDelete: + # - raycluster-complete-worker-small-group-bdtwh + # - raycluster-complete-worker-small-group-hv457 + # - raycluster-complete-worker-small-group-k8tj7 + # the following params are used to complete the ray start: ray start --block ... 
+ rayStartParams: + block: 'true' + num-gpus: 1 + #pod template + template: + metadata: + labels: + key: value + # annotations for pod + annotations: + key: value + # finalizers: + # - kubernetes + spec: + containers: + - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' + image: quay.io/project-codeflare/ray:latest-py39-cu118 + # environment variables to set in the container.Optional. + # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ + lifecycle: + preStop: + exec: + command: ["/bin/sh","-c","ray stop"] + resources: + limits: + cpu: "2" + memory: "12G" + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: "12G" + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + volumes: + - name: odh-trusted-ca-cert + configMap: + name: odh-trusted-ca-bundle + items: + - key: ca-bundle.crt + path: odh-trusted-ca-bundle.crt + optional: true + - name: odh-ca-cert + configMap: + name: odh-trusted-ca-bundle + items: + - key: odh-ca-bundle.crt + path: odh-ca-bundle.crt + optional: true diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 2ea6dd78d..dcd4a42c4 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -81,109 +81,11 @@ def update_names(yaml, item, appwrapper_name, cluster_name, namespace): metadata = yaml.get("metadata") metadata["name"] = appwrapper_name metadata["namespace"] = namespace - lower_meta = item.get("generictemplate", {}).get("metadata") - lower_meta["labels"]["workload.codeflare.dev/appwrapper"] = appwrapper_name + lower_meta = item.get("template", {}).get("metadata") lower_meta["name"] = cluster_name lower_meta["namespace"] = namespace -def update_labels(yaml, instascale, instance_types): - metadata = yaml.get("metadata") - if instascale: - if not len(instance_types) > 0: - sys.exit( - "If instascale is set to true, must provide at least one instance type" - ) - type_str = "" - for type in instance_types: - type_str += type + "_" - type_str = type_str[:-1] - metadata["labels"]["orderedinstance"] = type_str - else: - metadata.pop("labels") - - -def update_priority(yaml, item, dispatch_priority, priority_val): - spec = yaml.get("spec") - if dispatch_priority is not None: - if priority_val: - spec["priority"] = priority_val - else: - raise ValueError( - "AW generation error: Priority value is None, while dispatch_priority is defined" - ) - head = item.get("generictemplate").get("spec").get("headGroupSpec") - worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] - head["template"]["spec"]["priorityClassName"] = dispatch_priority - worker["template"]["spec"]["priorityClassName"] = dispatch_priority - else: - spec.pop("priority") - - -def update_custompodresources( - item, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - head_cpus, - head_memory, - head_gpus, -): - if "custompodresources" in item.keys(): - custompodresources = 
item.get("custompodresources") - for i in range(len(custompodresources)): - resource = custompodresources[i] - if i == 0: - # Leave head node resources as template default - resource["requests"]["cpu"] = head_cpus - resource["limits"]["cpu"] = head_cpus - resource["requests"]["memory"] = head_memory - resource["limits"]["memory"] = head_memory - resource["requests"]["nvidia.com/gpu"] = head_gpus - resource["limits"]["nvidia.com/gpu"] = head_gpus - - else: - for k, v in resource.items(): - if k == "replicas" and i == 1: - resource[k] = workers - if k == "requests" or k == "limits": - for spec, _ in v.items(): - if spec == "cpu": - if k == "limits": - resource[k][spec] = max_cpu - else: - resource[k][spec] = min_cpu - if spec == "memory": - if k == "limits": - resource[k][spec] = max_memory - else: - resource[k][spec] = min_memory - if spec == "nvidia.com/gpu": - if i == 0: - resource[k][spec] = 0 - else: - resource[k][spec] = gpu - else: - sys.exit("Error: malformed template") - - -def update_affinity(spec, appwrapper_name, instascale): - if instascale: - node_selector_terms = ( - spec.get("affinity") - .get("nodeAffinity") - .get("requiredDuringSchedulingIgnoredDuringExecution") - .get("nodeSelectorTerms") - ) - node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name - node_selector_terms[0]["matchExpressions"][0]["key"] = appwrapper_name - else: - spec.pop("affinity") - - def update_image(spec, image): containers = spec.get("containers") for container in containers: @@ -232,18 +134,17 @@ def update_nodes( gpu, workers, image, - instascale, env, image_pull_secrets, head_cpus, head_memory, head_gpus, ): - if "generictemplate" in item.keys(): - head = item.get("generictemplate").get("spec").get("headGroupSpec") + if "template" in item.keys(): + head = item.get("template").get("spec").get("headGroupSpec") head["rayStartParams"]["num-gpus"] = str(int(head_gpus)) - worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0] + worker = item.get("template").get("spec").get("workerGroupSpecs")[0] # Head counts as first worker worker["replicas"] = workers worker["minReplicas"] = workers @@ -253,7 +154,6 @@ def update_nodes( for comp in [head, worker]: spec = comp.get("template").get("spec") - update_affinity(spec, appwrapper_name, instascale) update_image_pull_secrets(spec, image_pull_secrets) update_image(spec, image) update_env(spec, env) @@ -328,74 +228,52 @@ def local_queue_exists(namespace: str, local_queue_name: str): return False +def add_queue_label(item: dict, namespace: str, local_queue: Optional[str]): + lq_name = local_queue or get_default_kueue_name(namespace) + if not local_queue_exists(namespace, lq_name): + raise ValueError( + "local_queue provided does not exist or is not in this namespace. 
Please provide the correct local_queue name in Cluster Configuration" + ) + if not "labels" in item["metadata"]: + item["metadata"]["labels"] = {} + item["metadata"]["labels"].update({"kueue.x-k8s.io/queue-name": lq_name}) + + +def augment_labels(item: dict, labels: dict): + if "template" in item: + if not "labels" in item["template"]["metadata"]: + item["template"]["metadata"]["labels"] = {} + item["template"]["metadata"]["labels"].update(labels) + + def write_components( user_yaml: dict, output_file_name: str, - namespace: str, - local_queue: Optional[str], - labels: dict, ): # Create the directory if it doesn't exist directory_path = os.path.dirname(output_file_name) if not os.path.exists(directory_path): os.makedirs(directory_path) - components = user_yaml.get("spec", "resources")["resources"].get("GenericItems") + components = user_yaml.get("spec", "resources").get("components") open(output_file_name, "w").close() - lq_name = local_queue or get_default_kueue_name(namespace) - cluster_labels = labels - if not local_queue_exists(namespace, lq_name): - raise ValueError( - "local_queue provided does not exist or is not in this namespace. Please provide the correct local_queue name in Cluster Configuration" - ) with open(output_file_name, "a") as outfile: for component in components: - if "generictemplate" in component: - if ( - "workload.codeflare.dev/appwrapper" - in component["generictemplate"]["metadata"]["labels"] - ): - del component["generictemplate"]["metadata"]["labels"][ - "workload.codeflare.dev/appwrapper" - ] - labels = component["generictemplate"]["metadata"]["labels"] - labels.update({"kueue.x-k8s.io/queue-name": lq_name}) - labels.update(cluster_labels) + if "template" in component: outfile.write("---\n") - yaml.dump( - component["generictemplate"], outfile, default_flow_style=False - ) + yaml.dump(component["template"], outfile, default_flow_style=False) print(f"Written to: {output_file_name}") def load_components( user_yaml: dict, name: str, - namespace: str, - local_queue: Optional[str], - labels: dict, ): component_list = [] - components = user_yaml.get("spec", "resources")["resources"].get("GenericItems") - lq_name = local_queue or get_default_kueue_name(namespace) - cluster_labels = labels - if not local_queue_exists(namespace, lq_name): - raise ValueError( - "local_queue provided does not exist or is not in this namespace. 
Please provide the correct local_queue name in Cluster Configuration" - ) + components = user_yaml.get("spec", "resources").get("components") for component in components: - if "generictemplate" in component: - if ( - "workload.codeflare.dev/appwrapper" - in component["generictemplate"]["metadata"]["labels"] - ): - del component["generictemplate"]["metadata"]["labels"][ - "workload.codeflare.dev/appwrapper" - ] - labels = component["generictemplate"]["metadata"]["labels"] - labels.update({"kueue.x-k8s.io/queue-name": lq_name}) - labels.update(cluster_labels) - component_list.append(component["generictemplate"]) + if "template" in component: + component_list.append(component["template"]) resources = "---\n" + "---\n".join( [yaml.dump(component) for component in component_list] @@ -425,13 +303,10 @@ def generate_appwrapper( workers: int, template: str, image: str, - instascale: bool, - mcad: bool, + appwrapper: bool, instance_types: list, env, image_pull_secrets: list, - dispatch_priority: str, - priority_val: int, write_to_file: bool, verify_tls: bool, local_queue: Optional[str], @@ -440,7 +315,7 @@ def generate_appwrapper( user_yaml = read_template(template) appwrapper_name, cluster_name = gen_names(name) resources = user_yaml.get("spec", "resources") - item = resources["resources"].get("GenericItems")[0] + item = resources.get("components")[0] update_names( user_yaml, item, @@ -448,20 +323,6 @@ def generate_appwrapper( cluster_name, namespace, ) - update_labels(user_yaml, instascale, instance_types) - update_priority(user_yaml, item, dispatch_priority, priority_val) - update_custompodresources( - item, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - head_cpus, - head_memory, - head_gpus, - ) update_nodes( item, appwrapper_name, @@ -472,7 +333,6 @@ def generate_appwrapper( gpu, workers, image, - instascale, env, image_pull_secrets, head_cpus, @@ -480,18 +340,25 @@ def generate_appwrapper( head_gpus, ) + augment_labels(item, labels) + + if appwrapper: + add_queue_label(user_yaml, namespace, local_queue) + else: + add_queue_label(item["template"], namespace, local_queue) + directory_path = os.path.expanduser("~/.codeflare/resources/") outfile = os.path.join(directory_path, appwrapper_name + ".yaml") if write_to_file: - if mcad: + if appwrapper: write_user_appwrapper(user_yaml, outfile) else: - write_components(user_yaml, outfile, namespace, local_queue, labels) + write_components(user_yaml, outfile) return outfile else: - if mcad: + if appwrapper: user_yaml = load_appwrapper(user_yaml, name) else: - user_yaml = load_components(user_yaml, name, namespace, local_queue, labels) + user_yaml = load_components(user_yaml, name) return user_yaml diff --git a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py new file mode 100644 index 000000000..2aa5da16d --- /dev/null +++ b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py @@ -0,0 +1,106 @@ +import requests + +from time import sleep + +from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication +from codeflare_sdk.job import RayJobClient + +import pytest + +from support import * + +# This test creates an AppWrapper containing a Ray Cluster and covers the Ray Job submission functionality on Kind Cluster + + +@pytest.mark.kind +class TestRayClusterSDKAppWrapperKind: + def setup_method(self): + initialize_kubernetes_client(self) + + def teardown_method(self): + delete_namespace(self) + + def test_mnist_ray_cluster_sdk_kind(self): + self.setup_method() + 
create_namespace(self) + create_kueue_resources(self) + self.run_mnist_raycluster_sdk_kind() + + def run_mnist_raycluster_sdk_kind(self): + ray_image = get_ray_image() + + cluster = Cluster( + ClusterConfiguration( + name="mnist", + namespace=self.namespace, + num_workers=1, + head_cpus="500m", + head_memory=2, + min_cpus="500m", + max_cpus=1, + min_memory=1, + max_memory=2, + num_gpus=0, + image=ray_image, + write_to_file=True, + verify_tls=False, + appwrapper=True, + ) + ) + + cluster.up() + + cluster.status() + + cluster.wait_ready() + + cluster.status() + + cluster.details() + + self.assert_jobsubmit_withoutlogin_kind(cluster) + + # Assertions + + def assert_jobsubmit_withoutlogin_kind(self, cluster): + ray_dashboard = cluster.cluster_dashboard_uri() + client = RayJobClient(address=ray_dashboard, verify=False) + + submission_id = client.submit_job( + entrypoint="python mnist.py", + runtime_env={ + "working_dir": "./tests/e2e/", + "pip": "./tests/e2e/mnist_pip_requirements.txt", + }, + ) + print(f"Submitted job with ID: {submission_id}") + done = False + time = 0 + timeout = 900 + while not done: + status = client.get_job_status(submission_id) + if status.is_terminal(): + break + if not done: + print(status) + if timeout and time >= timeout: + raise TimeoutError(f"job has timed out after waiting {timeout}s") + sleep(5) + time += 5 + + logs = client.get_job_logs(submission_id) + print(logs) + + self.assert_job_completion(status) + + client.delete_job(submission_id) + + cluster.down() + + def assert_job_completion(self, status): + if status == "SUCCEEDED": + print(f"Job has completed: '{status}'") + assert True + else: + print(f"Job has completed: '{status}'") + assert False diff --git a/tests/e2e/start_ray_cluster.py b/tests/e2e/start_ray_cluster.py index 8bb185808..957d0c25e 100644 --- a/tests/e2e/start_ray_cluster.py +++ b/tests/e2e/start_ray_cluster.py @@ -20,9 +20,8 @@ min_memory=1, max_memory=2, num_gpus=0, - instascale=False, image=ray_image, - mcad=True, + appwrapper=True, ) ) diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml index 6e969e01b..3c5bf076d 100644 --- a/tests/test-case-bad.yaml +++ b/tests/test-case-bad.yaml @@ -1,4 +1,4 @@ -apiVersion: workload.codeflare.dev/v1beta1 +apiVersion: workload.codeflare.dev/v1beta2 kind: AppsWrapper metadata: labels: @@ -6,162 +6,105 @@ metadata: nam: unit-test-cluster namspace: ns spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 4 - memory: 6G - nvidia.com/gpu: 7 - replicas: 2 - requests: - cpu: 3 - memory: 5G - nvidia.com/gpu: 7 - generictemplate: - apiVersion: ray.io/v1 - kind: RayCluster - metadata: - labels: - workload.codeflare.dev/appwrapper: unit-test-cluster - controller-tools.k8s.io: '1.0' - name: unit-test-cluster - namespace: ns - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: unit-test-cluster - operator: In - values: - - unit-test-cluster - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - 
fieldPath: status.podIP - image: quay.io/project-codeflare/ray:latest-py39-cu118 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - rayVersion: 1.12.0 - workerGroupSpecs: - - groupName: small-group-unit-test-cluster - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '7' - replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: unit-test-cluster - operator: In - values: - - unit-test-cluster - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - image: quay.io/project-codeflare/ray:latest-py39-cu118 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 4 - memory: 6G - nvidia.com/gpu: 7 - requests: - cpu: 3 - memory: 5G - nvidia.com/gpu: 7 - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: unit-test-cluster-head-svc - name: ray-dashboard-unit-test-cluster - namespace: ns - spec: - port: - targetPort: dashboard - to: - kind: Service - name: unit-test-cluster-head-svc - replicas: 1 - Items: [] + components: + - template: + apiVersion: ray.io/v1 + kind: RayCluster + metadata: + labels: + controller-tools.k8s.io: '1.0' + name: unit-test-cluster + namespace: ns + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/project-codeflare/ray:latest-py39-cu118 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + rayVersion: 1.12.0 + workerGroupSpecs: + - groupName: small-group-unit-test-cluster + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '7' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: quay.io/project-codeflare/ray:latest-py39-cu118 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index 7fcf1fdc4..367703d67 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -31,15 +31,6 @@ spec: serviceType: ClusterIP template: spec: - 
affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: unit-test-cluster-ray - operator: In - values: - - unit-test-cluster-ray containers: - image: quay.io/project-codeflare/ray:latest-py39-cu118 imagePullPolicy: Always @@ -113,15 +104,6 @@ spec: labels: key: value spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: unit-test-cluster-ray - operator: In - values: - - unit-test-cluster-ray containers: - image: quay.io/project-codeflare/ray:latest-py39-cu118 lifecycle: diff --git a/tests/test-case-prio.yaml b/tests/test-case-prio.yaml deleted file mode 100644 index a4d6e68f2..000000000 --- a/tests/test-case-prio.yaml +++ /dev/null @@ -1,205 +0,0 @@ -apiVersion: workload.codeflare.dev/v1beta1 -kind: AppWrapper -metadata: - labels: - orderedinstance: cpu.small_gpu.large - name: prio-test-cluster - namespace: ns -spec: - priority: 10 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 4 - memory: 6G - nvidia.com/gpu: 7 - replicas: 2 - requests: - cpu: 3 - memory: 5G - nvidia.com/gpu: 7 - generictemplate: - apiVersion: ray.io/v1 - kind: RayCluster - metadata: - labels: - controller-tools.k8s.io: '1.0' - workload.codeflare.dev/appwrapper: prio-test-cluster - name: prio-test-cluster - namespace: ns - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - enableIngress: false - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: prio-test-cluster - operator: In - values: - - prio-test-cluster - containers: - - image: quay.io/project-codeflare/ray:latest-py39-cu118 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - imagePullSecrets: - - name: unit-test-pull-secret - priorityClassName: default - volumes: - - configMap: - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true - name: odh-trusted-ca-cert - - configMap: - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true - name: odh-ca-cert - rayVersion: 2.7.0 - workerGroupSpecs: - - groupName: small-group-prio-test-cluster - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '7' - 
replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: prio-test-cluster - operator: In - values: - - prio-test-cluster - containers: - - image: quay.io/project-codeflare/ray:latest-py39-cu118 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 4 - memory: 6G - nvidia.com/gpu: 7 - requests: - cpu: 3 - memory: 5G - nvidia.com/gpu: 7 - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - imagePullSecrets: - - name: unit-test-pull-secret - priorityClassName: default - volumes: - - configMap: - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true - name: odh-trusted-ca-cert - - configMap: - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true - name: odh-ca-cert - replicas: 1 - Items: [] diff --git a/tests/test-case.yaml b/tests/test-case.yaml index b97d12a49..98166b372 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -1,202 +1,161 @@ -apiVersion: workload.codeflare.dev/v1beta1 +apiVersion: workload.codeflare.dev/v1beta2 kind: AppWrapper metadata: labels: - orderedinstance: cpu.small_gpu.large + kueue.x-k8s.io/queue-name: local-queue-default name: unit-test-cluster namespace: ns spec: - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 4 - memory: 6G - nvidia.com/gpu: 7 - replicas: 2 - requests: - cpu: 3 - memory: 5G - nvidia.com/gpu: 7 - generictemplate: - apiVersion: ray.io/v1 - kind: RayCluster - metadata: - labels: - controller-tools.k8s.io: '1.0' - workload.codeflare.dev/appwrapper: unit-test-cluster - name: unit-test-cluster - namespace: ns - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - enableIngress: false - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: unit-test-cluster - operator: In - values: - - unit-test-cluster - containers: - - image: quay.io/project-codeflare/ray:latest-py39-cu118 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt 
- - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - imagePullSecrets: - - name: unit-test-pull-secret - volumes: - - configMap: - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + components: + - template: + apiVersion: ray.io/v1 + kind: RayCluster + metadata: + labels: + controller-tools.k8s.io: '1.0' + name: unit-test-cluster + namespace: ns + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + enableIngress: false + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - image: quay.io/project-codeflare/ray:latest-py39-cu118 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + volumeMounts: + - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert - - configMap: - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert - rayVersion: 2.7.0 - workerGroupSpecs: - - groupName: small-group-unit-test-cluster - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '7' - replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: unit-test-cluster - operator: In - values: - - unit-test-cluster - containers: - - image: quay.io/project-codeflare/ray:latest-py39-cu118 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 4 - memory: 6G - nvidia.com/gpu: 7 - requests: - cpu: 3 - memory: 5G - nvidia.com/gpu: 7 - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - imagePullSecrets: - - name: unit-test-pull-secret - volumes: - - configMap: - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + subPath: odh-ca-bundle.crt + imagePullSecrets: + - name: unit-test-pull-secret + volumes: + 
- configMap: + items: + - key: ca-bundle.crt + path: odh-trusted-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-trusted-ca-cert + - configMap: + items: + - key: odh-ca-bundle.crt + path: odh-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-ca-cert + rayVersion: 2.7.0 + workerGroupSpecs: + - groupName: small-group-unit-test-cluster + maxReplicas: 2 + minReplicas: 2 + rayStartParams: + block: 'true' + num-gpus: '7' + replicas: 2 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - image: quay.io/project-codeflare/ray:latest-py39-cu118 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 4 + memory: 6G + nvidia.com/gpu: 7 + requests: + cpu: 3 + memory: 5G + nvidia.com/gpu: 7 + volumeMounts: + - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert - - configMap: - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert - replicas: 1 - Items: [] + subPath: odh-ca-bundle.crt + imagePullSecrets: + - name: unit-test-pull-secret + volumes: + - configMap: + items: + - key: ca-bundle.crt + path: odh-trusted-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-trusted-ca-cert + - configMap: + items: + - key: odh-ca-bundle.crt + path: odh-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-ca-cert diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml index c390f619b..f754c1763 100644 --- a/tests/test-default-appwrapper.yaml +++ b/tests/test-default-appwrapper.yaml @@ -1,180 +1,159 @@ -apiVersion: workload.codeflare.dev/v1beta1 +apiVersion: workload.codeflare.dev/v1beta2 kind: AppWrapper metadata: + labels: + kueue.x-k8s.io/queue-name: local-queue-default name: unit-test-default-cluster namespace: opendatahub spec: - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1 - kind: RayCluster - metadata: - labels: - controller-tools.k8s.io: '1.0' - workload.codeflare.dev/appwrapper: unit-test-default-cluster - name: unit-test-default-cluster - namespace: opendatahub - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - enableIngress: false - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - image: quay.io/project-codeflare/ray:latest-py39-cu118 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 
2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - imagePullSecrets: [] - volumes: - - configMap: - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + components: + - template: + apiVersion: ray.io/v1 + kind: RayCluster + metadata: + labels: + controller-tools.k8s.io: '1.0' + name: unit-test-default-cluster + namespace: opendatahub + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + enableIngress: false + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - image: quay.io/project-codeflare/ray:latest-py39-cu118 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + volumeMounts: + - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert - - configMap: - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert - rayVersion: 2.7.0 - workerGroupSpecs: - - groupName: small-group-unit-test-default-cluster - maxReplicas: 1 - minReplicas: 1 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 1 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - image: quay.io/project-codeflare/ray:latest-py39-cu118 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - volumeMounts: - - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt - name: odh-trusted-ca-cert - subPath: odh-trusted-ca-bundle.crt - - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - - mountPath: /etc/ssl/certs/odh-ca-bundle.crt - name: odh-ca-cert - subPath: odh-ca-bundle.crt - imagePullSecrets: [] - volumes: - - configMap: - items: - - key: ca-bundle.crt - path: odh-trusted-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + subPath: odh-ca-bundle.crt + imagePullSecrets: [] + volumes: + - configMap: + items: + - 
key: ca-bundle.crt + path: odh-trusted-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-trusted-ca-cert + - configMap: + items: + - key: odh-ca-bundle.crt + path: odh-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-ca-cert + rayVersion: 2.7.0 + workerGroupSpecs: + - groupName: small-group-unit-test-default-cluster + maxReplicas: 1 + minReplicas: 1 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 1 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - image: quay.io/project-codeflare/ray:latest-py39-cu118 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + volumeMounts: + - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert - - configMap: - items: - - key: odh-ca-bundle.crt - path: odh-ca-bundle.crt - name: odh-trusted-ca-bundle - optional: true + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-trusted-ca-bundle.crt + name: odh-trusted-ca-cert + subPath: odh-trusted-ca-bundle.crt + - mountPath: /etc/pki/tls/certs/odh-ca-bundle.crt + name: odh-ca-cert + subPath: odh-ca-bundle.crt + - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert - replicas: 1 - Items: [] + subPath: odh-ca-bundle.crt + imagePullSecrets: [] + volumes: + - configMap: + items: + - key: ca-bundle.crt + path: odh-trusted-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-trusted-ca-cert + - configMap: + items: + - key: odh-ca-bundle.crt + path: odh-ca-bundle.crt + name: odh-trusted-ca-bundle + optional: true + name: odh-ca-cert diff --git a/tests/unit_test.py b/tests/unit_test.py index 1fe139de5..32d730c4d 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -265,15 +265,18 @@ def test_config_creation(): assert config.num_gpus == 7 assert config.image == "quay.io/project-codeflare/ray:latest-py39-cu118" assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml" - assert config.instascale assert config.machine_types == ["cpu.small", "gpu.large"] assert config.image_pull_secrets == ["unit-test-pull-secret"] - assert config.dispatch_priority == None - assert config.mcad == True + assert config.appwrapper == True def test_cluster_creation(mocker): + # Create AppWrapper containing a Ray Cluster with no local queue specified mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) cluster = createClusterWithConfig(mocker) assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster.yaml" assert cluster.app_wrapper_name == "unit-test-cluster" @@ -345,8 +348,8 @@ def test_cluster_creation_no_mcad(mocker): config = createClusterConfig() config.name = "unit-test-cluster-ray" config.write_to_file = True - config.mcad = False config.labels = {"testlabel": "test", "testlabel2": "test"} + config.appwrapper = False cluster = Cluster(config) assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml" @@ -372,7 +375,7 @@ def test_cluster_creation_no_mcad_local_queue(mocker): ) config = createClusterConfig() config.name = "unit-test-cluster-ray" - config.mcad = False + config.appwrapper = False config.write_to_file = True config.local_queue = "local-queue-default" 
config.labels = {"testlabel": "test", "testlabel2": "test"} @@ -394,12 +397,11 @@ def test_cluster_creation_no_mcad_local_queue(mocker): min_memory=5, max_memory=6, num_gpus=7, - instascale=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, - mcad=False, + appwrapper=False, local_queue="local-queue-default", labels={"testlabel": "test", "testlabel2": "test"}, ) @@ -413,40 +415,20 @@ def test_cluster_creation_no_mcad_local_queue(mocker): ) -def test_cluster_creation_priority(mocker): - mocker.patch("kubernetes.client.ApisApi.get_api_versions") - mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") - mocker.patch( - "kubernetes.client.CustomObjectsApi.list_cluster_custom_object", - return_value={"items": [{"metadata": {"name": "default"}, "value": 10}]}, - ) - config = createClusterConfig() - config.name = "prio-test-cluster" - config.dispatch_priority = "default" - mocker.patch( - "kubernetes.client.CustomObjectsApi.get_cluster_custom_object", - return_value={"spec": {"domain": "apps.cluster.awsroute.org"}}, - ) - cluster = Cluster(config) - assert cluster.app_wrapper_yaml == f"{aw_dir}prio-test-cluster.yaml" - assert cluster.app_wrapper_name == "prio-test-cluster" - assert filecmp.cmp( - f"{aw_dir}prio-test-cluster.yaml", - f"{parent}/tests/test-case-prio.yaml", - shallow=True, - ) - - def test_default_cluster_creation(mocker): mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch( "codeflare_sdk.cluster.cluster.get_current_namespace", return_value="opendatahub", ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) default_config = ClusterConfiguration( name="unit-test-default-cluster", image="quay.io/project-codeflare/ray:latest-py39-cu118", - mcad=True, + appwrapper=True, ) cluster = Cluster(default_config) test_aw = yaml.load(cluster.app_wrapper_yaml, Loader=yaml.FullLoader) @@ -485,7 +467,7 @@ def arg_check_apply_effect(group, version, namespace, plural, body, *args): assert args == tuple() if plural == "appwrappers": assert group == "workload.codeflare.dev" - assert version == "v1beta1" + assert version == "v1beta2" with open(f"{aw_dir}unit-test-cluster.yaml") as f: aw = yaml.load(f, Loader=yaml.FullLoader) assert body == aw @@ -522,7 +504,7 @@ def arg_check_del_effect(group, version, namespace, plural, name, *args): assert args == tuple() if plural == "appwrappers": assert group == "workload.codeflare.dev" - assert version == "v1beta1" + assert version == "v1beta2" assert name == "unit-test-cluster" elif plural == "rayclusters": assert group == "ray.io" @@ -554,6 +536,10 @@ def test_cluster_up_down(mocker): "kubernetes.client.CustomObjectsApi.list_cluster_custom_object", return_value={"items": []}, ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) cluster = cluster = createClusterWithConfig(mocker) cluster.up() cluster.down() @@ -587,7 +573,7 @@ def test_cluster_up_down_no_mcad(mocker): ) config = createClusterConfig() config.name = "unit-test-cluster-ray" - config.mcad = False + config.appwrapper = False cluster = Cluster(config) cluster.up() cluster.down() @@ -616,7 +602,7 @@ def test_get_ingress_domain(mocker): def aw_status_fields(group, version, namespace, plural, *args): assert 
group == "workload.codeflare.dev" - assert version == "v1beta1" + assert version == "v1beta2" assert namespace == "test-ns" assert plural == "appwrappers" assert args == tuple() @@ -659,6 +645,10 @@ def test_cluster_uris(mocker): "codeflare_sdk.cluster.cluster._get_ingress_domain", return_value="apps.cluster.awsroute.org", ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) cluster = cluster = createClusterWithConfig(mocker) mocker.patch( "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", @@ -781,6 +771,10 @@ def ingress_retrieval( def test_ray_job_wrapping(mocker): mocker.patch("kubernetes.client.ApisApi.get_api_versions") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"), + ) cluster = cluster = createClusterWithConfig(mocker) cluster.config.image = "quay.io/project-codeflare/ray:latest-py39-cu118" mocker.patch( @@ -843,15 +837,11 @@ def test_print_no_cluster(capsys): def test_print_appwrappers(capsys): aw1 = AppWrapper( name="awtest1", - status=AppWrapperStatus.PENDING, - can_run=False, - job_state="queue-state", + status=AppWrapperStatus.SUSPENDED, ) aw2 = AppWrapper( name="awtest2", status=AppWrapperStatus.RUNNING, - can_run=False, - job_state="queue-state", ) try: print_app_wrappers_status([aw1, aw2]) @@ -859,18 +849,18 @@ def test_print_appwrappers(capsys): assert 1 == 0 captured = capsys.readouterr() assert captured.out == ( - "╭───────────────────────╮\n" - "│ 🚀 Cluster Queue │\n" - "│ Status 🚀 │\n" - "│ +---------+---------+ │\n" - "│ | Name | Status | │\n" - "│ +=========+=========+ │\n" - "│ | awtest1 | pending | │\n" - "│ | | | │\n" - "│ | awtest2 | running | │\n" - "│ | | | │\n" - "│ +---------+---------+ │\n" - "╰───────────────────────╯\n" + "╭─────────────────────────╮\n" + "│ 🚀 Cluster Queue │\n" + "│ Status 🚀 │\n" + "│ +---------+-----------+ │\n" + "│ | Name | Status | │\n" + "│ +=========+===========+ │\n" + "│ | awtest1 | suspended | │\n" + "│ | | | │\n" + "│ | awtest2 | running | │\n" + "│ | | | │\n" + "│ +---------+-----------+ │\n" + "╰─────────────────────────╯\n" ) @@ -898,13 +888,18 @@ def test_ray_details(mocker, capsys): "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", return_value="", ) + mocker.patch( + "codeflare_sdk.utils.generate_yaml.local_queue_exists", + return_value="true", + ) cf = Cluster( ClusterConfiguration( name="raytest2", namespace="ns", image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, - mcad=True, + appwrapper=True, + local_queue="local_default_queue", ) ) captured = capsys.readouterr() @@ -1023,118 +1018,16 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "creationTimestamp": "2024-03-05T09:55:37Z", "generation": 1, "labels": { - "appwrapper.mcad.ibm.com": "quicktest", "controller-tools.k8s.io": "1.0", "resourceName": "quicktest", - "workload.codeflare.dev/appwrapper": "quicktest", "orderedinstance": "m4.xlarge_g4dn.xlarge", "kueue.x-k8s.io/queue-name": "team-a-queue", }, - "managedFields": [ - { - "apiVersion": "ray.io/v1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:labels": { - ".": {}, - "f:appwrapper.mcad.ibm.com": {}, - "f:controller-tools.k8s.io": {}, - "f:resourceName": {}, - "f:workload.codeflare.dev/appwrapper": {}, - }, - "f:ownerReferences": { - ".": {}, - 
'k:{"uid":"a29b1a7a-0992-4860-a8d5-a689a751a3e8"}': {}, - }, - }, - "f:spec": { - ".": {}, - "f:autoscalerOptions": { - ".": {}, - "f:idleTimeoutSeconds": {}, - "f:imagePullPolicy": {}, - "f:resources": { - ".": {}, - "f:limits": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, - "f:requests": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, - }, - "f:upscalingMode": {}, - }, - "f:enableInTreeAutoscaling": {}, - "f:headGroupSpec": { - ".": {}, - "f:rayStartParams": { - ".": {}, - "f:block": {}, - "f:dashboard-host": {}, - "f:num-gpus": {}, - }, - "f:serviceType": {}, - "f:template": { - ".": {}, - "f:spec": { - ".": {}, - "f:affinity": { - ".": {}, - "f:nodeAffinity": { - ".": {}, - "f:requiredDuringSchedulingIgnoredDuringExecution": {}, - }, - }, - "f:imagePullSecrets": {}, - "f:volumes": {}, - }, - }, - }, - "f:rayVersion": {}, - "f:workerGroupSpecs": {}, - }, - }, - "manager": "codeflare-operator", - "operation": "Update", - "time": "2024-03-05T09:55:37Z", - }, - { - "apiVersion": "ray.io/v1alpha1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:status": { - ".": {}, - "f:desiredWorkerReplicas": {}, - "f:endpoints": { - ".": {}, - "f:client": {}, - "f:dashboard": {}, - "f:gcs": {}, - "f:metrics": {}, - }, - "f:head": {".": {}, "f:serviceIP": {}}, - "f:lastUpdateTime": {}, - "f:maxWorkerReplicas": {}, - "f:minWorkerReplicas": {}, - "f:observedGeneration": {}, - } - }, - "manager": "manager", - "operation": "Update", - "subresource": "status", - "time": "2024-03-05T09:55:37Z", - }, - ], "name": "quicktest", "namespace": "ns", "ownerReferences": [ { - "apiVersion": "workload.codeflare.dev/v1beta1", + "apiVersion": "workload.codeflare.dev/v1beta2", "blockOwnerDeletion": True, "controller": True, "kind": "AppWrapper", @@ -1166,23 +1059,6 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "template": { "metadata": {}, "spec": { - "affinity": { - "nodeAffinity": { - "requiredDuringSchedulingIgnoredDuringExecution": { - "nodeSelectorTerms": [ - { - "matchExpressions": [ - { - "key": "quicktest", - "operator": "In", - "values": ["quicktest"], - } - ] - } - ] - } - } - }, "containers": [ { "env": [ @@ -1321,23 +1197,6 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "labels": {"key": "value"}, }, "spec": { - "affinity": { - "nodeAffinity": { - "requiredDuringSchedulingIgnoredDuringExecution": { - "nodeSelectorTerms": [ - { - "matchExpressions": [ - { - "key": "quicktest", - "operator": "In", - "values": ["quicktest"], - } - ] - } - ] - } - } - }, "containers": [ { "env": [ @@ -1468,103 +1327,15 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "creationTimestamp": "2023-02-22T16:26:07Z", "generation": 1, "labels": { - "workload.codeflare.dev/appwrapper": "quicktest2", "controller-tools.k8s.io": "1.0", "resourceName": "quicktest2", "orderedinstance": "m4.xlarge_g4dn.xlarge", }, - "managedFields": [ - { - "apiVersion": "ray.io/v1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:labels": { - ".": {}, - "f:workload.codeflare.dev/appwrapper": {}, - "f:controller-tools.k8s.io": {}, - "f:resourceName": {}, - }, - "f:ownerReferences": { - ".": {}, - 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, - }, - }, - "f:spec": { - ".": {}, - "f:autoscalerOptions": { - ".": {}, - "f:idleTimeoutSeconds": {}, - "f:imagePullPolicy": {}, - "f:resources": { - ".": {}, - "f:limits": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, - "f:requests": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, - }, - "f:upscalingMode": {}, - }, - 
"f:enableInTreeAutoscaling": {}, - "f:headGroupSpec": { - ".": {}, - "f:rayStartParams": { - ".": {}, - "f:block": {}, - "f:dashboard-host": {}, - "f:num-gpus": {}, - }, - "f:serviceType": {}, - "f:template": { - ".": {}, - "f:spec": {".": {}, "f:containers": {}}, - }, - }, - "f:rayVersion": {}, - "f:workerGroupSpecs": {}, - }, - }, - "manager": "mcad-controller", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - { - "apiVersion": "ray.io/v1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:status": { - ".": {}, - "f:availableWorkerReplicas": {}, - "f:desiredWorkerReplicas": {}, - "f:endpoints": { - ".": {}, - "f:client": {}, - "f:dashboard": {}, - "f:gcs": {}, - }, - "f:lastUpdateTime": {}, - "f:maxWorkerReplicas": {}, - "f:minWorkerReplicas": {}, - "f:state": {}, - } - }, - "manager": "manager", - "operation": "Update", - "subresource": "status", - "time": "2023-02-22T16:26:16Z", - }, - ], "name": "quicktest2", "namespace": "ns", "ownerReferences": [ { - "apiVersion": "workload.codeflare.dev/v1beta1", + "apiVersion": "workload.codeflare.dev/v1beta2", "blockOwnerDeletion": True, "controller": True, "kind": "AppWrapper", @@ -1723,7 +1494,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None): def get_named_aw(group, version, namespace, plural, name): - aws = get_aw_obj("workload.codeflare.dev", "v1beta1", "ns", "appwrappers") + aws = get_aw_obj("workload.codeflare.dev", "v1beta2", "ns", "appwrappers") return aws["items"][0] @@ -1731,144 +1502,128 @@ def get_aw_obj(group, version, namespace, plural): api_obj1 = { "items": [ { - "apiVersion": "workload.codeflare.dev/v1beta1", + "apiVersion": "workload.codeflare.dev/v1beta2", "kind": "AppWrapper", "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"codeflare.dev/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1","kind":"RayCluster","metadata":{"labels":{"appwrapper.codeflare.dev":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray 
stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "workload.codeflare.dev/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": { - "f:GenericItems": {}, - "f:metadata": {}, - }, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, - }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - { - "apiVersion": "workload.codeflare.dev/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, - }, - }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - ], "name": "quicktest1", "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", }, "spec": { - "priority": 9, - "resources": { - "GenericItems": [ - { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, + "components": [ + { + "template": { + "apiVersion": "ray.io/v1", + "kind": "RayCluster", + "metadata": { + "labels": { + "controller-tools.k8s.io": "1.0", }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", + "name": "quicktest1", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, }, + "upscalingMode": "Default", }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1", - "kind": "RayCluster", - "metadata": { - 
"labels": { - "workload.codeflare.dev/appwrapper": "quicktest1", - "controller-tools.k8s.io": "1.0", + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", }, - "name": "quicktest1", - "namespace": "ns", - }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, - }, - "upscalingMode": "Default", + "serviceType": "ClusterIP", + "template": { + "spec": { + "containers": [ + { + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], + "resources": { + "limits": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + }, + } + ] + } }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, "rayStartParams": { "block": "true", - "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "serviceType": "ClusterIP", + "replicas": 1, "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, "spec": { "containers": [ { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -1880,322 +1635,190 @@ def get_aw_obj(group, version, namespace, plural): } } }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], + "name": "machine-learning", "resources": { "limits": { - "cpu": 2, - "memory": "8G", + "cpu": 1, + "memory": "2G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 2, - "memory": "8G", + "cpu": 1, + "memory": "2G", "nvidia.com/gpu": 0, }, }, } - ] - } - }, - }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, - "rayStartParams": { - "block": "true", - "num-gpus": "0", - }, - "replicas": 1, - "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, - "spec": { - "containers": [ - { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "machine-learning", - "resources": { - "limits": { - "cpu": 1, - "memory": "2G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 1, - "memory": "2G", - "nvidia.com/gpu": 0, - }, - }, - } - ], - }, + ], }, - } - ], - }, + }, + } + ], }, - "metadata": {}, - "priority": 0, - 
"priorityslope": 0, - "replicas": 1, }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "networking.k8s.io/v1", - "kind": "Ingress", - "metadata": { - "labels": { - "ingress-owner": "appwrapper-name", - }, - "name": "ray-dashboard-quicktest", - "namespace": "default", + }, + { + "template": { + "apiVersion": "networking.k8s.io/v1", + "kind": "Ingress", + "metadata": { + "labels": { + "ingress-owner": "appwrapper-name", }, - "spec": { - "ingressClassName": "nginx", - "rules": [ - { - "http": { - "paths": { - "backend": { - "service": { - "name": "quicktest-head-svc", - "port": { - "number": 8265 - }, - }, + "name": "ray-dashboard-quicktest", + "namespace": "default", + }, + "spec": { + "ingressClassName": "nginx", + "rules": [ + { + "http": { + "paths": { + "backend": { + "service": { + "name": "quicktest-head-svc", + "port": {"number": 8265}, }, - "pathType": "Prefix", - "path": "/", }, + "pathType": "Prefix", + "path": "/", }, - "host": "quicktest.awsroute.com", - } - ], - }, + }, + "host": "quicktest.awsroute.com", + } + ], }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, }, - ], - "Items": [], - "metadata": {}, - }, - "schedulingSpec": {}, - "service": {"spec": {}}, - }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", }, ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Running", - "systempriority": 9, + }, + "status": { + "phase": "Running", }, }, { - "apiVersion": "workload.codeflare.dev/v1beta1", + "apiVersion": "workload.codeflare.dev/v1beta2", "kind": "AppWrapper", "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"codeflare.dev/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1","kind":"RayCluster","metadata":{"labels":{"appwrapper.codeflare.dev":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray 
stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "workload.codeflare.dev/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": { - "f:GenericItems": {}, - "f:metadata": {}, - }, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, - }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - { - "apiVersion": "workload.codeflare.dev/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, - }, - }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - ], "name": "quicktest2", "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", }, "spec": { - "priority": 9, - "resources": { - "GenericItems": [ - { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, + "components": [ + { + "template": { + "apiVersion": "ray.io/v1", + "kind": "RayCluster", + "metadata": { + "labels": { + "controller-tools.k8s.io": "1.0", }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", + "name": "quicktest2", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, }, + "upscalingMode": "Default", }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1", - "kind": "RayCluster", - "metadata": { - 
"labels": { - "workload.codeflare.dev/appwrapper": "quicktest2", - "controller-tools.k8s.io": "1.0", + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", }, - "name": "quicktest2", - "namespace": "ns", - }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, - }, - "upscalingMode": "Default", + "serviceType": "ClusterIP", + "template": { + "spec": { + "containers": [ + { + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], + "resources": { + "limits": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 2, + "memory": "8G", + "nvidia.com/gpu": 0, + }, + }, + } + ] + } }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, "rayStartParams": { "block": "true", - "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "serviceType": "ClusterIP", + "replicas": 1, "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, "spec": { "containers": [ { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -2207,166 +1830,52 @@ def get_aw_obj(group, version, namespace, plural): } } }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], + "name": "machine-learning", "resources": { "limits": { - "cpu": 2, - "memory": "8G", + "cpu": 1, + "memory": "2G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 2, - "memory": "8G", + "cpu": 1, + "memory": "2G", "nvidia.com/gpu": 0, }, }, } - ] - } - }, - }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, - "rayStartParams": { - "block": "true", - "num-gpus": "0", + ], }, - "replicas": 1, - "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, - "spec": { - "containers": [ - { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], - "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "machine-learning", - "resources": { - "limits": { - "cpu": 1, - "memory": "2G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 1, - "memory": "2G", - "nvidia.com/gpu": 0, - }, - }, - } - ], - }, - }, - } - ], - }, + }, + } + ], }, - "metadata": {}, - "priority": 0, - 
"priorityslope": 0, - "replicas": 1, }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "route.openshift.io/v1", - "kind": "Route", - "metadata": { - "labels": { - "odh-ray-cluster-service": "quicktest-head-svc" - }, - "name": "ray-dashboard-quicktest", - "namespace": "default", + }, + { + "template": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" }, - "spec": { - "port": {"targetPort": "dashboard"}, - "to": { - "kind": "Service", - "name": "quicktest-head-svc", - }, + "name": "ray-dashboard-quicktest", + "namespace": "default", + }, + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", }, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, }, - ], - "Items": [], - "metadata": {}, - }, - "schedulingSpec": {}, - "service": {"spec": {}}, - }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", }, ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Pending", - "systempriority": 9, + }, + "status": { + "phase": "Suspended", }, }, ] @@ -2428,6 +1937,10 @@ def test_get_cluster_openshift(mocker): MagicMock(versions=[MagicMock(group_version="route.openshift.io/v1")]) ] mocker.patch("kubernetes.client.ApisApi", return_value=mock_api) + mocker.patch( + "codeflare_sdk.utils.generate_yaml.local_queue_exists", + return_value="true", + ) assert is_openshift_cluster() @@ -2437,7 +1950,7 @@ def custom_side_effect(group, version, namespace, plural, **kwargs): elif plural == "rayclusters": return get_ray_obj("ray.io", "v1", "ns", "rayclusters") elif plural == "appwrappers": - return get_aw_obj("workload.codeflare.dev", "v1beta1", "ns", "appwrappers") + return get_aw_obj("workload.codeflare.dev", "v1beta2", "ns", "appwrappers") elif plural == "localqueues": return get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues") @@ -2492,6 +2005,10 @@ def test_get_cluster(mocker): "kubernetes.client.NetworkingV1Api.list_namespaced_ingress", return_value=ingress_retrieval(cluster_name="quicktest", client_ing=True), ) + mocker.patch( + "codeflare_sdk.utils.generate_yaml.local_queue_exists", + return_value="true", + ) cluster = get_cluster("quicktest") cluster_config = cluster.config assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" @@ -2502,7 +2019,6 @@ def test_get_cluster(mocker): assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G" assert cluster_config.num_gpus == 0 - assert cluster_config.instascale assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -2521,6 +2037,10 @@ def test_get_cluster_no_mcad(mocker): 
"kubernetes.client.NetworkingV1Api.list_namespaced_ingress", return_value=ingress_retrieval(cluster_name="quicktest", client_ing=True), ) + mocker.patch( + "codeflare_sdk.utils.generate_yaml.local_queue_exists", + return_value="true", + ) cluster = get_cluster("quicktest") cluster_config = cluster.config assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" @@ -2531,7 +2051,6 @@ def test_get_cluster_no_mcad(mocker): assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G" assert cluster_config.num_gpus == 0 - assert cluster_config.instascale assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -2672,7 +2191,7 @@ def test_list_queue(mocker, capsys): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_obj_none, ) - list_all_queued("ns", mcad=True) + list_all_queued("ns", appwrapper=True) captured = capsys.readouterr() assert captured.out == ( "╭──────────────────────────────────────────────────────────────────────────────╮\n" @@ -2683,21 +2202,21 @@ def test_list_queue(mocker, capsys): "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", side_effect=get_aw_obj, ) - list_all_queued("ns", mcad=True) + list_all_queued("ns", appwrapper=True) captured = capsys.readouterr() assert captured.out == ( - "╭──────────────────────────╮\n" - "│ 🚀 Cluster Queue Status │\n" - "│ 🚀 │\n" - "│ +------------+---------+ │\n" - "│ | Name | Status | │\n" - "│ +============+=========+ │\n" - "│ | quicktest1 | running | │\n" - "│ | | | │\n" - "│ | quicktest2 | pending | │\n" - "│ | | | │\n" - "│ +------------+---------+ │\n" - "╰──────────────────────────╯\n" + "╭────────────────────────────╮\n" + "│ 🚀 Cluster Queue Status │\n" + "│ 🚀 │\n" + "│ +------------+-----------+ │\n" + "│ | Name | Status | │\n" + "│ +============+===========+ │\n" + "│ | quicktest1 | running | │\n" + "│ | | | │\n" + "│ | quicktest2 | suspended | │\n" + "│ | | | │\n" + "│ +------------+-----------+ │\n" + "╰────────────────────────────╯\n" ) @@ -2747,9 +2266,11 @@ def test_list_queue_rayclusters(mocker, capsys): def test_cluster_status(mocker): mocker.patch("kubernetes.client.ApisApi.get_api_versions") mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") - fake_aw = AppWrapper( - "test", AppWrapperStatus.FAILED, can_run=True, job_state="unused" + mocker.patch( + "codeflare_sdk.utils.generate_yaml.local_queue_exists", + return_value="true", ) + fake_aw = AppWrapper("test", AppWrapperStatus.FAILED) fake_ray = RayCluster( name="test", status=RayClusterStatus.UNKNOWN, @@ -2770,7 +2291,8 @@ def test_cluster_status(mocker): namespace="ns", image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, - mcad=True, + appwrapper=True, + local_queue="local_default_queue", ) ) mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) @@ -2786,29 +2308,24 @@ def test_cluster_status(mocker): assert status == CodeFlareClusterStatus.FAILED assert ready == False - fake_aw.status = AppWrapperStatus.DELETED - status, ready = cf.status() - assert status == CodeFlareClusterStatus.FAILED - assert ready == False - - fake_aw.status = AppWrapperStatus.PENDING + fake_aw.status = AppWrapperStatus.SUSPENDED status, ready = cf.status() assert status == CodeFlareClusterStatus.QUEUED assert ready == False - fake_aw.status = AppWrapperStatus.COMPLETED + fake_aw.status = 
AppWrapperStatus.RESUMING status, ready = cf.status() assert status == CodeFlareClusterStatus.STARTING assert ready == False - fake_aw.status = AppWrapperStatus.RUNNING_HOLD_COMPLETION + fake_aw.status = AppWrapperStatus.RESETTING status, ready = cf.status() assert status == CodeFlareClusterStatus.STARTING assert ready == False fake_aw.status = AppWrapperStatus.RUNNING status, ready = cf.status() - assert status == CodeFlareClusterStatus.STARTING + assert status == CodeFlareClusterStatus.UNKNOWN assert ready == False mocker.patch( @@ -2844,6 +2361,10 @@ def test_wait_ready(mocker, capsys): mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None) + mocker.patch( + "codeflare_sdk.utils.generate_yaml.local_queue_exists", + return_value="true", + ) mocker.patch.object( client.CustomObjectsApi, "list_namespaced_custom_object", @@ -2865,7 +2386,8 @@ def test_wait_ready(mocker, capsys): namespace="ns", image="quay.io/project-codeflare/ray:latest-py39-cu118", write_to_file=True, - mcad=True, + appwrapper=True, + local_queue="local-queue-default", ) ) try: @@ -2937,7 +2459,7 @@ def test_AWManager_creation(mocker): def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args): assert group == "workload.codeflare.dev" - assert version == "v1beta1" + assert version == "v1beta2" assert namespace == "ns" assert plural == "appwrappers" with open(f"{aw_dir}test.yaml") as f: @@ -2948,7 +2470,7 @@ def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args): def arg_check_aw_del_effect(group, version, namespace, plural, name, *args): assert group == "workload.codeflare.dev" - assert version == "v1beta1" + assert version == "v1beta2" assert namespace == "ns" assert plural == "appwrappers" assert name == "test" @@ -3255,7 +2777,6 @@ def test_rjc_list_jobs(ray_job_client, mocker): # Make sure to always keep this function last def test_cleanup(): os.remove(f"{aw_dir}unit-test-cluster.yaml") - os.remove(f"{aw_dir}prio-test-cluster.yaml") os.remove(f"{aw_dir}test.yaml") os.remove(f"{aw_dir}raytest2.yaml") os.remove(f"{aw_dir}unit-test-cluster-ray.yaml") diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py index 329df45ed..baa14aaca 100644 --- a/tests/unit_test_support.py +++ b/tests/unit_test_support.py @@ -14,8 +14,7 @@ def createClusterConfig(): min_memory=5, max_memory=6, num_gpus=7, - mcad=True, - instascale=True, + appwrapper=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], image="quay.io/project-codeflare/ray:latest-py39-cu118",
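
The updated tests above all assert the same v1beta2 invariants: apiVersion `workload.codeflare.dev/v1beta2`, wrapped resources under `spec.components[].template` rather than the old `spec.resources.GenericItems[].generictemplate`, and a `kueue.x-k8s.io/queue-name` label on the AppWrapper. A minimal sketch of that check, assuming PyYAML is installed and `aw.yaml` is a hypothetical path to an AppWrapper written by the SDK with `appwrapper=True` (the tests use `f"{aw_dir}unit-test-cluster.yaml"`):

```python
# Sketch of the v1beta2 shape check the tests repeat -- not part of the patch.
# Assumptions: PyYAML is available; "aw.yaml" is a hypothetical path to an
# AppWrapper generated by the SDK with appwrapper=True.
import yaml

with open("aw.yaml") as f:
    aw = yaml.safe_load(f)

# v1beta2 bumped the group version and moved wrapped resources from
# spec.resources.GenericItems[].generictemplate to spec.components[].template.
assert aw["apiVersion"] == "workload.codeflare.dev/v1beta2"
assert "resources" not in aw["spec"]
templates = [c["template"] for c in aw["spec"]["components"]]
assert any(t["kind"] == "RayCluster" for t in templates)

# Kueue admits the AppWrapper through its local queue label.
assert "kueue.x-k8s.io/queue-name" in aw["metadata"]["labels"]
```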