From e9786a22c5cf695f10aba1a18a3ac4f15c85d4cb Mon Sep 17 00:00:00 2001 From: codeflare-machine-account <138894154+codeflare-machine-account@users.noreply.github.com> Date: Fri, 27 Sep 2024 11:46:55 +0100 Subject: [PATCH] Changes in docs for release: v0.21.0 (#687) Co-authored-by: codeflare-machine-account --- .../cluster/cluster.html | 221 +++-- .../cluster/config.html | 94 ++- .../detailed-documentation/cluster/index.html | 9 + .../detailed-documentation/cluster/model.html | 74 +- .../cluster/widgets.html | 758 ++++++++++++++++++ docs/detailed-documentation/index.html | 1 + docs/detailed-documentation/job/ray_jobs.html | 13 +- .../utils/generate_yaml.html | 50 +- .../utils/pretty_print.html | 12 +- 9 files changed, 1071 insertions(+), 161 deletions(-) create mode 100644 docs/detailed-documentation/cluster/widgets.html diff --git a/docs/detailed-documentation/cluster/cluster.html b/docs/detailed-documentation/cluster/cluster.html index 5c8688eb5..5276ee942 100644 --- a/docs/detailed-documentation/cluster/cluster.html +++ b/docs/detailed-documentation/cluster/cluster.html @@ -51,6 +51,7 @@

Module codeflare_sdk.cluster.cluster

""" import re +import subprocess from time import sleep from typing import List, Optional, Tuple, Dict @@ -74,6 +75,10 @@

Module codeflare_sdk.cluster.cluster

RayCluster, RayClusterStatus, ) +from .widgets import ( + cluster_up_down_buttons, + is_notebook, +) from kubernetes import client, config from kubernetes.utils import parse_quantity import yaml @@ -103,6 +108,8 @@

Module codeflare_sdk.cluster.cluster

self.app_wrapper_yaml = self.create_app_wrapper() self._job_submission_client = None self.app_wrapper_name = self.config.name + if is_notebook(): + cluster_up_down_buttons(self) @property def _client_headers(self): @@ -188,8 +195,12 @@

Module codeflare_sdk.cluster.cluster

plural="appwrappers", body=aw, ) + print(f"AppWrapper: '{self.config.name}' has successfully been created") else: self._component_resources_up(namespace, api_instance) + print( + f"Ray Cluster: '{self.config.name}' has successfully been created" + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) @@ -230,8 +241,12 @@

Module codeflare_sdk.cluster.cluster

plural="appwrappers", name=self.app_wrapper_name, ) + print(f"AppWrapper: '{self.config.name}' has successfully been deleted") else: self._component_resources_down(namespace, api_instance) + print( + f"Ray Cluster: '{self.config.name}' has successfully been deleted" + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) @@ -480,6 +495,18 @@

Module codeflare_sdk.cluster.cluster

name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, + head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["memory"], + head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["memory"], num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" @@ -578,41 +605,25 @@

Module codeflare_sdk.cluster.cluster

def get_current_namespace(): # pragma: no cover - if api_config_handler() != None: - if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"): - try: - file = open( - "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r" - ) - active_context = file.readline().strip("\n") - return active_context - except Exception as e: - print("Unable to find current namespace") - return None - else: + if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"): + try: + file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") + active_context = file.readline().strip("\n") + return active_context + except Exception as e: print("Unable to find current namespace") - return None - else: - if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"): - try: - file = open( - "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r" - ) - active_context = file.readline().strip("\n") - return active_context - except Exception as e: - print( - "unable to gather namespace from /var/run/secrets/kubernetes.io/serviceaccount/namespace trying to gather from current context" - ) - else: - try: - _, active_context = config.list_kube_config_contexts(config_check()) - except Exception as e: - return _kube_api_error_handling(e) - try: - return active_context["context"]["namespace"] - except KeyError: - return None + + if api_config_handler() != None: + return None + print("trying to gather from current context") + try: + _, active_context = config.list_kube_config_contexts(config_check()) + except Exception as e: + return _kube_api_error_handling(e) + try: + return active_context["context"]["namespace"] + except KeyError: + return None def get_cluster( @@ -884,24 +895,33 @@

Module codeflare_sdk.cluster.cluster

name=rc["metadata"]["name"], status=status, # for now we are not using autoscaling so same replicas is fine - workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], - worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + num_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], - worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["requests"]["memory"], - worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ - 0 - ]["resources"]["limits"]["cpu"], + worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], worker_extended_resources=worker_extended_resources, namespace=rc["metadata"]["namespace"], - head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["cpu"], - head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["memory"], + head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["cpu"], + head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], + head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["requests"]["memory"], + head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["memory"], head_extended_resources=head_extended_resources, dashboard=dashboard_url, ) @@ -923,15 +943,18 @@

Module codeflare_sdk.cluster.cluster

ray = RayCluster( name=cluster.config.name, status=cluster.status(print_to_console=False)[0], - workers=cluster.config.num_workers, - worker_mem_min=cluster.config.worker_memory_requests, - worker_mem_max=cluster.config.worker_memory_limits, - worker_cpu=cluster.config.worker_cpu_requests, + num_workers=cluster.config.num_workers, + worker_mem_requests=cluster.config.worker_memory_requests, + worker_mem_limits=cluster.config.worker_memory_limits, + worker_cpu_requests=cluster.config.worker_cpu_requests, + worker_cpu_limits=cluster.config.worker_cpu_limits, worker_extended_resources=cluster.config.worker_extended_resource_requests, namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), - head_cpus=cluster.config.head_cpus, - head_mem=cluster.config.head_memory, + head_mem_requests=cluster.config.head_memory_requests, + head_mem_limits=cluster.config.head_memory_limits, + head_cpu_requests=cluster.config.head_cpu_requests, + head_cpu_limits=cluster.config.head_cpu_limits, head_extended_resources=cluster.config.head_extended_resource_requests, ) if ray.status == CodeFlareClusterStatus.READY: @@ -997,41 +1020,25 @@

Functions

Expand source code
def get_current_namespace():  # pragma: no cover
-    if api_config_handler() != None:
-        if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
-            try:
-                file = open(
-                    "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r"
-                )
-                active_context = file.readline().strip("\n")
-                return active_context
-            except Exception as e:
-                print("Unable to find current namespace")
-                return None
-        else:
+    if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
+        try:
+            file = open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r")
+            active_context = file.readline().strip("\n")
+            return active_context
+        except Exception as e:
             print("Unable to find current namespace")
-            return None
-    else:
-        if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"):
-            try:
-                file = open(
-                    "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r"
-                )
-                active_context = file.readline().strip("\n")
-                return active_context
-            except Exception as e:
-                print(
-                    "unable to gather namespace from /var/run/secrets/kubernetes.io/serviceaccount/namespace trying to gather from current context"
-                )
-        else:
-            try:
-                _, active_context = config.list_kube_config_contexts(config_check())
-            except Exception as e:
-                return _kube_api_error_handling(e)
-            try:
-                return active_context["context"]["namespace"]
-            except KeyError:
-                return None
+ + if api_config_handler() != None: + return None + print("trying to gather from current context") + try: + _, active_context = config.list_kube_config_contexts(config_check()) + except Exception as e: + return _kube_api_error_handling(e) + try: + return active_context["context"]["namespace"] + except KeyError: + return None
@@ -1123,6 +1130,8 @@

Classes

self.app_wrapper_yaml = self.create_app_wrapper() self._job_submission_client = None self.app_wrapper_name = self.config.name + if is_notebook(): + cluster_up_down_buttons(self) @property def _client_headers(self): @@ -1208,8 +1217,12 @@

Classes

plural="appwrappers", body=aw, ) + print(f"AppWrapper: '{self.config.name}' has successfully been created") else: self._component_resources_up(namespace, api_instance) + print( + f"Ray Cluster: '{self.config.name}' has successfully been created" + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) @@ -1250,8 +1263,12 @@

Classes

plural="appwrappers", name=self.app_wrapper_name, ) + print(f"AppWrapper: '{self.config.name}' has successfully been deleted") else: self._component_resources_down(namespace, api_instance) + print( + f"Ray Cluster: '{self.config.name}' has successfully been deleted" + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) @@ -1500,6 +1517,18 @@

Classes

name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, + head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["memory"], + head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["memory"], num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" @@ -1742,8 +1771,12 @@

Methods

plural="appwrappers", name=self.app_wrapper_name, ) + print(f"AppWrapper: '{self.config.name}' has successfully been deleted") else: self._component_resources_down(namespace, api_instance) + print( + f"Ray Cluster: '{self.config.name}' has successfully been deleted" + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e)
@@ -1779,6 +1812,18 @@

Methods

name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], machine_types=machine_types, + head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["memory"], + head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["memory"], num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" @@ -2022,8 +2067,12 @@

Methods

plural="appwrappers", body=aw, ) + print(f"AppWrapper: '{self.config.name}' has successfully been created") else: self._component_resources_up(namespace, api_instance) + print( + f"Ray Cluster: '{self.config.name}' has successfully been created" + ) except Exception as e: # pragma: no cover return _kube_api_error_handling(e) diff --git a/docs/detailed-documentation/cluster/config.html b/docs/detailed-documentation/cluster/config.html index 87ad3b77c..b329fb031 100644 --- a/docs/detailed-documentation/cluster/config.html +++ b/docs/detailed-documentation/cluster/config.html @@ -107,10 +107,16 @@

Module codeflare_sdk.cluster.config

name: str namespace: Optional[str] = None head_info: List[str] = field(default_factory=list) - head_cpus: Union[int, str] = 2 - head_memory: Union[int, str] = 8 + head_cpu_requests: Union[int, str] = 2 + head_cpu_limits: Union[int, str] = 2 + head_cpus: Optional[Union[int, str]] = None # Deprecating + head_memory_requests: Union[int, str] = 8 + head_memory_limits: Union[int, str] = 8 + head_memory: Optional[Union[int, str]] = None # Deprecating head_gpus: Optional[int] = None # Deprecating - head_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + head_extended_resource_requests: Dict[str, Union[str, int]] = field( + default_factory=dict + ) machine_types: List[str] = field( default_factory=list ) # ["m4.xlarge", "g4dn.xlarge"] @@ -132,7 +138,9 @@

Module codeflare_sdk.cluster.config

write_to_file: bool = False verify_tls: bool = True labels: Dict[str, str] = field(default_factory=dict) - worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + worker_extended_resource_requests: Dict[str, Union[str, int]] = field( + default_factory=dict + ) extended_resource_mapping: Dict[str, str] = field(default_factory=dict) overwrite_default_resource_mapping: bool = False local_queue: Optional[str] = None @@ -215,14 +223,21 @@

Module codeflare_sdk.cluster.config

self.worker_memory_limits = f"{self.worker_memory_limits}G" def _memory_to_string(self): - if isinstance(self.head_memory, int): - self.head_memory = f"{self.head_memory}G" + if isinstance(self.head_memory_requests, int): + self.head_memory_requests = f"{self.head_memory_requests}G" + if isinstance(self.head_memory_limits, int): + self.head_memory_limits = f"{self.head_memory_limits}G" if isinstance(self.worker_memory_requests, int): self.worker_memory_requests = f"{self.worker_memory_requests}G" if isinstance(self.worker_memory_limits, int): self.worker_memory_limits = f"{self.worker_memory_limits}G" def _cpu_to_resource(self): + if self.head_cpus: + warnings.warn( + "head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits" + ) + self.head_cpu_requests = self.head_cpu_limits = self.head_cpus if self.min_cpus: warnings.warn("min_cpus is being deprecated, use worker_cpu_requests") self.worker_cpu_requests = self.min_cpus @@ -231,6 +246,11 @@

Module codeflare_sdk.cluster.config

self.worker_cpu_limits = self.max_cpus def _memory_to_resource(self): + if self.head_memory: + warnings.warn( + "head_memory is being deprecated, use head_memory_requests and head_memory_limits" + ) + self.head_memory_requests = self.head_memory_limits = self.head_memory if self.min_memory: warnings.warn("min_memory is being deprecated, use worker_memory_requests") self.worker_memory_requests = f"{self.min_memory}G" @@ -282,7 +302,7 @@

Classes

class ClusterConfiguration -(name: str, namespace: Optional[str] = None, head_info: List[str] = <factory>, head_cpus: Union[int, str] = 2, head_memory: Union[int, str] = 8, head_gpus: Optional[int] = None, head_extended_resource_requests: Dict[str, int] = <factory>, machine_types: List[str] = <factory>, worker_cpu_requests: Union[int, str] = 1, worker_cpu_limits: Union[int, str] = 1, min_cpus: Union[int, str, ForwardRef(None)] = None, max_cpus: Union[int, str, ForwardRef(None)] = None, num_workers: int = 1, worker_memory_requests: Union[int, str] = 2, worker_memory_limits: Union[int, str] = 2, min_memory: Union[int, str, ForwardRef(None)] = None, max_memory: Union[int, str, ForwardRef(None)] = None, num_gpus: Optional[int] = None, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', appwrapper: bool = False, envs: Dict[str, str] = <factory>, image: str = '', image_pull_secrets: List[str] = <factory>, write_to_file: bool = False, verify_tls: bool = True, labels: Dict[str, str] = <factory>, worker_extended_resource_requests: Dict[str, int] = <factory>, extended_resource_mapping: Dict[str, str] = <factory>, overwrite_default_resource_mapping: bool = False, local_queue: Optional[str] = None) +(name: str, namespace: Optional[str] = None, head_info: List[str] = <factory>, head_cpu_requests: Union[int, str] = 2, head_cpu_limits: Union[int, str] = 2, head_cpus: Union[int, str, ForwardRef(None)] = None, head_memory_requests: Union[int, str] = 8, head_memory_limits: Union[int, str] = 8, head_memory: Union[int, str, ForwardRef(None)] = None, head_gpus: Optional[int] = None, head_extended_resource_requests: Dict[str, Union[str, int]] = <factory>, machine_types: List[str] = <factory>, worker_cpu_requests: Union[int, str] = 1, worker_cpu_limits: Union[int, str] = 1, min_cpus: Union[int, str, ForwardRef(None)] = None, max_cpus: Union[int, str, ForwardRef(None)] = None, num_workers: int = 1, worker_memory_requests: Union[int, str] = 2, worker_memory_limits: Union[int, str] = 2, min_memory: Union[int, str, ForwardRef(None)] = None, max_memory: Union[int, str, ForwardRef(None)] = None, num_gpus: Optional[int] = None, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', appwrapper: bool = False, envs: Dict[str, str] = <factory>, image: str = '', image_pull_secrets: List[str] = <factory>, write_to_file: bool = False, verify_tls: bool = True, labels: Dict[str, str] = <factory>, worker_extended_resource_requests: Dict[str, Union[str, int]] = <factory>, extended_resource_mapping: Dict[str, str] = <factory>, overwrite_default_resource_mapping: bool = False, local_queue: Optional[str] = None)

This dataclass is used to specify resource requirements and other details, and @@ -354,10 +374,16 @@

Classes

name: str namespace: Optional[str] = None head_info: List[str] = field(default_factory=list) - head_cpus: Union[int, str] = 2 - head_memory: Union[int, str] = 8 + head_cpu_requests: Union[int, str] = 2 + head_cpu_limits: Union[int, str] = 2 + head_cpus: Optional[Union[int, str]] = None # Deprecating + head_memory_requests: Union[int, str] = 8 + head_memory_limits: Union[int, str] = 8 + head_memory: Optional[Union[int, str]] = None # Deprecating head_gpus: Optional[int] = None # Deprecating - head_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + head_extended_resource_requests: Dict[str, Union[str, int]] = field( + default_factory=dict + ) machine_types: List[str] = field( default_factory=list ) # ["m4.xlarge", "g4dn.xlarge"] @@ -379,7 +405,9 @@

Classes

write_to_file: bool = False verify_tls: bool = True labels: Dict[str, str] = field(default_factory=dict) - worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict) + worker_extended_resource_requests: Dict[str, Union[str, int]] = field( + default_factory=dict + ) extended_resource_mapping: Dict[str, str] = field(default_factory=dict) overwrite_default_resource_mapping: bool = False local_queue: Optional[str] = None @@ -462,14 +490,21 @@

Classes

self.worker_memory_limits = f"{self.worker_memory_limits}G" def _memory_to_string(self): - if isinstance(self.head_memory, int): - self.head_memory = f"{self.head_memory}G" + if isinstance(self.head_memory_requests, int): + self.head_memory_requests = f"{self.head_memory_requests}G" + if isinstance(self.head_memory_limits, int): + self.head_memory_limits = f"{self.head_memory_limits}G" if isinstance(self.worker_memory_requests, int): self.worker_memory_requests = f"{self.worker_memory_requests}G" if isinstance(self.worker_memory_limits, int): self.worker_memory_limits = f"{self.worker_memory_limits}G" def _cpu_to_resource(self): + if self.head_cpus: + warnings.warn( + "head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits" + ) + self.head_cpu_requests = self.head_cpu_limits = self.head_cpus if self.min_cpus: warnings.warn("min_cpus is being deprecated, use worker_cpu_requests") self.worker_cpu_requests = self.min_cpus @@ -478,6 +513,11 @@

Classes

self.worker_cpu_limits = self.max_cpus def _memory_to_resource(self): + if self.head_memory: + warnings.warn( + "head_memory is being deprecated, use head_memory_requests and head_memory_limits" + ) + self.head_memory_requests = self.head_memory_limits = self.head_memory if self.min_memory: warnings.warn("min_memory is being deprecated, use worker_memory_requests") self.worker_memory_requests = f"{self.min_memory}G" @@ -531,11 +571,19 @@

Class variables

-
var head_cpus : Union[int, str]
+
var head_cpu_limits : Union[int, str]
+
+
+
+
var head_cpu_requests : Union[int, str]
+
+
+
+
var head_cpus : Union[int, str, ForwardRef(None)]
-
var head_extended_resource_requests : Dict[str, int]
+
var head_extended_resource_requests : Dict[str, Union[str, int]]
@@ -547,7 +595,15 @@

Class variables

-
var head_memory : Union[int, str]
+
var head_memory : Union[int, str, ForwardRef(None)]
+
+
+
+
var head_memory_limits : Union[int, str]
+
+
+
+
var head_memory_requests : Union[int, str]
@@ -623,7 +679,7 @@

Class variables

-
var worker_extended_resource_requests : Dict[str, int]
+
var worker_extended_resource_requests : Dict[str, Union[str, int]]
@@ -663,11 +719,15 @@

appwrapper
  • envs
  • extended_resource_mapping
  • +
  • head_cpu_limits
  • +
  • head_cpu_requests
  • head_cpus
  • head_extended_resource_requests
  • head_gpus
  • head_info
  • head_memory
  • +
  • head_memory_limits
  • +
  • head_memory_requests
  • image
  • image_pull_secrets
  • labels
  • diff --git a/docs/detailed-documentation/cluster/index.html b/docs/detailed-documentation/cluster/index.html index a7967885c..f8c04fa29 100644 --- a/docs/detailed-documentation/cluster/index.html +++ b/docs/detailed-documentation/cluster/index.html @@ -49,6 +49,10 @@

    Module codeflare_sdk.cluster

    list_all_clusters, ) +from .widgets import ( + view_clusters, +) + from .awload import AWManager @@ -81,6 +85,10 @@

    Sub-modules

    states and AppWrapper states, and CodeFlare cluster states, as well as …

    +
    codeflare_sdk.cluster.widgets
    +
    +

    The widgets sub-module contains the ui widgets created using the ipywidgets package.

    +
    @@ -108,6 +116,7 @@

    Index

  • codeflare_sdk.cluster.cluster
  • codeflare_sdk.cluster.config
  • codeflare_sdk.cluster.model
  • +
  • codeflare_sdk.cluster.widgets
  • diff --git a/docs/detailed-documentation/cluster/model.html b/docs/detailed-documentation/cluster/model.html index a07027da2..7d87e34f8 100644 --- a/docs/detailed-documentation/cluster/model.html +++ b/docs/detailed-documentation/cluster/model.html @@ -54,6 +54,7 @@

    Module codeflare_sdk.cluster.model

    from dataclasses import dataclass, field from enum import Enum import typing +from typing import Union class RayClusterStatus(Enum): @@ -106,12 +107,15 @@

    Module codeflare_sdk.cluster.model

    name: str status: RayClusterStatus - head_cpus: int - head_mem: str - workers: int - worker_mem_min: str - worker_mem_max: str - worker_cpu: int + head_cpu_requests: int + head_cpu_limits: int + head_mem_requests: str + head_mem_limits: str + num_workers: int + worker_mem_requests: str + worker_mem_limits: str + worker_cpu_requests: Union[int, str] + worker_cpu_limits: Union[int, str] namespace: str dashboard: str worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) @@ -293,7 +297,7 @@

    Class variables

    class RayCluster -(name: str, status: RayClusterStatus, head_cpus: int, head_mem: str, workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, namespace: str, dashboard: str, worker_extended_resources: Dict[str, int] = <factory>, head_extended_resources: Dict[str, int] = <factory>) +(name: str, status: RayClusterStatus, head_cpu_requests: int, head_cpu_limits: int, head_mem_requests: str, head_mem_limits: str, num_workers: int, worker_mem_requests: str, worker_mem_limits: str, worker_cpu_requests: Union[int, str], worker_cpu_limits: Union[int, str], namespace: str, dashboard: str, worker_extended_resources: Dict[str, int] = <factory>, head_extended_resources: Dict[str, int] = <factory>)

    For storing information about a Ray cluster.

    @@ -309,12 +313,15 @@

    Class variables

    name: str status: RayClusterStatus - head_cpus: int - head_mem: str - workers: int - worker_mem_min: str - worker_mem_max: str - worker_cpu: int + head_cpu_requests: int + head_cpu_limits: int + head_mem_requests: str + head_mem_limits: str + num_workers: int + worker_mem_requests: str + worker_mem_limits: str + worker_cpu_requests: Union[int, str] + worker_cpu_limits: Union[int, str] namespace: str dashboard: str worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) @@ -326,7 +333,11 @@

    Class variables

    -
    var head_cpus : int
    +
    var head_cpu_limits : int
    +
    +
    +
    +
    var head_cpu_requests : int
    @@ -334,7 +345,11 @@

    Class variables

    -
    var head_mem : str
    +
    var head_mem_limits : str
    +
    +
    +
    +
    var head_mem_requests : str
    @@ -346,27 +361,31 @@

    Class variables

    +
    var num_workers : int
    +
    +
    +
    var statusRayClusterStatus
    -
    var worker_cpu : int
    +
    var worker_cpu_limits : Union[int, str]
    -
    var worker_extended_resources : Dict[str, int]
    +
    var worker_cpu_requests : Union[int, str]
    -
    var worker_mem_max : str
    +
    var worker_extended_resources : Dict[str, int]
    -
    var worker_mem_min : str
    +
    var worker_mem_limits : str
    -
    var workers : int
    +
    var worker_mem_requests : str
    @@ -474,17 +493,20 @@

    RayCluster

  • diff --git a/docs/detailed-documentation/cluster/widgets.html b/docs/detailed-documentation/cluster/widgets.html new file mode 100644 index 000000000..e07fa2ea2 --- /dev/null +++ b/docs/detailed-documentation/cluster/widgets.html @@ -0,0 +1,758 @@ + + + + + + +codeflare_sdk.cluster.widgets API documentation + + + + + + + + + + + +
    +
    +
    +

    Module codeflare_sdk.cluster.widgets

    +
    +
    +

    The widgets sub-module contains the ui widgets created using the ipywidgets package.

    +
    + +Expand source code + +
    # Copyright 2024 IBM, Red Hat
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#      http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +"""
    +The widgets sub-module contains the ui widgets created using the ipywidgets package.
    +"""
    +import contextlib
    +import io
    +import os
    +import warnings
    +import time
    +import codeflare_sdk
    +from kubernetes import client
    +from kubernetes.client.rest import ApiException
    +import ipywidgets as widgets
    +from IPython.display import display, HTML, Javascript
    +import pandas as pd
    +from .config import ClusterConfiguration
    +from .model import RayClusterStatus
    +from ..utils.kube_api_helpers import _kube_api_error_handling
    +from .auth import config_check, api_config_handler
    +
    +
    +def cluster_up_down_buttons(cluster: "codeflare_sdk.cluster.Cluster") -> widgets.Button:
    +    """
    +    The cluster_up_down_buttons function returns two button widgets for a create and delete button.
    +    The function uses the appwrapper bool to distinguish between resource type for the tool tip.
    +    """
    +    resource = "Ray Cluster"
    +    if cluster.config.appwrapper:
    +        resource = "AppWrapper"
    +
    +    up_button = widgets.Button(
    +        description="Cluster Up",
    +        tooltip=f"Create the {resource}",
    +        icon="play",
    +    )
    +
    +    delete_button = widgets.Button(
    +        description="Cluster Down",
    +        tooltip=f"Delete the {resource}",
    +        icon="trash",
    +    )
    +
    +    wait_ready_check = wait_ready_check_box()
    +    output = widgets.Output()
    +
    +    # Display the buttons in an HBox wrapped in a VBox which includes the wait_ready Checkbox
    +    button_display = widgets.HBox([up_button, delete_button])
    +    display(widgets.VBox([button_display, wait_ready_check]), output)
    +
    +    def on_up_button_clicked(b):  # Handle the up button click event
    +        with output:
    +            output.clear_output()
    +            cluster.up()
    +
    +            # If the wait_ready Checkbox is clicked(value == True) trigger the wait_ready function
    +            if wait_ready_check.value:
    +                cluster.wait_ready()
    +
    +    def on_down_button_clicked(b):  # Handle the down button click event
    +        with output:
    +            output.clear_output()
    +            cluster.down()
    +
    +    up_button.on_click(on_up_button_clicked)
    +    delete_button.on_click(on_down_button_clicked)
    +
    +
    +def wait_ready_check_box():
    +    """
    +    The wait_ready_check_box function will return a checkbox widget used for waiting for the resource to be in the state READY.
    +    """
    +    wait_ready_check_box = widgets.Checkbox(
    +        False,
    +        description="Wait for Cluster?",
    +    )
    +    return wait_ready_check_box
    +
    +
    +def is_notebook() -> bool:
    +    """
    +    The is_notebook function checks if Jupyter Notebook environment variables exist in the given environment and return True/False based on that.
    +    """
    +    if (
    +        "PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING" in os.environ
    +        or "JPY_SESSION_NAME" in os.environ
    +    ):  # If running Jupyter NBs in VsCode or RHOAI/ODH display UI buttons
    +        return True
    +    else:
    +        return False
    +
    +
    +def view_clusters(namespace: str = None):
    +    """
    +    view_clusters function will display existing clusters with their specs, and handle user interactions.
    +    """
    +    if not is_notebook():
    +        warnings.warn(
    +            "view_clusters can only be used in a Jupyter Notebook environment."
    +        )
    +        return  # Exit function if not in Jupyter Notebook
    +
    +    from .cluster import get_current_namespace
    +
    +    if not namespace:
    +        namespace = get_current_namespace()
    +
    +    user_output = widgets.Output()
    +    raycluster_data_output = widgets.Output()
    +    url_output = widgets.Output()
    +
    +    ray_clusters_df = _fetch_cluster_data(namespace)
    +    if ray_clusters_df.empty:
    +        print(f"No clusters found in the {namespace} namespace.")
    +        return
    +
    +    classification_widget = widgets.ToggleButtons(
    +        options=ray_clusters_df["Name"].tolist(),
    +        value=ray_clusters_df["Name"].tolist()[0],
    +        description="Select an existing cluster:",
    +    )
    +    # Setting the initial value to trigger the event handler to display the cluster details.
    +    initial_value = classification_widget.value
    +    _on_cluster_click(
    +        {"new": initial_value}, raycluster_data_output, namespace, classification_widget
    +    )
    +    classification_widget.observe(
    +        lambda selection_change: _on_cluster_click(
    +            selection_change, raycluster_data_output, namespace, classification_widget
    +        ),
    +        names="value",
    +    )
    +
    +    # UI table buttons
    +    delete_button = widgets.Button(
    +        description="Delete Cluster",
    +        icon="trash",
    +        tooltip="Delete the selected cluster",
    +    )
    +    delete_button.on_click(
    +        lambda b: _on_delete_button_click(
    +            b,
    +            classification_widget,
    +            ray_clusters_df,
    +            raycluster_data_output,
    +            user_output,
    +            delete_button,
    +            list_jobs_button,
    +            ray_dashboard_button,
    +        )
    +    )
    +
    +    list_jobs_button = widgets.Button(
    +        description="View Jobs", icon="suitcase", tooltip="Open the Ray Job Dashboard"
    +    )
    +    list_jobs_button.on_click(
    +        lambda b: _on_list_jobs_button_click(
    +            b, classification_widget, ray_clusters_df, user_output, url_output
    +        )
    +    )
    +
    +    ray_dashboard_button = widgets.Button(
    +        description="Open Ray Dashboard",
    +        icon="dashboard",
    +        tooltip="Open the Ray Dashboard in a new tab",
    +        layout=widgets.Layout(width="auto"),
    +    )
    +    ray_dashboard_button.on_click(
    +        lambda b: _on_ray_dashboard_button_click(
    +            b, classification_widget, ray_clusters_df, user_output, url_output
    +        )
    +    )
    +
    +    display(widgets.VBox([classification_widget, raycluster_data_output]))
    +    display(
    +        widgets.HBox([delete_button, list_jobs_button, ray_dashboard_button]),
    +        url_output,
    +        user_output,
    +    )
    +
    +
    +def _on_cluster_click(
    +    selection_change,
    +    raycluster_data_output: widgets.Output,
    +    namespace: str,
    +    classification_widget: widgets.ToggleButtons,
    +):
    +    """
    +    _on_cluster_click handles the event when a cluster is selected from the toggle buttons, updating the output with cluster details.
    +    """
    +    new_value = selection_change["new"]
    +    raycluster_data_output.clear_output()
    +    ray_clusters_df = _fetch_cluster_data(namespace)
    +    classification_widget.options = ray_clusters_df["Name"].tolist()
    +    with raycluster_data_output:
    +        display(
    +            HTML(
    +                ray_clusters_df[ray_clusters_df["Name"] == new_value][
    +                    [
    +                        "Name",
    +                        "Namespace",
    +                        "Num Workers",
    +                        "Head GPUs",
    +                        "Head CPU Req~Lim",
    +                        "Head Memory Req~Lim",
    +                        "Worker GPUs",
    +                        "Worker CPU Req~Lim",
    +                        "Worker Memory Req~Lim",
    +                        "status",
    +                    ]
    +                ].to_html(escape=False, index=False, border=2)
    +            )
    +        )
    +
    +
    +def _on_delete_button_click(
    +    b,
    +    classification_widget: widgets.ToggleButtons,
    +    ray_clusters_df: pd.DataFrame,
    +    raycluster_data_output: widgets.Output,
    +    user_output: widgets.Output,
    +    delete_button: widgets.Button,
    +    list_jobs_button: widgets.Button,
    +    ray_dashboard_button: widgets.Button,
    +):
    +    """
    +    _on_delete_button_click handles the event when the Delete Button is clicked, deleting the selected cluster.
    +    """
    +    cluster_name = classification_widget.value
    +    namespace = ray_clusters_df[ray_clusters_df["Name"] == classification_widget.value][
    +        "Namespace"
    +    ].values[0]
    +
    +    _delete_cluster(cluster_name, namespace)
    +
    +    with user_output:
    +        user_output.clear_output()
    +        print(
    +            f"Cluster {cluster_name} in the {namespace} namespace was deleted successfully."
    +        )
    +
    +    # Refresh the dataframe
    +    new_df = _fetch_cluster_data(namespace)
    +    if new_df.empty:
    +        classification_widget.close()
    +        delete_button.close()
    +        list_jobs_button.close()
    +        ray_dashboard_button.close()
    +        with raycluster_data_output:
    +            raycluster_data_output.clear_output()
    +            print(f"No clusters found in the {namespace} namespace.")
    +    else:
    +        classification_widget.options = new_df["Name"].tolist()
    +
    +
    +def _on_ray_dashboard_button_click(
    +    b,
    +    classification_widget: widgets.ToggleButtons,
    +    ray_clusters_df: pd.DataFrame,
    +    user_output: widgets.Output,
    +    url_output: widgets.Output,
    +):
    +    """
    +    _on_ray_dashboard_button_click handles the event when the Open Ray Dashboard button is clicked, opening the Ray Dashboard in a new tab
    +    """
    +    from codeflare_sdk.cluster import Cluster
    +
    +    cluster_name = classification_widget.value
    +    namespace = ray_clusters_df[ray_clusters_df["Name"] == classification_widget.value][
    +        "Namespace"
    +    ].values[0]
    +
    +    # Suppress from Cluster Object initialisation widgets and outputs
    +    with widgets.Output(), contextlib.redirect_stdout(
    +        io.StringIO()
    +    ), contextlib.redirect_stderr(io.StringIO()):
    +        cluster = Cluster(ClusterConfiguration(cluster_name, namespace))
    +    dashboard_url = cluster.cluster_dashboard_uri()
    +
    +    with user_output:
    +        user_output.clear_output()
    +        print(f"Opening Ray Dashboard for {cluster_name} cluster:\n{dashboard_url}")
    +    with url_output:
    +        display(Javascript(f'window.open("{dashboard_url}", "_blank");'))
    +
    +
    +def _on_list_jobs_button_click(
    +    b,
    +    classification_widget: widgets.ToggleButtons,
    +    ray_clusters_df: pd.DataFrame,
    +    user_output: widgets.Output,
    +    url_output: widgets.Output,
    +):
    +    """
    +    _on_list_jobs_button_click handles the event when the View Jobs button is clicked, opening the Ray Jobs Dashboard in a new tab
    +    """
    +    from codeflare_sdk.cluster import Cluster
    +
    +    cluster_name = classification_widget.value
    +    namespace = ray_clusters_df[ray_clusters_df["Name"] == classification_widget.value][
    +        "Namespace"
    +    ].values[0]
    +
    +    # Suppress from Cluster Object initialisation widgets and outputs
    +    with widgets.Output(), contextlib.redirect_stdout(
    +        io.StringIO()
    +    ), contextlib.redirect_stderr(io.StringIO()):
    +        cluster = Cluster(ClusterConfiguration(cluster_name, namespace))
    +    dashboard_url = cluster.cluster_dashboard_uri()
    +
    +    with user_output:
    +        user_output.clear_output()
    +        print(
    +            f"Opening Ray Jobs Dashboard for {cluster_name} cluster:\n{dashboard_url}/#/jobs"
    +        )
    +    with url_output:
    +        display(Javascript(f'window.open("{dashboard_url}/#/jobs", "_blank");'))
    +
    +
    +def _delete_cluster(
    +    cluster_name: str,
    +    namespace: str,
    +    timeout: int = 5,
    +    interval: int = 1,
    +):
    +    """
    +    _delete_cluster function deletes the cluster with the given name and namespace.
    +    It optionally waits for the cluster to be deleted.
    +    """
    +    from .cluster import _check_aw_exists
    +
    +    try:
    +        config_check()
    +        api_instance = client.CustomObjectsApi(api_config_handler())
    +
    +        if _check_aw_exists(cluster_name, namespace):
    +            api_instance.delete_namespaced_custom_object(
    +                group="workload.codeflare.dev",
    +                version="v1beta2",
    +                namespace=namespace,
    +                plural="appwrappers",
    +                name=cluster_name,
    +            )
    +            group = "workload.codeflare.dev"
    +            version = "v1beta2"
    +            plural = "appwrappers"
    +        else:
    +            api_instance.delete_namespaced_custom_object(
    +                group="ray.io",
    +                version="v1",
    +                namespace=namespace,
    +                plural="rayclusters",
    +                name=cluster_name,
    +            )
    +            group = "ray.io"
    +            version = "v1"
    +            plural = "rayclusters"
    +
    +        # Wait for the resource to be deleted
    +        while timeout > 0:
    +            try:
    +                api_instance.get_namespaced_custom_object(
    +                    group=group,
    +                    version=version,
    +                    namespace=namespace,
    +                    plural=plural,
    +                    name=cluster_name,
    +                )
    +                # Retry if resource still exists
    +                time.sleep(interval)
    +                timeout -= interval
    +                if timeout <= 0:
    +                    raise TimeoutError(
    +                        f"Timeout waiting for {cluster_name} to be deleted."
    +                    )
    +            except ApiException as e:
    +                # Resource is deleted
    +                if e.status == 404:
    +                    break
    +    except Exception as e:  # pragma: no cover
    +        return _kube_api_error_handling(e)
    +
    +
    +def _fetch_cluster_data(namespace):
    +    """
    +    _fetch_cluster_data function fetches all clusters and their spec in a given namespace and returns a DataFrame.
    +    """
    +    from .cluster import list_all_clusters
    +
    +    rayclusters = list_all_clusters(namespace, False)
    +    if not rayclusters:
    +        return pd.DataFrame()
    +    names = [item.name for item in rayclusters]
    +    namespaces = [item.namespace for item in rayclusters]
    +    num_workers = [item.num_workers for item in rayclusters]
    +    head_extended_resources = [
    +        f"{list(item.head_extended_resources.keys())[0]}: {list(item.head_extended_resources.values())[0]}"
    +        if item.head_extended_resources
    +        else "0"
    +        for item in rayclusters
    +    ]
    +    worker_extended_resources = [
    +        f"{list(item.worker_extended_resources.keys())[0]}: {list(item.worker_extended_resources.values())[0]}"
    +        if item.worker_extended_resources
    +        else "0"
    +        for item in rayclusters
    +    ]
    +    head_cpu_requests = [
    +        item.head_cpu_requests if item.head_cpu_requests else 0 for item in rayclusters
    +    ]
    +    head_cpu_limits = [
    +        item.head_cpu_limits if item.head_cpu_limits else 0 for item in rayclusters
    +    ]
    +    head_cpu_rl = [
    +        f"{requests}~{limits}"
    +        for requests, limits in zip(head_cpu_requests, head_cpu_limits)
    +    ]
    +    head_mem_requests = [
    +        item.head_mem_requests if item.head_mem_requests else 0 for item in rayclusters
    +    ]
    +    head_mem_limits = [
    +        item.head_mem_limits if item.head_mem_limits else 0 for item in rayclusters
    +    ]
    +    head_mem_rl = [
    +        f"{requests}~{limits}"
    +        for requests, limits in zip(head_mem_requests, head_mem_limits)
    +    ]
    +    worker_cpu_requests = [
    +        item.worker_cpu_requests if item.worker_cpu_requests else 0
    +        for item in rayclusters
    +    ]
    +    worker_cpu_limits = [
    +        item.worker_cpu_limits if item.worker_cpu_limits else 0 for item in rayclusters
    +    ]
    +    worker_cpu_rl = [
    +        f"{requests}~{limits}"
    +        for requests, limits in zip(worker_cpu_requests, worker_cpu_limits)
    +    ]
    +    worker_mem_requests = [
    +        item.worker_mem_requests if item.worker_mem_requests else 0
    +        for item in rayclusters
    +    ]
    +    worker_mem_limits = [
    +        item.worker_mem_limits if item.worker_mem_limits else 0 for item in rayclusters
    +    ]
    +    worker_mem_rl = [
    +        f"{requests}~{limits}"
    +        for requests, limits in zip(worker_mem_requests, worker_mem_limits)
    +    ]
    +    status = [item.status.name for item in rayclusters]
    +
    +    status = [_format_status(item.status) for item in rayclusters]
    +
    +    data = {
    +        "Name": names,
    +        "Namespace": namespaces,
    +        "Num Workers": num_workers,
    +        "Head GPUs": head_extended_resources,
    +        "Worker GPUs": worker_extended_resources,
    +        "Head CPU Req~Lim": head_cpu_rl,
    +        "Head Memory Req~Lim": head_mem_rl,
    +        "Worker CPU Req~Lim": worker_cpu_rl,
    +        "Worker Memory Req~Lim": worker_mem_rl,
    +        "status": status,
    +    }
    +    return pd.DataFrame(data)
    +
    +
    +def _format_status(status):
    +    """
    +    _format_status function formats the status enum.
    +    """
    +    status_map = {
    +        RayClusterStatus.READY: '<span style="color: green;">Ready ✓</span>',
    +        RayClusterStatus.SUSPENDED: '<span style="color: #007BFF;">Suspended ❄️</span>',
    +        RayClusterStatus.FAILED: '<span style="color: red;">Failed ✗</span>',
    +        RayClusterStatus.UNHEALTHY: '<span style="color: purple;">Unhealthy</span>',
    +        RayClusterStatus.UNKNOWN: '<span style="color: purple;">Unknown</span>',
    +    }
    +    return status_map.get(status, status)
    +
    +
    +
    +
    +
    +
    +
    +

    Functions

    +
    +
    +def cluster_up_down_buttons(cluster: codeflare_sdk.cluster.Cluster) ‑> ipywidgets.widgets.widget_button.Button +
    +
    +

    The cluster_up_down_buttons function returns two button widgets for a create and delete button. +The function uses the appwrapper bool to distinguish between resource type for the tool tip.

    +
    + +Expand source code + +
    def cluster_up_down_buttons(cluster: "codeflare_sdk.cluster.Cluster") -> widgets.Button:
    +    """
    +    The cluster_up_down_buttons function returns two button widgets for a create and delete button.
    +    The function uses the appwrapper bool to distinguish between resource type for the tool tip.
    +    """
    +    resource = "Ray Cluster"
    +    if cluster.config.appwrapper:
    +        resource = "AppWrapper"
    +
    +    up_button = widgets.Button(
    +        description="Cluster Up",
    +        tooltip=f"Create the {resource}",
    +        icon="play",
    +    )
    +
    +    delete_button = widgets.Button(
    +        description="Cluster Down",
    +        tooltip=f"Delete the {resource}",
    +        icon="trash",
    +    )
    +
    +    wait_ready_check = wait_ready_check_box()
    +    output = widgets.Output()
    +
    +    # Display the buttons in an HBox wrapped in a VBox which includes the wait_ready Checkbox
    +    button_display = widgets.HBox([up_button, delete_button])
    +    display(widgets.VBox([button_display, wait_ready_check]), output)
    +
    +    def on_up_button_clicked(b):  # Handle the up button click event
    +        with output:
    +            output.clear_output()
    +            cluster.up()
    +
    +            # If the wait_ready Checkbox is clicked(value == True) trigger the wait_ready function
    +            if wait_ready_check.value:
    +                cluster.wait_ready()
    +
    +    def on_down_button_clicked(b):  # Handle the down button click event
    +        with output:
    +            output.clear_output()
    +            cluster.down()
    +
    +    up_button.on_click(on_up_button_clicked)
    +    delete_button.on_click(on_down_button_clicked)
    +
    +
    +
    +def is_notebook() ‑> bool +
    +
    +

    The is_notebook function checks if Jupyter Notebook environment variables exist in the given environment and return True/False based on that.

    +
    + +Expand source code + +
    def is_notebook() -> bool:
    +    """
    +    The is_notebook function checks if Jupyter Notebook environment variables exist in the given environment and return True/False based on that.
    +    """
    +    if (
    +        "PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING" in os.environ
    +        or "JPY_SESSION_NAME" in os.environ
    +    ):  # If running Jupyter NBs in VsCode or RHOAI/ODH display UI buttons
    +        return True
    +    else:
    +        return False
    +
    +
    +
    +def view_clusters(namespace: str = None) +
    +
    +

    view_clusters function will display existing clusters with their specs, and handle user interactions.

    +
    + +Expand source code + +
    def view_clusters(namespace: str = None):
    +    """
    +    view_clusters function will display existing clusters with their specs, and handle user interactions.
    +    """
    +    if not is_notebook():
    +        warnings.warn(
    +            "view_clusters can only be used in a Jupyter Notebook environment."
    +        )
    +        return  # Exit function if not in Jupyter Notebook
    +
    +    from .cluster import get_current_namespace
    +
    +    if not namespace:
    +        namespace = get_current_namespace()
    +
    +    user_output = widgets.Output()
    +    raycluster_data_output = widgets.Output()
    +    url_output = widgets.Output()
    +
    +    ray_clusters_df = _fetch_cluster_data(namespace)
    +    if ray_clusters_df.empty:
    +        print(f"No clusters found in the {namespace} namespace.")
    +        return
    +
    +    classification_widget = widgets.ToggleButtons(
    +        options=ray_clusters_df["Name"].tolist(),
    +        value=ray_clusters_df["Name"].tolist()[0],
    +        description="Select an existing cluster:",
    +    )
    +    # Setting the initial value to trigger the event handler to display the cluster details.
    +    initial_value = classification_widget.value
    +    _on_cluster_click(
    +        {"new": initial_value}, raycluster_data_output, namespace, classification_widget
    +    )
    +    classification_widget.observe(
    +        lambda selection_change: _on_cluster_click(
    +            selection_change, raycluster_data_output, namespace, classification_widget
    +        ),
    +        names="value",
    +    )
    +
    +    # UI table buttons
    +    delete_button = widgets.Button(
    +        description="Delete Cluster",
    +        icon="trash",
    +        tooltip="Delete the selected cluster",
    +    )
    +    delete_button.on_click(
    +        lambda b: _on_delete_button_click(
    +            b,
    +            classification_widget,
    +            ray_clusters_df,
    +            raycluster_data_output,
    +            user_output,
    +            delete_button,
    +            list_jobs_button,
    +            ray_dashboard_button,
    +        )
    +    )
    +
    +    list_jobs_button = widgets.Button(
    +        description="View Jobs", icon="suitcase", tooltip="Open the Ray Job Dashboard"
    +    )
    +    list_jobs_button.on_click(
    +        lambda b: _on_list_jobs_button_click(
    +            b, classification_widget, ray_clusters_df, user_output, url_output
    +        )
    +    )
    +
    +    ray_dashboard_button = widgets.Button(
    +        description="Open Ray Dashboard",
    +        icon="dashboard",
    +        tooltip="Open the Ray Dashboard in a new tab",
    +        layout=widgets.Layout(width="auto"),
    +    )
    +    ray_dashboard_button.on_click(
    +        lambda b: _on_ray_dashboard_button_click(
    +            b, classification_widget, ray_clusters_df, user_output, url_output
    +        )
    +    )
    +
    +    display(widgets.VBox([classification_widget, raycluster_data_output]))
    +    display(
    +        widgets.HBox([delete_button, list_jobs_button, ray_dashboard_button]),
    +        url_output,
    +        user_output,
    +    )
    +
    +
    +
    +def wait_ready_check_box() +
    +
    +

    The wait_ready_check_box function will return a checkbox widget used for waiting for the resource to be in the state READY.

    +
    + +Expand source code + +
    def wait_ready_check_box():
    +    """
    +    The wait_ready_check_box function will return a checkbox widget used for waiting for the resource to be in the state READY.
    +    """
    +    wait_ready_check_box = widgets.Checkbox(
    +        False,
    +        description="Wait for Cluster?",
    +    )
    +    return wait_ready_check_box
    +
    +
    +
    +
    +
    +
    +
    + +
    + + + diff --git a/docs/detailed-documentation/index.html b/docs/detailed-documentation/index.html index fd74344f1..450007196 100644 --- a/docs/detailed-documentation/index.html +++ b/docs/detailed-documentation/index.html @@ -42,6 +42,7 @@

    Package codeflare_sdk

    get_cluster, list_all_queued, list_all_clusters, + view_clusters, ) from .job import RayJobClient diff --git a/docs/detailed-documentation/job/ray_jobs.html b/docs/detailed-documentation/job/ray_jobs.html index 01fa5fb51..20002e27e 100644 --- a/docs/detailed-documentation/job/ray_jobs.html +++ b/docs/detailed-documentation/job/ray_jobs.html @@ -47,6 +47,7 @@

    Module codeflare_sdk.job.ray_jobs

    The ray_jobs sub-module contains methods needed to submit jobs and connect to Ray Clusters that were not created by CodeFlare. The SDK acts as a wrapper for the Ray Job Submission Client. """ + from ray.job_submission import JobSubmissionClient from ray.dashboard.modules.job.pydantic_models import JobDetails from typing import Iterator, Optional, Dict, Any, Union, List @@ -93,6 +94,7 @@

    Module codeflare_sdk.job.ray_jobs

    submission_id: Optional[str] = None, entrypoint_num_cpus: Optional[Union[int, float]] = None, entrypoint_num_gpus: Optional[Union[int, float]] = None, + entrypoint_memory: Optional[int] = None, entrypoint_resources: Optional[Dict[str, float]] = None, ) -> str: """ @@ -106,6 +108,7 @@

    Module codeflare_sdk.job.ray_jobs

    job_id -- DEPRECATED. This has been renamed to submission_id entrypoint_num_cpus -- The quantity of CPU cores to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_num_gpus -- The quantity of GPUs to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. + entrypoint_memory –- The quantity of memory to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_resources -- The quantity of custom resources to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. """ return self.rayJobClient.submit_job( @@ -116,6 +119,7 @@

    Module codeflare_sdk.job.ray_jobs

    submission_id=submission_id, entrypoint_num_cpus=entrypoint_num_cpus, entrypoint_num_gpus=entrypoint_num_gpus, + entrypoint_memory=entrypoint_memory, entrypoint_resources=entrypoint_resources, ) @@ -248,6 +252,7 @@

    Classes

    submission_id: Optional[str] = None, entrypoint_num_cpus: Optional[Union[int, float]] = None, entrypoint_num_gpus: Optional[Union[int, float]] = None, + entrypoint_memory: Optional[int] = None, entrypoint_resources: Optional[Dict[str, float]] = None, ) -> str: """ @@ -261,6 +266,7 @@

    Classes

    job_id -- DEPRECATED. This has been renamed to submission_id entrypoint_num_cpus -- The quantity of CPU cores to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_num_gpus -- The quantity of GPUs to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. + entrypoint_memory –- The quantity of memory to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_resources -- The quantity of custom resources to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. """ return self.rayJobClient.submit_job( @@ -271,6 +277,7 @@

    Classes

    submission_id=submission_id, entrypoint_num_cpus=entrypoint_num_cpus, entrypoint_num_gpus=entrypoint_num_gpus, + entrypoint_memory=entrypoint_memory, entrypoint_resources=entrypoint_resources, ) @@ -461,7 +468,7 @@

    Methods

    -def submit_job(self, entrypoint: str, job_id: Optional[str] = None, runtime_env: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, str]] = None, submission_id: Optional[str] = None, entrypoint_num_cpus: Union[int, float, ForwardRef(None)] = None, entrypoint_num_gpus: Union[int, float, ForwardRef(None)] = None, entrypoint_resources: Optional[Dict[str, float]] = None) ‑> str +def submit_job(self, entrypoint: str, job_id: Optional[str] = None, runtime_env: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, str]] = None, submission_id: Optional[str] = None, entrypoint_num_cpus: Union[int, float, ForwardRef(None)] = None, entrypoint_num_gpus: Union[int, float, ForwardRef(None)] = None, entrypoint_memory: Optional[int] = None, entrypoint_resources: Optional[Dict[str, float]] = None) ‑> str

    Method for submitting jobs to a Ray Cluster and returning the job id with entrypoint being a mandatory field.

    @@ -473,6 +480,7 @@

    Methods

    job_id – DEPRECATED. This has been renamed to submission_id entrypoint_num_cpus – The quantity of CPU cores to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_num_gpus – The quantity of GPUs to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. +entrypoint_memory –- The quantity of memory to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_resources – The quantity of custom resources to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it.

    @@ -487,6 +495,7 @@

    Methods

    submission_id: Optional[str] = None, entrypoint_num_cpus: Optional[Union[int, float]] = None, entrypoint_num_gpus: Optional[Union[int, float]] = None, + entrypoint_memory: Optional[int] = None, entrypoint_resources: Optional[Dict[str, float]] = None, ) -> str: """ @@ -500,6 +509,7 @@

    Methods

    job_id -- DEPRECATED. This has been renamed to submission_id entrypoint_num_cpus -- The quantity of CPU cores to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_num_gpus -- The quantity of GPUs to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. + entrypoint_memory –- The quantity of memory to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. Defaults to 0. entrypoint_resources -- The quantity of custom resources to reserve for the execution of the entrypoint command, separately from any tasks or actors launched by it. """ return self.rayJobClient.submit_job( @@ -510,6 +520,7 @@

    Methods

    submission_id=submission_id, entrypoint_num_cpus=entrypoint_num_cpus, entrypoint_num_gpus=entrypoint_num_gpus, + entrypoint_memory=entrypoint_memory, entrypoint_resources=entrypoint_resources, )
    diff --git a/docs/detailed-documentation/utils/generate_yaml.html b/docs/detailed-documentation/utils/generate_yaml.html index 7b41e3c99..60ce89dfe 100644 --- a/docs/detailed-documentation/utils/generate_yaml.html +++ b/docs/detailed-documentation/utils/generate_yaml.html @@ -146,22 +146,22 @@

    Module codeflare_sdk.utils.generate_yaml

    def update_resources( spec, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, + cpu_requests, + cpu_limits, + memory_requests, + memory_limits, custom_resources, ): container = spec.get("containers") for resource in container: requests = resource.get("resources").get("requests") if requests is not None: - requests["cpu"] = worker_cpu_requests - requests["memory"] = worker_memory_requests + requests["cpu"] = cpu_requests + requests["memory"] = memory_requests limits = resource.get("resources").get("limits") if limits is not None: - limits["cpu"] = worker_cpu_limits - limits["memory"] = worker_memory_limits + limits["cpu"] = cpu_limits + limits["memory"] = memory_limits for k in custom_resources.keys(): limits[k] = custom_resources[k] requests[k] = custom_resources[k] @@ -241,10 +241,10 @@

    Module codeflare_sdk.utils.generate_yaml

    # TODO: Eventually add head node configuration outside of template update_resources( spec, - cluster.config.head_cpus, - cluster.config.head_cpus, - cluster.config.head_memory, - cluster.config.head_memory, + cluster.config.head_cpu_requests, + cluster.config.head_cpu_limits, + cluster.config.head_memory_requests, + cluster.config.head_memory_limits, cluster.config.head_extended_resource_requests, ) else: @@ -812,10 +812,10 @@

    Functions

    # TODO: Eventually add head node configuration outside of template update_resources( spec, - cluster.config.head_cpus, - cluster.config.head_cpus, - cluster.config.head_memory, - cluster.config.head_memory, + cluster.config.head_cpu_requests, + cluster.config.head_cpu_limits, + cluster.config.head_memory_requests, + cluster.config.head_memory_limits, cluster.config.head_extended_resource_requests, ) else: @@ -830,7 +830,7 @@

    Functions

    -def update_resources(spec, worker_cpu_requests, worker_cpu_limits, worker_memory_requests, worker_memory_limits, custom_resources) +def update_resources(spec, cpu_requests, cpu_limits, memory_requests, memory_limits, custom_resources)
    @@ -840,22 +840,22 @@

    Functions

    def update_resources(
         spec,
    -    worker_cpu_requests,
    -    worker_cpu_limits,
    -    worker_memory_requests,
    -    worker_memory_limits,
    +    cpu_requests,
    +    cpu_limits,
    +    memory_requests,
    +    memory_limits,
         custom_resources,
     ):
         container = spec.get("containers")
         for resource in container:
             requests = resource.get("resources").get("requests")
             if requests is not None:
    -            requests["cpu"] = worker_cpu_requests
    -            requests["memory"] = worker_memory_requests
    +            requests["cpu"] = cpu_requests
    +            requests["memory"] = memory_requests
             limits = resource.get("resources").get("limits")
             if limits is not None:
    -            limits["cpu"] = worker_cpu_limits
    -            limits["memory"] = worker_memory_limits
    +            limits["cpu"] = cpu_limits
    +            limits["memory"] = memory_limits
             for k in custom_resources.keys():
                 limits[k] = custom_resources[k]
                 requests[k] = custom_resources[k]
    diff --git a/docs/detailed-documentation/utils/pretty_print.html b/docs/detailed-documentation/utils/pretty_print.html index cbffd1223..f2a8d7db9 100644 --- a/docs/detailed-documentation/utils/pretty_print.html +++ b/docs/detailed-documentation/utils/pretty_print.html @@ -166,9 +166,9 @@

    Module codeflare_sdk.utils.pretty_print

    ) name = cluster.name dashboard = cluster.dashboard - workers = str(cluster.workers) - memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}" - cpu = str(cluster.worker_cpu) + workers = str(cluster.num_workers) + memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}" + cpu = f"{cluster.worker_cpu_requests}~{cluster.worker_cpu_limits}" gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0)) #'table0' to display the cluster name, status, url, and dashboard link @@ -344,9 +344,9 @@

    Functions

    ) name = cluster.name dashboard = cluster.dashboard - workers = str(cluster.workers) - memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}" - cpu = str(cluster.worker_cpu) + workers = str(cluster.num_workers) + memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}" + cpu = f"{cluster.worker_cpu_requests}~{cluster.worker_cpu_limits}" gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0)) #'table0' to display the cluster name, status, url, and dashboard link