diff --git a/sky/cli.py b/sky/cli.py index 69064475ea7..3e5f551d0ee 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -23,7 +23,6 @@ listed in "sky --help". Take care to put logically connected commands close to each other. """ - import copy import datetime import functools @@ -82,10 +81,10 @@ if typing.TYPE_CHECKING: from sky.backends import backend as backend_lib -pd = adaptors_common.LazyImport("pandas") +pd = adaptors_common.LazyImport('pandas') logger = sky_logging.init_logger(__name__) -_CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) _CLUSTER_FLAG_HELP = """\ A cluster name. If provided, either reuse an existing cluster with that name or @@ -97,19 +96,15 @@ _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = ( - "{cluster_num} cluster{plural} {verb}. Please specify {cause} " - "cluster to show its {property}.\nUsage: `sky status --{flag} `" -) + '{cluster_num} cluster{plural} {verb}. Please specify {cause} ' + 'cluster to show its {property}.\nUsage: `sky status --{flag} `') -_ENDPOINTS_RETRY_MESSAGE = ( - "If the cluster was recently started, " "please retry after a while." -) +_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, ' + 'please retry after a while.') -_DAG_NOT_SUPPORTED_MESSAGE = ( - "YAML specifies a DAG which is only supported by " - "`sky jobs launch`. `{command}` supports a " - "single task only." -) +_DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by ' + '`sky jobs launch`. `{command}` supports a ' + 'single task only.') def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: @@ -118,7 +113,7 @@ def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]: for cluster in clusters: glob_cluster = global_user_state.get_glob_cluster_names(cluster) if len(glob_cluster) == 0 and not silent: - click.echo(f"Cluster {cluster} not found.") + click.echo(f'Cluster {cluster} not found.') glob_clusters.extend(glob_cluster) return list(set(glob_clusters)) @@ -129,153 +124,121 @@ def _get_glob_storages(storages: List[str]) -> List[str]: for storage_object in storages: glob_storage = global_user_state.get_glob_storage_name(storage_object) if len(glob_storage) == 0: - click.echo(f"Storage {storage_object} not found.") + click.echo(f'Storage {storage_object} not found.') glob_storages.extend(glob_storage) return list(set(glob_storages)) def _parse_env_var(env_var: str) -> Tuple[str, str]: """Parse env vars into a (KEY, VAL) pair.""" - if "=" not in env_var: + if '=' not in env_var: value = os.environ.get(env_var) if value is None: - raise click.UsageError(f"{env_var} is not set in local environment.") + raise click.UsageError( + f'{env_var} is not set in local environment.') return (env_var, value) - ret = tuple(env_var.split("=", 1)) + ret = tuple(env_var.split('=', 1)) if len(ret) != 2: raise click.UsageError( - f"Invalid env var: {env_var}. Must be in the form of KEY=VAL " "or KEY." - ) + f'Invalid env var: {env_var}. 
Must be in the form of KEY=VAL ' + 'or KEY.') return ret[0], ret[1] -def _merge_env_vars( - env_dict: Optional[Dict[str, str]], env_list: List[Tuple[str, str]] -) -> List[Tuple[str, str]]: +def _merge_env_vars(env_dict: Optional[Dict[str, str]], + env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]: """Merges all values from env_list into env_dict.""" if not env_dict: return env_list - for key, value in env_list: + for (key, value) in env_list: env_dict[key] = value return list(env_dict.items()) _TASK_OPTIONS = [ click.option( - "--workdir", + '--workdir', required=False, type=click.Path(exists=True, file_okay=False), - help=( - "If specified, sync this dir to the remote working directory, " - "where the task will be invoked. " - 'Overrides the "workdir" config in the YAML if both are supplied.' - ), - ), + help=('If specified, sync this dir to the remote working directory, ' + 'where the task will be invoked. ' + 'Overrides the "workdir" config in the YAML if both are supplied.' + )), click.option( - "--cloud", + '--cloud', required=False, type=str, - help=( - 'The cloud to use. If specified, overrides the "resources.cloud" ' - 'config. Passing "none" resets the config.' - ), - ), + help=('The cloud to use. If specified, overrides the "resources.cloud" ' + 'config. Passing "none" resets the config.')), click.option( - "--region", + '--region', required=False, type=str, - help=( - "The region to use. If specified, overrides the " - '"resources.region" config. Passing "none" resets the config.' - ), - ), + help=('The region to use. If specified, overrides the ' + '"resources.region" config. Passing "none" resets the config.')), click.option( - "--zone", + '--zone', required=False, type=str, - help=( - "The zone to use. If specified, overrides the " - '"resources.zone" config. Passing "none" resets the config.' - ), - ), + help=('The zone to use. If specified, overrides the ' + '"resources.zone" config. Passing "none" resets the config.')), click.option( - "--num-nodes", + '--num-nodes', required=False, type=int, - help=( - "Number of nodes to execute the task on. " - 'Overrides the "num_nodes" config in the YAML if both are ' - "supplied." - ), - ), + help=('Number of nodes to execute the task on. ' + 'Overrides the "num_nodes" config in the YAML if both are ' + 'supplied.')), click.option( - "--cpus", + '--cpus', default=None, type=str, required=False, - help=( - "Number of vCPUs each instance must have (e.g., " - "``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). " - "This is used to automatically select the instance type." - ), - ), + help=('Number of vCPUs each instance must have (e.g., ' + '``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). ' + 'This is used to automatically select the instance type.')), click.option( - "--memory", + '--memory', default=None, type=str, required=False, help=( - "Amount of memory each instance must have in GB (e.g., " - "``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))" - ), - ), - click.option( - "--disk-size", - default=None, - type=int, - required=False, - help=("OS disk size in GBs."), - ), - click.option( - "--disk-tier", - default=None, - type=click.Choice( - resources_utils.DiskTier.supported_tiers(), case_sensitive=False - ), - required=False, - help=resources_utils.DiskTier.cli_help_message(), - ), - click.option( - "--use-spot/--no-use-spot", - required=False, - default=None, - help=( - "Whether to request spot instances. If specified, overrides the " - '"resources.use_spot" config.' 
- ), - ), + 'Amount of memory each instance must have in GB (e.g., ' + '``--memory=16`` (exactly 16GB), ``--memory=16+`` (at least 16GB))' + )), + click.option('--disk-size', + default=None, + type=int, + required=False, + help=('OS disk size in GBs.')), + click.option('--disk-tier', + default=None, + type=click.Choice(resources_utils.DiskTier.supported_tiers(), + case_sensitive=False), + required=False, + help=resources_utils.DiskTier.cli_help_message()), click.option( - "--image-id", + '--use-spot/--no-use-spot', required=False, default=None, - help=( - "Custom image id for launching the instances. " - 'Passing "none" resets the config.' - ), - ), - click.option( - "--env-file", - required=False, - type=dotenv.dotenv_values, - help="""\ + help=('Whether to request spot instances. If specified, overrides the ' + '"resources.use_spot" config.')), + click.option('--image-id', + required=False, + default=None, + help=('Custom image id for launching the instances. ' + 'Passing "none" resets the config.')), + click.option('--env-file', + required=False, + type=dotenv.dotenv_values, + help="""\ Path to a dotenv file with environment variables to set on the remote node. If any values from ``--env-file`` conflict with values set by - ``--env``, the ``--env`` value will be preferred.""", - ), + ``--env``, the ``--env`` value will be preferred."""), click.option( - "--env", + '--env', required=False, type=_parse_env_var, multiple=True, @@ -293,92 +256,79 @@ def _merge_env_vars( 3. ``--env MY_ENV3``: set ``$MY_ENV3`` on the cluster to be the same value of ``$MY_ENV3`` in the local environment.""", - ), + ) ] _TASK_OPTIONS_WITH_NAME = [ - click.option( - "--name", - "-n", - required=False, - type=str, - help=( - 'Task name. Overrides the "name" ' - "config in the YAML if both are supplied." - ), - ), + click.option('--name', + '-n', + required=False, + type=str, + help=('Task name. Overrides the "name" ' + 'config in the YAML if both are supplied.')), ] + _TASK_OPTIONS _EXTRA_RESOURCES_OPTIONS = [ click.option( - "--gpus", + '--gpus', required=False, type=str, - help=( - "Type and number of GPUs to use. Example values: " - '"V100:8", "V100" (short for a count of 1), or "V100:0.5" ' - "(fractional counts are supported by the scheduling framework). " - "If a new cluster is being launched by this command, this is the " - "resources to provision. If an existing cluster is being reused, this" - " is seen as the task demand, which must fit the cluster's total " - "resources and is used for scheduling the task. " - 'Overrides the "accelerators" ' - "config in the YAML if both are supplied. " - 'Passing "none" resets the config.' - ), - ), + help= + ('Type and number of GPUs to use. Example values: ' + '"V100:8", "V100" (short for a count of 1), or "V100:0.5" ' + '(fractional counts are supported by the scheduling framework). ' + 'If a new cluster is being launched by this command, this is the ' + 'resources to provision. If an existing cluster is being reused, this' + ' is seen as the task demand, which must fit the cluster\'s total ' + 'resources and is used for scheduling the task. ' + 'Overrides the "accelerators" ' + 'config in the YAML if both are supplied. ' + 'Passing "none" resets the config.')), click.option( - "--instance-type", - "-t", + '--instance-type', + '-t', required=False, type=str, - help=( - "The instance type to use. If specified, overrides the " - '"resources.instance_type" config. Passing "none" resets the ' - "config." - ), + help=('The instance type to use. 
If specified, overrides the ' + '"resources.instance_type" config. Passing "none" resets the ' + 'config.'), ), click.option( - "--ports", + '--ports', required=False, type=str, multiple=True, - help=( - "Ports to open on the cluster. " - 'If specified, overrides the "ports" config in the YAML. ' - ), + help=('Ports to open on the cluster. ' + 'If specified, overrides the "ports" config in the YAML. '), ), ] -def _complete_cluster_name( - ctx: click.Context, param: click.Parameter, incomplete: str -) -> List[str]: +def _complete_cluster_name(ctx: click.Context, param: click.Parameter, + incomplete: str) -> List[str]: """Handle shell completion for cluster names.""" del ctx, param # Unused. return global_user_state.get_cluster_names_start_with(incomplete) -def _complete_storage_name( - ctx: click.Context, param: click.Parameter, incomplete: str -) -> List[str]: +def _complete_storage_name(ctx: click.Context, param: click.Parameter, + incomplete: str) -> List[str]: """Handle shell completion for storage names.""" del ctx, param # Unused. return global_user_state.get_storage_names_start_with(incomplete) -def _complete_file_name( - ctx: click.Context, param: click.Parameter, incomplete: str -) -> List[str]: +def _complete_file_name(ctx: click.Context, param: click.Parameter, + incomplete: str) -> List[str]: """Handle shell completion for file names. Returns a special completion marker that tells click to use the shell's default file completion. """ del ctx, param # Unused. - return [click.shell_completion.CompletionItem(incomplete, type="file")] + return [click.shell_completion.CompletionItem(incomplete, type='file')] def _get_click_major_version(): - return int(click.__version__.split(".", maxsplit=1)[0]) + return int(click.__version__.split('.', maxsplit=1)[0]) def _get_shell_complete_args(complete_fn): @@ -388,49 +338,49 @@ def _get_shell_complete_args(complete_fn): return {} -_RELOAD_ZSH_CMD = "source ~/.zshrc" -_RELOAD_FISH_CMD = "source ~/.config/fish/config.fish" -_RELOAD_BASH_CMD = "source ~/.bashrc" +_RELOAD_ZSH_CMD = 'source ~/.zshrc' +_RELOAD_FISH_CMD = 'source ~/.config/fish/config.fish' +_RELOAD_BASH_CMD = 'source ~/.bashrc' -def _install_shell_completion(ctx: click.Context, param: click.Parameter, value: str): +def _install_shell_completion(ctx: click.Context, param: click.Parameter, + value: str): """A callback for installing shell completion for click.""" del param # Unused. if not value or ctx.resilient_parsing: return - if value == "auto": - if "SHELL" not in os.environ: + if value == 'auto': + if 'SHELL' not in os.environ: click.secho( - "Cannot auto-detect shell. Please specify shell explicitly.", fg="red" - ) + 'Cannot auto-detect shell. Please specify shell explicitly.', + fg='red') ctx.exit() else: - value = os.path.basename(os.environ["SHELL"]) + value = os.path.basename(os.environ['SHELL']) - zshrc_diff = "\n# For SkyPilot shell completion\n. ~/.sky/.sky-complete.zsh" - bashrc_diff = "\n# For SkyPilot shell completion" "\n. ~/.sky/.sky-complete.bash" + zshrc_diff = '\n# For SkyPilot shell completion\n. ~/.sky/.sky-complete.zsh' + bashrc_diff = ('\n# For SkyPilot shell completion' + '\n. ~/.sky/.sky-complete.bash') - if value == "bash": + if value == 'bash': install_cmd = f'_SKY_COMPLETE=bash_source sky > \ ~/.sky/.sky-complete.bash && \ echo "{bashrc_diff}" >> ~/.bashrc' - cmd = ( - f'(grep -q "SkyPilot" ~/.bashrc) || ' - f"([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || " - f'(echo "Bash must be version 4 or above." 
&& exit 1))' - ) + cmd = (f'(grep -q "SkyPilot" ~/.bashrc) || ' + f'([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || ' + f'(echo "Bash must be version 4 or above." && exit 1))') reload_cmd = _RELOAD_BASH_CMD - elif value == "fish": - cmd = "_SKY_COMPLETE=fish_source sky > \ - ~/.config/fish/completions/sky.fish" + elif value == 'fish': + cmd = '_SKY_COMPLETE=fish_source sky > \ + ~/.config/fish/completions/sky.fish' reload_cmd = _RELOAD_FISH_CMD - elif value == "zsh": + elif value == 'zsh': install_cmd = f'_SKY_COMPLETE=zsh_source sky > \ ~/.sky/.sky-complete.zsh && \ echo "{zshrc_diff}" >> ~/.zshrc' @@ -439,48 +389,51 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter, value: reload_cmd = _RELOAD_ZSH_CMD else: - click.secho(f"Unsupported shell: {value}", fg="red") + click.secho(f'Unsupported shell: {value}', fg='red') ctx.exit() try: - subprocess.run(cmd, shell=True, check=True, executable=shutil.which("bash")) - click.secho(f"Shell completion installed for {value}", fg="green") + subprocess.run(cmd, + shell=True, + check=True, + executable=shutil.which('bash')) + click.secho(f'Shell completion installed for {value}', fg='green') click.echo( - "Completion will take effect once you restart the terminal: " - + click.style(f"{reload_cmd}", bold=True) - ) + 'Completion will take effect once you restart the terminal: ' + + click.style(f'{reload_cmd}', bold=True)) except subprocess.CalledProcessError as e: - click.secho(f"> Installation failed with code {e.returncode}", fg="red") + click.secho(f'> Installation failed with code {e.returncode}', fg='red') ctx.exit() -def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, value: str): +def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, + value: str): """A callback for uninstalling shell completion for click.""" del param # Unused. if not value or ctx.resilient_parsing: return - if value == "auto": - if "SHELL" not in os.environ: + if value == 'auto': + if 'SHELL' not in os.environ: click.secho( - "Cannot auto-detect shell. Please specify shell explicitly.", fg="red" - ) + 'Cannot auto-detect shell. 
Please specify shell explicitly.', + fg='red') ctx.exit() else: - value = os.path.basename(os.environ["SHELL"]) + value = os.path.basename(os.environ['SHELL']) - if value == "bash": + if value == 'bash': cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.bashrc && \ sed -i"" -e "/sky-complete.bash/d" ~/.bashrc && \ rm -f ~/.sky/.sky-complete.bash' reload_cmd = _RELOAD_BASH_CMD - elif value == "fish": - cmd = "rm -f ~/.config/fish/completions/sky.fish" + elif value == 'fish': + cmd = 'rm -f ~/.config/fish/completions/sky.fish' reload_cmd = _RELOAD_FISH_CMD - elif value == "zsh": + elif value == 'zsh': cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.zshrc && \ sed -i"" -e "/sky-complete.zsh/d" ~/.zshrc && \ rm -f ~/.sky/.sky-complete.zsh' @@ -488,18 +441,17 @@ def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter, valu reload_cmd = _RELOAD_ZSH_CMD else: - click.secho(f"Unsupported shell: {value}", fg="red") + click.secho(f'Unsupported shell: {value}', fg='red') ctx.exit() try: subprocess.run(cmd, shell=True, check=True) - click.secho(f"Shell completion uninstalled for {value}", fg="green") - click.echo( - "Changes will take effect once you restart the terminal: " - + click.style(f"{reload_cmd}", bold=True) - ) + click.secho(f'Shell completion uninstalled for {value}', fg='green') + click.echo('Changes will take effect once you restart the terminal: ' + + click.style(f'{reload_cmd}', bold=True)) except subprocess.CalledProcessError as e: - click.secho(f"> Uninstallation failed with code {e.returncode}", fg="red") + click.secho(f'> Uninstallation failed with code {e.returncode}', + fg='red') ctx.exit() @@ -515,72 +467,71 @@ def _add_options(func): def _parse_override_params( - cloud: Optional[str] = None, - region: Optional[str] = None, - zone: Optional[str] = None, - gpus: Optional[str] = None, - cpus: Optional[str] = None, - memory: Optional[str] = None, - instance_type: Optional[str] = None, - use_spot: Optional[bool] = None, - image_id: Optional[str] = None, - disk_size: Optional[int] = None, - disk_tier: Optional[str] = None, - ports: Optional[Tuple[str]] = None, -) -> Dict[str, Any]: + cloud: Optional[str] = None, + region: Optional[str] = None, + zone: Optional[str] = None, + gpus: Optional[str] = None, + cpus: Optional[str] = None, + memory: Optional[str] = None, + instance_type: Optional[str] = None, + use_spot: Optional[bool] = None, + image_id: Optional[str] = None, + disk_size: Optional[int] = None, + disk_tier: Optional[str] = None, + ports: Optional[Tuple[str]] = None) -> Dict[str, Any]: """Parses the override parameters into a dictionary.""" override_params: Dict[str, Any] = {} if cloud is not None: - if cloud.lower() == "none": - override_params["cloud"] = None + if cloud.lower() == 'none': + override_params['cloud'] = None else: - override_params["cloud"] = sky_clouds.CLOUD_REGISTRY.from_str(cloud) + override_params['cloud'] = sky_clouds.CLOUD_REGISTRY.from_str(cloud) if region is not None: - if region.lower() == "none": - override_params["region"] = None + if region.lower() == 'none': + override_params['region'] = None else: - override_params["region"] = region + override_params['region'] = region if zone is not None: - if zone.lower() == "none": - override_params["zone"] = None + if zone.lower() == 'none': + override_params['zone'] = None else: - override_params["zone"] = zone + override_params['zone'] = zone if gpus is not None: - if gpus.lower() == "none": - override_params["accelerators"] = None + if gpus.lower() == 'none': + 
override_params['accelerators'] = None else: - override_params["accelerators"] = gpus + override_params['accelerators'] = gpus if cpus is not None: - if cpus.lower() == "none": - override_params["cpus"] = None + if cpus.lower() == 'none': + override_params['cpus'] = None else: - override_params["cpus"] = cpus + override_params['cpus'] = cpus if memory is not None: - if memory.lower() == "none": - override_params["memory"] = None + if memory.lower() == 'none': + override_params['memory'] = None else: - override_params["memory"] = memory + override_params['memory'] = memory if instance_type is not None: - if instance_type.lower() == "none": - override_params["instance_type"] = None + if instance_type.lower() == 'none': + override_params['instance_type'] = None else: - override_params["instance_type"] = instance_type + override_params['instance_type'] = instance_type if use_spot is not None: - override_params["use_spot"] = use_spot + override_params['use_spot'] = use_spot if image_id is not None: - if image_id.lower() == "none": - override_params["image_id"] = None + if image_id.lower() == 'none': + override_params['image_id'] = None else: - override_params["image_id"] = image_id + override_params['image_id'] = image_id if disk_size is not None: - override_params["disk_size"] = disk_size + override_params['disk_size'] = disk_size if disk_tier is not None: - if disk_tier.lower() == "none": - override_params["disk_tier"] = None + if disk_tier.lower() == 'none': + override_params['disk_tier'] = None else: - override_params["disk_tier"] = disk_tier + override_params['disk_tier'] = disk_tier if ports: - override_params["ports"] = ports + override_params['ports'] = ports return override_params @@ -603,12 +554,11 @@ def _launch_with_confirm( if cluster is None: cluster = backend_utils.generate_cluster_name() - clone_source_str = "" + clone_source_str = '' if clone_disk_from is not None: - clone_source_str = f" from the disk of {clone_disk_from!r}" + clone_source_str = f' from the disk of {clone_disk_from!r}' task, _ = backend_utils.check_can_clone_disk_and_override_task( - clone_disk_from, cluster, task - ) + clone_disk_from, cluster, task) with sky.Dag() as dag: dag.add(task) @@ -618,15 +568,13 @@ def _launch_with_confirm( # Show the optimize log before the prompt if the cluster does not exist. try: sky_check.get_cached_enabled_clouds_or_refresh( - raise_if_no_cloud_access=True - ) + raise_if_no_cloud_access=True) except exceptions.NoCloudAccessError as e: # Catch the exception where the public cloud is not enabled, and # make it yellow for better visibility. with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f"{colorama.Fore.YELLOW}{e}" f"{colorama.Style.RESET_ALL}" - ) from e + raise RuntimeError(f'{colorama.Fore.YELLOW}{e}' + f'{colorama.Style.RESET_ALL}') from e dag = sky.optimize(dag) task = dag.tasks[0] @@ -639,18 +587,18 @@ def _launch_with_confirm( # it exists but is STOPPED. prompt = None if maybe_status is None: - cluster_str = "" if cluster is None else f" {cluster!r}" + cluster_str = '' if cluster is None else f' {cluster!r}' prompt = ( - f"Launching a new cluster{cluster_str}{clone_source_str}. " "Proceed?" - ) + f'Launching a new cluster{cluster_str}{clone_source_str}. ' + 'Proceed?') elif maybe_status == status_lib.ClusterStatus.STOPPED: - prompt = f"Restarting the stopped cluster {cluster!r}. Proceed?" + prompt = f'Restarting the stopped cluster {cluster!r}. Proceed?' 
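# ---------------------------------------------------------------------------
# A minimal sketch of the 'none'-reset pattern that _parse_override_params()
# (reformatted in the hunk above) repeats for each flag: a value of "none"
# (case-insensitive) clears that resource override, any other value sets it,
# and an omitted flag leaves the YAML config untouched. `_set_or_reset` is a
# hypothetical condensation for illustration only, not a helper introduced
# by this patch; the real function inlines the check per field.
from typing import Any, Dict, Optional

def _set_or_reset(params: Dict[str, Any], key: str,
                  value: Optional[str]) -> None:
    """Set params[key]; the literal string 'none' clears the override."""
    if value is None:
        return  # Flag not passed on the CLI: keep the YAML value.
    params[key] = None if value.lower() == 'none' else value

overrides: Dict[str, Any] = {}
_set_or_reset(overrides, 'region', 'none')      # 'none' resets the field
_set_or_reset(overrides, 'zone', 'us-east-1a')  # other values override
_set_or_reset(overrides, 'cloud', None)         # omitted flag: no entry
assert overrides == {'region': None, 'zone': 'us-east-1a'}
# ---------------------------------------------------------------------------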
if prompt is not None: confirm_shown = True click.confirm(prompt, default=True, abort=True, show_default=True) if not confirm_shown: - click.secho(f"Running task on cluster {cluster}...", fg="yellow") + click.secho(f'Running task on cluster {cluster}...', fg='yellow') sky.launch( dag, @@ -678,12 +626,12 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: config: Optional[List[Dict[str, Any]]] = None result = None shell_splits = shlex.split(entrypoint) - yaml_file_provided = len(shell_splits) == 1 and ( - shell_splits[0].endswith("yaml") or shell_splits[0].endswith(".yml") - ) - invalid_reason = "" + yaml_file_provided = (len(shell_splits) == 1 and + (shell_splits[0].endswith('yaml') or + shell_splits[0].endswith('.yml'))) + invalid_reason = '' try: - with open(entrypoint, "r", encoding="utf-8") as f: + with open(entrypoint, 'r', encoding='utf-8') as f: try: config = list(yaml.safe_load_all(f)) if config: @@ -698,43 +646,36 @@ def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]: except yaml.YAMLError as e: if yaml_file_provided: logger.debug(e) - detailed_error = f"\nYAML Error: {e}\n" - invalid_reason = ( - "contains an invalid configuration. " - "Please check syntax.\n" - f"{detailed_error}" - ) + detailed_error = f'\nYAML Error: {e}\n' + invalid_reason = ('contains an invalid configuration. ' + 'Please check syntax.\n' + f'{detailed_error}') is_yaml = False except OSError: if yaml_file_provided: entry_point_path = os.path.expanduser(entrypoint) if not os.path.exists(entry_point_path): - invalid_reason = ( - "does not exist. Please check if the path" " is correct." - ) + invalid_reason = ('does not exist. Please check if the path' + ' is correct.') elif not os.path.isfile(entry_point_path): - invalid_reason = ( - "is not a file. Please check if the path" " is correct." - ) + invalid_reason = ('is not a file. Please check if the path' + ' is correct.') else: - invalid_reason = ( - "yaml.safe_load() failed. Please check if the" " path is correct." - ) + invalid_reason = ('yaml.safe_load() failed. Please check if the' + ' path is correct.') is_yaml = False if not is_yaml: if yaml_file_provided: click.confirm( - f"{entrypoint!r} looks like a yaml path but {invalid_reason}\n" - "It will be treated as a command to be run remotely. Continue?", - abort=True, - ) + f'{entrypoint!r} looks like a yaml path but {invalid_reason}\n' + 'It will be treated as a command to be run remotely. Continue?', + abort=True) return is_yaml, result def _pop_and_ignore_fields_in_override_params( - params: Dict[str, Any], field_to_ignore: List[str] -) -> None: + params: Dict[str, Any], field_to_ignore: List[str]) -> None: """Pops and ignores fields in override params. Args: @@ -748,15 +689,14 @@ def _pop_and_ignore_fields_in_override_params( for field in field_to_ignore: field_value = params.pop(field, None) if field_value is not None: - click.secho( - f"Override param {field}={field_value} is ignored.", fg="yellow" - ) + click.secho(f'Override param {field}={field_value} is ignored.', + fg='yellow') def _make_task_or_dag_from_entrypoint_with_overrides( entrypoint: Tuple[str, ...], *, - entrypoint_name: str = "Task", + entrypoint_name: str = 'Task', name: Optional[str] = None, workdir: Optional[str] = None, cloud: Optional[str] = None, @@ -783,37 +723,40 @@ def _make_task_or_dag_from_entrypoint_with_overrides( A dag iff the entrypoint is YAML and contains more than 1 task. Otherwise, a task. 
""" - entrypoint = " ".join(entrypoint) + entrypoint = ' '.join(entrypoint) is_yaml, _ = _check_yaml(entrypoint) entrypoint: Optional[str] if is_yaml: # Treat entrypoint as a yaml. - click.secho(f"{entrypoint_name} from YAML spec: ", fg="yellow", nl=False) + click.secho(f'{entrypoint_name} from YAML spec: ', + fg='yellow', + nl=False) click.secho(entrypoint, bold=True) else: if not entrypoint: entrypoint = None else: # Treat entrypoint as a bash command. - click.secho(f"{entrypoint_name} from command: ", fg="yellow", nl=False) + click.secho(f'{entrypoint_name} from command: ', + fg='yellow', + nl=False) click.secho(entrypoint, bold=True) - override_params = _parse_override_params( - cloud=cloud, - region=region, - zone=zone, - gpus=gpus, - cpus=cpus, - memory=memory, - instance_type=instance_type, - use_spot=use_spot, - image_id=image_id, - disk_size=disk_size, - disk_tier=disk_tier, - ports=ports, - ) + override_params = _parse_override_params(cloud=cloud, + region=region, + zone=zone, + gpus=gpus, + cpus=cpus, + memory=memory, + instance_type=instance_type, + use_spot=use_spot, + image_id=image_id, + disk_size=disk_size, + disk_tier=disk_tier, + ports=ports) if field_to_ignore is not None: - _pop_and_ignore_fields_in_override_params(override_params, field_to_ignore) + _pop_and_ignore_fields_in_override_params(override_params, + field_to_ignore) if is_yaml: assert entrypoint is not None @@ -825,17 +768,15 @@ def _make_task_or_dag_from_entrypoint_with_overrides( # override params. if override_params: click.secho( - f"WARNING: override params {override_params} are ignored, " - "since the yaml file contains multiple tasks.", - fg="yellow", - ) + f'WARNING: override params {override_params} are ignored, ' + 'since the yaml file contains multiple tasks.', + fg='yellow') return dag - assert ( - len(dag.tasks) == 1 - ), f"If you see this, please file an issue; tasks: {dag.tasks}" + assert len(dag.tasks) == 1, ( + f'If you see this, please file an issue; tasks: {dag.tasks}') task = dag.tasks[0] else: - task = sky.Task(name="sky-cmd", run=entrypoint) + task = sky.Task(name='sky-cmd', run=entrypoint) task.set_resources({sky.Resources()}) # env update has been done for DAG in load_chain_dag_from_yaml for YAML. task.update_envs(env) @@ -846,7 +787,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides( # job launch specific. if job_recovery is not None: - override_params["job_recovery"] = job_recovery + override_params['job_recovery'] = job_recovery task.set_resources_override(override_params) @@ -866,7 +807,7 @@ class _NaturalOrderGroup(click.Group): def list_commands(self, ctx): return self.commands.keys() - @usage_lib.entrypoint("sky.cli", fallback=True) + @usage_lib.entrypoint('sky.cli', fallback=True) def invoke(self, ctx): return super().invoke(ctx) @@ -878,38 +819,36 @@ class _DocumentedCodeCommand(click.Command): def get_help(self, ctx): help_str = ctx.command.help - ctx.command.help = help_str.replace(".. code-block:: bash\n", "\b") + ctx.command.help = help_str.replace('.. 
code-block:: bash\n', '\b') return super().get_help(ctx) def _with_deprecation_warning( - f, - original_name: str, - alias_name: str, - override_command_argument: Optional[Dict[str, Any]] = None, -): + f, + original_name: str, + alias_name: str, + override_command_argument: Optional[Dict[str, Any]] = None): @functools.wraps(f) def wrapper(self, *args, **kwargs): - override_str = "" + override_str = '' if override_command_argument is not None: overrides = [] for k, v in override_command_argument.items(): if isinstance(v, bool): if v: - overrides.append(f"--{k}") + overrides.append(f'--{k}') else: - overrides.append(f"--no-{k}") + overrides.append(f'--no-{k}') else: overrides.append(f'--{k.replace("_", "-")}={v}') - override_str = " with additional arguments " + " ".join(overrides) + override_str = ' with additional arguments ' + ' '.join(overrides) click.secho( - f"WARNING: `{alias_name}` has been renamed to `{original_name}` " - f"and will be removed in a future release. Please use the " - f"latter{override_str} instead.\n", + f'WARNING: `{alias_name}` has been renamed to `{original_name}` ' + f'and will be removed in a future release. Please use the ' + f'latter{override_str} instead.\n', err=True, - fg="yellow", - ) + fg='yellow') return f(self, *args, **kwargs) return wrapper @@ -918,7 +857,7 @@ def wrapper(self, *args, **kwargs): def _override_arguments(callback, override_command_argument: Dict[str, Any]): def wrapper(*args, **kwargs): - logger.info(f"Overriding arguments: {override_command_argument}") + logger.info(f'Overriding arguments: {override_command_argument}') kwargs.update(override_command_argument) return callback(*args, **kwargs) @@ -940,194 +879,161 @@ def _add_command_alias( if new_command_name is None: new_command_name = command.name if new_group == group and new_command_name == command.name: - raise ValueError("Cannot add an alias to the same command.") + raise ValueError('Cannot add an alias to the same command.') new_command = copy.deepcopy(command) new_command.hidden = hidden new_command.name = new_command_name if override_command_argument: - new_command.callback = _override_arguments( - new_command.callback, override_command_argument - ) + new_command.callback = _override_arguments(new_command.callback, + override_command_argument) - orig = f"sky {group.name} {command.name}" - alias = f"sky {new_group.name} {new_command_name}" + orig = f'sky {group.name} {command.name}' + alias = f'sky {new_group.name} {new_command_name}' if with_warning: new_command.invoke = _with_deprecation_warning( new_command.invoke, orig, alias, - override_command_argument=override_command_argument, - ) + override_command_argument=override_command_argument) new_group.add_command(new_command, name=new_command_name) -def _deprecate_and_hide_command(group, command_to_deprecate, alternative_command): +def _deprecate_and_hide_command(group, command_to_deprecate, + alternative_command): """Hide a command and show a deprecation note, hinting the alternative.""" command_to_deprecate.hidden = True if group is not None: - orig = f"sky {group.name} {command_to_deprecate.name}" + orig = f'sky {group.name} {command_to_deprecate.name}' else: - orig = f"sky {command_to_deprecate.name}" + orig = f'sky {command_to_deprecate.name}' command_to_deprecate.invoke = _with_deprecation_warning( - command_to_deprecate.invoke, alternative_command, orig - ) + command_to_deprecate.invoke, alternative_command, orig) @click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS) -@click.option( - 
"--install-shell-completion", - type=click.Choice(["bash", "zsh", "fish", "auto"]), - callback=_install_shell_completion, - expose_value=False, - is_eager=True, - help="Install shell completion for the specified shell.", -) -@click.option( - "--uninstall-shell-completion", - type=click.Choice(["bash", "zsh", "fish", "auto"]), - callback=_uninstall_shell_completion, - expose_value=False, - is_eager=True, - help="Uninstall shell completion for the specified shell.", -) -@click.version_option(sky.__version__, "--version", "-v", prog_name="skypilot") -@click.version_option( - sky.__commit__, - "--commit", - "-c", - prog_name="skypilot", - message="%(prog)s, commit %(version)s", - help="Show the commit hash and exit", -) +@click.option('--install-shell-completion', + type=click.Choice(['bash', 'zsh', 'fish', 'auto']), + callback=_install_shell_completion, + expose_value=False, + is_eager=True, + help='Install shell completion for the specified shell.') +@click.option('--uninstall-shell-completion', + type=click.Choice(['bash', 'zsh', 'fish', 'auto']), + callback=_uninstall_shell_completion, + expose_value=False, + is_eager=True, + help='Uninstall shell completion for the specified shell.') +@click.version_option(sky.__version__, '--version', '-v', prog_name='skypilot') +@click.version_option(sky.__commit__, + '--commit', + '-c', + prog_name='skypilot', + message='%(prog)s, commit %(version)s', + help='Show the commit hash and exit') def cli(): pass @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "entrypoint", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) +@click.argument('entrypoint', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) +@click.option('--cluster', + '-c', + default=None, + type=str, + **_get_shell_complete_args(_complete_cluster_name), + help=_CLUSTER_FLAG_HELP) +@click.option('--dryrun', + default=False, + is_flag=True, + help='If True, do not actually run the job.') @click.option( - "--cluster", - "-c", - default=None, - type=str, - **_get_shell_complete_args(_complete_cluster_name), - help=_CLUSTER_FLAG_HELP, -) -@click.option( - "--dryrun", - default=False, - is_flag=True, - help="If True, do not actually run the job.", -) -@click.option( - "--detach-setup", - "-s", + '--detach-setup', + '-s', default=False, is_flag=True, - help=( - "If True, run setup in non-interactive mode as part of the job itself. " - "You can safely ctrl-c to detach from logging, and it will not interrupt " - "the setup process. To see the logs again after detaching, use `sky logs`." - " To cancel setup, cancel the job via `sky cancel`. Useful for long-" - "running setup commands." - ), -) + help= + ('If True, run setup in non-interactive mode as part of the job itself. ' + 'You can safely ctrl-c to detach from logging, and it will not interrupt ' + 'the setup process. To see the logs again after detaching, use `sky logs`.' + ' To cancel setup, cancel the job via `sky cancel`. Useful for long-' + 'running setup commands.')) @click.option( - "--detach-run", - "-d", + '--detach-run', + '-d', default=False, is_flag=True, - help=( - "If True, as soon as a job is submitted, return from this call " - "and do not stream execution logs." 
- ), -) -@click.option( - "--docker", - "backend_name", - flag_value=backends.LocalDockerBackend.NAME, - default=False, - help="If used, runs locally inside a docker container.", -) + help=('If True, as soon as a job is submitted, return from this call ' + 'and do not stream execution logs.')) +@click.option('--docker', + 'backend_name', + flag_value=backends.LocalDockerBackend.NAME, + default=False, + help='If used, runs locally inside a docker container.') @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) @click.option( - "--idle-minutes-to-autostop", - "-i", + '--idle-minutes-to-autostop', + '-i', default=None, type=int, required=False, - help=( - "Automatically stop the cluster after this many minutes " - "of idleness, i.e., no running or pending jobs in the cluster's job " - "queue. Idleness gets reset whenever setting-up/running/pending jobs " - "are found in the job queue. " - "Setting this flag is equivalent to " - "running ``sky launch -d ...`` and then ``sky autostop -i ``" - ". If not set, the cluster will not be autostopped." - ), -) + help=('Automatically stop the cluster after this many minutes ' + 'of idleness, i.e., no running or pending jobs in the cluster\'s job ' + 'queue. Idleness gets reset whenever setting-up/running/pending jobs ' + 'are found in the job queue. ' + 'Setting this flag is equivalent to ' + 'running ``sky launch -d ...`` and then ``sky autostop -i ``' + '. If not set, the cluster will not be autostopped.')) @click.option( - "--down", + '--down', default=False, is_flag=True, required=False, - help=( - "Autodown the cluster: tear down the cluster after all jobs finish " - "(successfully or abnormally). If --idle-minutes-to-autostop is also set, " - "the cluster will be torn down after the specified idle time. " - "Note that if errors occur during provisioning/data syncing/setting up, " - "the cluster will not be torn down for debugging purposes." - ), + help= + ('Autodown the cluster: tear down the cluster after all jobs finish ' + '(successfully or abnormally). If --idle-minutes-to-autostop is also set, ' + 'the cluster will be torn down after the specified idle time. ' + 'Note that if errors occur during provisioning/data syncing/setting up, ' + 'the cluster will not be torn down for debugging purposes.'), ) @click.option( - "--retry-until-up", - "-r", + '--retry-until-up', + '-r', default=False, is_flag=True, required=False, - help=( - "Whether to retry provisioning infinitely until the cluster is up, " - "if we fail to launch the cluster on any possible region/cloud due " - "to unavailability errors." - ), + help=('Whether to retry provisioning infinitely until the cluster is up, ' + 'if we fail to launch the cluster on any possible region/cloud due ' + 'to unavailability errors.'), ) @click.option( - "--yes", - "-y", + '--yes', + '-y', is_flag=True, default=False, required=False, # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. 
# pylint: disable=bad-docstring-quotes - help="Skip confirmation prompt.", -) -@click.option( - "--no-setup", - is_flag=True, - default=False, - required=False, - help="Skip setup phase when (re-)launching cluster.", -) + help='Skip confirmation prompt.') +@click.option('--no-setup', + is_flag=True, + default=False, + required=False, + help='Skip setup phase when (re-)launching cluster.') @click.option( - "--clone-disk-from", - "--clone", + '--clone-disk-from', + '--clone', default=None, type=str, **_get_shell_complete_args(_complete_cluster_name), - help=( - "[Experimental] Clone disk from an existing cluster to launch " - "a new one. This is useful when the new cluster needs to have " - "the same data on the boot disk as an existing cluster." - ), -) + help=('[Experimental] Clone disk from an existing cluster to launch ' + 'a new one. This is useful when the new cluster needs to have ' + 'the same data on the boot disk as an existing cluster.')) @usage_lib.entrypoint def launch( entrypoint: Tuple[str, ...], @@ -1171,8 +1077,7 @@ def launch( # NOTE(dev): Keep the docstring consistent between the Python API and CLI. env = _merge_env_vars(env_file, env) controller_utils.check_cluster_name_not_controller( - cluster, operation_str="Launching tasks on it" - ) + cluster, operation_str='Launching tasks on it') if backend_name is None: backend_name = backends.CloudVmRayBackend.NAME @@ -1196,7 +1101,8 @@ def launch( ports=ports, ) if isinstance(task_or_dag, sky.Dag): - raise click.UsageError(_DAG_NOT_SUPPORTED_MESSAGE.format(command="sky launch")) + raise click.UsageError( + _DAG_NOT_SUPPORTED_MESSAGE.format(command='sky launch')) task = task_or_dag backend: backends.Backend @@ -1206,66 +1112,55 @@ def launch( backend = backends.CloudVmRayBackend() else: with ux_utils.print_exception_no_traceback(): - raise ValueError(f"{backend_name} backend is not supported.") + raise ValueError(f'{backend_name} backend is not supported.') if task.service is not None: logger.info( - f"{colorama.Fore.YELLOW}Service section will be ignored when using " - f"`sky launch`. {colorama.Style.RESET_ALL}\n{colorama.Fore.YELLOW}" - "To spin up a service, use SkyServe CLI: " - f"{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up" - f"{colorama.Style.RESET_ALL}" - ) - - _launch_with_confirm( - task, - backend, - cluster, - dryrun=dryrun, - detach_setup=detach_setup, - detach_run=detach_run, - no_confirm=yes, - idle_minutes_to_autostop=idle_minutes_to_autostop, - down=down, - retry_until_up=retry_until_up, - no_setup=no_setup, - clone_disk_from=clone_disk_from, - ) + f'{colorama.Fore.YELLOW}Service section will be ignored when using ' + f'`sky launch`. 
{colorama.Style.RESET_ALL}\n{colorama.Fore.YELLOW}' + 'To spin up a service, use SkyServe CLI: ' + f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up' + f'{colorama.Style.RESET_ALL}') + + _launch_with_confirm(task, + backend, + cluster, + dryrun=dryrun, + detach_setup=detach_setup, + detach_run=detach_run, + no_confirm=yes, + idle_minutes_to_autostop=idle_minutes_to_autostop, + down=down, + retry_until_up=retry_until_up, + no_setup=no_setup, + clone_disk_from=clone_disk_from) @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "cluster", - required=False, - type=str, - **_get_shell_complete_args(_complete_cluster_name), -) +@click.argument('cluster', + required=False, + type=str, + **_get_shell_complete_args(_complete_cluster_name)) @click.option( - "--cluster", - "-c", - "cluster_option", + '--cluster', + '-c', + 'cluster_option', hidden=True, type=str, - help="This is the same as the positional argument, just for consistency.", - **_get_shell_complete_args(_complete_cluster_name), -) -@click.argument( - "entrypoint", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) + help='This is the same as the positional argument, just for consistency.', + **_get_shell_complete_args(_complete_cluster_name)) +@click.argument('entrypoint', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) @click.option( - "--detach-run", - "-d", + '--detach-run', + '-d', default=False, is_flag=True, - help=( - "If True, as soon as a job is submitted, return from this call " - "and do not stream execution logs." - ), -) + help=('If True, as soon as a job is submitted, return from this call ' + 'and do not stream execution logs.')) @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) @usage_lib.entrypoint # pylint: disable=redefined-builtin @@ -1351,24 +1246,23 @@ def exec( """ if cluster_option is None and cluster is None: - raise click.UsageError("Missing argument '[CLUSTER]' and " "'[ENTRYPOINT]...'") + raise click.UsageError('Missing argument \'[CLUSTER]\' and ' + '\'[ENTRYPOINT]...\'') if cluster_option is not None: if cluster is not None: entrypoint = (cluster,) + entrypoint cluster = cluster_option if not entrypoint: - raise click.UsageError("Missing argument '[ENTRYPOINT]...'") + raise click.UsageError('Missing argument \'[ENTRYPOINT]...\'') assert cluster is not None, (cluster, cluster_option, entrypoint) env = _merge_env_vars(env_file, env) controller_utils.check_cluster_name_not_controller( - cluster, operation_str="Executing task on it" - ) + cluster, operation_str='Executing task on it') handle = global_user_state.get_handle_from_cluster_name(cluster) if handle is None: - raise click.BadParameter( - f"Cluster {cluster!r} not found. " "Use `sky launch` to provision first." - ) + raise click.BadParameter(f'Cluster {cluster!r} not found. ' + 'Use `sky launch` to provision first.') backend = backend_utils.get_backend_from_handle(handle) task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides( @@ -1389,26 +1283,24 @@ def exec( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - field_to_ignore=["cpus", "memory", "disk_size", "disk_tier", "ports"], + field_to_ignore=['cpus', 'memory', 'disk_size', 'disk_tier', 'ports'], ) if isinstance(task_or_dag, sky.Dag): - raise click.UsageError( - "YAML specifies a DAG, while `sky exec` " "supports a single task only." 
- ) + raise click.UsageError('YAML specifies a DAG, while `sky exec` ' + 'supports a single task only.') task = task_or_dag - click.secho(f"Executing task on cluster {cluster}...", fg="yellow") + click.secho(f'Executing task on cluster {cluster}...', fg='yellow') sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run) def _get_managed_jobs( - refresh: bool, - skip_finished: bool, - show_all: bool, - limit_num_jobs_to_show: bool = False, - is_called_by_user: bool = False, -) -> Tuple[Optional[int], str]: + refresh: bool, + skip_finished: bool, + show_all: bool, + limit_num_jobs_to_show: bool = False, + is_called_by_user: bool = False) -> Tuple[Optional[int], str]: """Get the in-progress managed jobs. Args: @@ -1434,35 +1326,30 @@ def _get_managed_jobs( usage_lib.messages.usage.set_internal() with sky_logging.silent(): # Make the call silent - managed_jobs_ = managed_jobs.queue( - refresh=refresh, skip_finished=skip_finished - ) - num_in_progress_jobs = len(set(job["job_id"] for job in managed_jobs_)) + managed_jobs_ = managed_jobs.queue(refresh=refresh, + skip_finished=skip_finished) + num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_)) except exceptions.ClusterNotUpError as e: controller_status = e.cluster_status msg = str(e) if controller_status is None: - msg += ( - f" (See: {colorama.Style.BRIGHT}sky jobs -h" - f"{colorama.Style.RESET_ALL})" - ) - elif ( - controller_status == status_lib.ClusterStatus.STOPPED and is_called_by_user - ): - msg += ( - f" (See finished managed jobs: {colorama.Style.BRIGHT}" - f"sky jobs queue --refresh{colorama.Style.RESET_ALL})" - ) + msg += (f' (See: {colorama.Style.BRIGHT}sky jobs -h' + f'{colorama.Style.RESET_ALL})') + elif (controller_status == status_lib.ClusterStatus.STOPPED and + is_called_by_user): + msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}' + f'sky jobs queue --refresh{colorama.Style.RESET_ALL})') except RuntimeError as e: - msg = "" + msg = '' try: # Check the controller status again, as the RuntimeError is likely # due to the controller being autostopped when querying the jobs. controller_type = controller_utils.Controllers.JOBS_CONTROLLER record = backend_utils.refresh_cluster_record( - controller_type.value.cluster_name, cluster_status_lock_timeout=0 - ) - if record is None or record["status"] == status_lib.ClusterStatus.STOPPED: + controller_type.value.cluster_name, + cluster_status_lock_timeout=0) + if (record is None or + record['status'] == status_lib.ClusterStatus.STOPPED): msg = controller_type.value.default_hint_if_non_existent except Exception: # pylint: disable=broad-except # This is to an best effort to find the latest controller status to @@ -1471,31 +1358,26 @@ def _get_managed_jobs( pass if not msg: msg = ( - "Failed to query managed jobs due to connection " - "issues. Try again later. " - f"Details: {common_utils.format_exception(e, use_bracket=True)}" + 'Failed to query managed jobs due to connection ' + 'issues. Try again later. 
' + f'Details: {common_utils.format_exception(e, use_bracket=True)}' ) except Exception as e: # pylint: disable=broad-except - msg = ( - "Failed to query managed jobs: " - f"{common_utils.format_exception(e, use_bracket=True)}" - ) + msg = ('Failed to query managed jobs: ' + f'{common_utils.format_exception(e, use_bracket=True)}') else: - max_jobs_to_show = ( - _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS if limit_num_jobs_to_show else None - ) - msg = managed_jobs.format_job_table( - managed_jobs_, show_all=show_all, max_jobs=max_jobs_to_show - ) + max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS + if limit_num_jobs_to_show else None) + msg = managed_jobs.format_job_table(managed_jobs_, + show_all=show_all, + max_jobs=max_jobs_to_show) return num_in_progress_jobs, msg -def _get_services( - service_names: Optional[List[str]], - show_all: bool, - show_endpoint: bool, - is_called_by_user: bool = False, -) -> Tuple[Optional[int], str]: +def _get_services(service_names: Optional[List[str]], + show_all: bool, + show_endpoint: bool, + is_called_by_user: bool = False) -> Tuple[Optional[int], str]: """Get service statuses. Args: @@ -1524,21 +1406,20 @@ def _get_services( controller_status = e.cluster_status msg = str(e) if controller_status is None: - msg += ( - f" (See: {colorama.Style.BRIGHT}sky serve -h" - f"{colorama.Style.RESET_ALL})" - ) + msg += (f' (See: {colorama.Style.BRIGHT}sky serve -h' + f'{colorama.Style.RESET_ALL})') except RuntimeError as e: - msg = "" + msg = '' try: # Check the controller status again, as the RuntimeError is likely # due to the controller being autostopped when querying the # services. controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER record = backend_utils.refresh_cluster_record( - controller_type.value.cluster_name, cluster_status_lock_timeout=0 - ) - if record is None or record["status"] == status_lib.ClusterStatus.STOPPED: + controller_type.value.cluster_name, + cluster_status_lock_timeout=0) + if (record is None or + record['status'] == status_lib.ClusterStatus.STOPPED): msg = controller_type.value.default_hint_if_non_existent except Exception: # pylint: disable=broad-except # This is to an best effort to find the latest controller status to @@ -1546,128 +1427,92 @@ def _get_services( # print the original error. pass if not msg: - msg = ( - "Failed to fetch service statuses due to connection issues. " - "Please try again later. Details: " - f"{common_utils.format_exception(e, use_bracket=True)}" - ) + msg = ('Failed to fetch service statuses due to connection issues. ' + 'Please try again later. Details: ' + f'{common_utils.format_exception(e, use_bracket=True)}') except Exception as e: # pylint: disable=broad-except - msg = ( - "Failed to fetch service statuses: " - f"{common_utils.format_exception(e, use_bracket=True)}" - ) + msg = ('Failed to fetch service statuses: ' + f'{common_utils.format_exception(e, use_bracket=True)}') else: if show_endpoint: if len(service_records) != 1: - plural = "s" if len(service_records) > 1 else "" - service_num = ( - str(len(service_records)) if len(service_records) > 0 else "No" - ) + plural = 's' if len(service_records) > 1 else '' + service_num = (str(len(service_records)) + if len(service_records) > 0 else 'No') raise click.UsageError( - f"{service_num} service{plural} found. Please specify " - "an existing service to show its endpoint. Usage: " - "sky serve status --endpoint " - ) + f'{service_num} service{plural} found. Please specify ' + 'an existing service to show its endpoint. 
Usage: ' + 'sky serve status --endpoint ') msg = serve_lib.get_endpoint(service_records[0]) else: msg = serve_lib.format_service_table(service_records, show_all) - service_not_found_msg = "" + service_not_found_msg = '' if service_names is not None: for service_name in service_names: - if not any( - service_name == record["name"] for record in service_records - ): + if not any(service_name == record['name'] + for record in service_records): service_not_found_msg += ( - f"\nService {service_name!r} not found." - ) + f'\nService {service_name!r} not found.') if service_not_found_msg: - msg += f"\n{service_not_found_msg}" + msg += f'\n{service_not_found_msg}' return num_services, msg @cli.command() +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') @click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) -@click.option( - "--refresh", - "-r", - default=False, - is_flag=True, - required=False, - help="Query the latest cluster statuses from the cloud provider(s).", -) -@click.option( - "--ip", - default=False, - is_flag=True, - required=False, - help=( - "Get the IP address of the head node of a cluster. This " - "option will override all other options. For Kubernetes " - "clusters, the returned IP address is the internal IP " - "of the head pod, and may not be accessible from outside " - "the cluster." - ), -) -@click.option( - "--endpoints", + '--refresh', + '-r', default=False, is_flag=True, required=False, - help=( - "Get all exposed endpoints and corresponding URLs for a" - "cluster. This option will override all other options." - ), -) -@click.option( - "--endpoint", - required=False, - default=None, - type=int, - help=( - "Get the endpoint URL for the specified port number on the " - "cluster. This option will override all other options." - ), -) -@click.option( - "--show-managed-jobs/--no-show-managed-jobs", - default=True, - is_flag=True, - required=False, - help="Also show recent in-progress managed jobs, if any.", -) -@click.option( - "--show-services/--no-show-services", - default=True, - is_flag=True, - required=False, - help="Also show sky serve services, if any.", -) -@click.argument( - "clusters", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_cluster_name), -) + help='Query the latest cluster statuses from the cloud provider(s).') +@click.option('--ip', + default=False, + is_flag=True, + required=False, + help=('Get the IP address of the head node of a cluster. This ' + 'option will override all other options. For Kubernetes ' + 'clusters, the returned IP address is the internal IP ' + 'of the head pod, and may not be accessible from outside ' + 'the cluster.')) +@click.option('--endpoints', + default=False, + is_flag=True, + required=False, + help=('Get all exposed endpoints and corresponding URLs for a' + 'cluster. This option will override all other options.')) +@click.option('--endpoint', + required=False, + default=None, + type=int, + help=('Get the endpoint URL for the specified port number on the ' + 'cluster. 
This option will override all other options.')) +@click.option('--show-managed-jobs/--no-show-managed-jobs', + default=True, + is_flag=True, + required=False, + help='Also show recent in-progress managed jobs, if any.') +@click.option('--show-services/--no-show-services', + default=True, + is_flag=True, + required=False, + help='Also show sky serve services, if any.') +@click.argument('clusters', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_cluster_name)) @usage_lib.entrypoint # pylint: disable=redefined-builtin -def status( - all: bool, - refresh: bool, - ip: bool, - endpoints: bool, - endpoint: Optional[int], - show_managed_jobs: bool, - show_services: bool, - clusters: List[str], -): +def status(all: bool, refresh: bool, ip: bool, endpoints: bool, + endpoint: Optional[int], show_managed_jobs: bool, + show_services: bool, clusters: List[str]): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Show clusters. @@ -1732,159 +1577,127 @@ def status( with multiprocessing.Pool(2) as pool: # Do not show job queue if user specifies clusters, and if user # specifies --ip or --endpoint(s). - show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints]) + show_managed_jobs = show_managed_jobs and not any( + [clusters, ip, endpoints]) show_endpoints = endpoints or endpoint is not None show_single_endpoint = endpoint is not None if show_managed_jobs: # Run managed job query in parallel to speed up the status query. managed_jobs_future = pool.apply_async( _get_managed_jobs, - kwds=dict( - refresh=False, - skip_finished=True, - show_all=False, - limit_num_jobs_to_show=not all, - is_called_by_user=False, - ), - ) + kwds=dict(refresh=False, + skip_finished=True, + show_all=False, + limit_num_jobs_to_show=not all, + is_called_by_user=False)) show_services = show_services and not clusters and not ip if show_services: # Run the sky serve service query in parallel to speed up the # status query. - services_future = pool.apply_async( - _get_services, - kwds=dict( - service_names=None, - show_all=False, - show_endpoint=False, - is_called_by_user=False, - ), - ) + services_future = pool.apply_async(_get_services, + kwds=dict( + service_names=None, + show_all=False, + show_endpoint=False, + is_called_by_user=False)) if ip or show_endpoints: if refresh: raise click.UsageError( - "Using --ip or --endpoint(s) with --refresh is not" - "supported for now. To fix, refresh first, " - "then query the IP or endpoint." - ) + 'Using --ip or --endpoint(s) with --refresh is not' + 'supported for now. To fix, refresh first, ' + 'then query the IP or endpoint.') if ip and show_endpoints: with ux_utils.print_exception_no_traceback(): raise ValueError( - "Cannot specify both --ip and --endpoint(s) " - "at the same time." - ) + 'Cannot specify both --ip and --endpoint(s) ' + 'at the same time.') if endpoint is not None and endpoints: with ux_utils.print_exception_no_traceback(): raise ValueError( - "Cannot specify both --endpoint and --endpoints " - "at the same time." 
- ) + 'Cannot specify both --endpoint and --endpoints ' + 'at the same time.') if len(clusters) != 1: with ux_utils.print_exception_no_traceback(): - plural = "s" if len(clusters) > 1 else "" - cluster_num = str(len(clusters)) if len(clusters) > 0 else "No" - cause = "a single" if len(clusters) > 1 else "an existing" + plural = 's' if len(clusters) > 1 else '' + cluster_num = (str(len(clusters)) + if len(clusters) > 0 else 'No') + cause = 'a single' if len(clusters) > 1 else 'an existing' raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( cluster_num=cluster_num, plural=plural, - verb="specified", + verb='specified', cause=cause, - property="IP address" if ip else "endpoint(s)", - flag=( - "ip" - if ip - else ( - "endpoint port" - if show_single_endpoint - else "endpoints" - ) - ), - ) - ) + property='IP address' if ip else 'endpoint(s)', + flag='ip' if ip else + ('endpoint port' + if show_single_endpoint else 'endpoints'))) else: - click.echo( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters" - f"{colorama.Style.RESET_ALL}" - ) + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters' + f'{colorama.Style.RESET_ALL}') query_clusters: Optional[List[str]] = None if clusters: query_clusters = _get_glob_clusters(clusters, silent=ip) - cluster_records = core.status(cluster_names=query_clusters, refresh=refresh) + cluster_records = core.status(cluster_names=query_clusters, + refresh=refresh) if ip or show_endpoints: if len(cluster_records) != 1: with ux_utils.print_exception_no_traceback(): - plural = "s" if len(cluster_records) > 1 else "" - cluster_num = ( - str(len(cluster_records)) - if len(cluster_records) > 0 - else f"{clusters[0]!r}" - ) - verb = "found" if len(cluster_records) > 0 else "not found" - cause = "a single" if len(clusters) > 1 else "an existing" + plural = 's' if len(cluster_records) > 1 else '' + cluster_num = (str(len(cluster_records)) + if len(cluster_records) > 0 else + f'{clusters[0]!r}') + verb = 'found' if len(cluster_records) > 0 else 'not found' + cause = 'a single' if len(clusters) > 1 else 'an existing' raise ValueError( _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format( cluster_num=cluster_num, plural=plural, verb=verb, cause=cause, - property="IP address" if ip else "endpoint(s)", - flag=( - "ip" - if ip - else ( - "endpoint port" - if show_single_endpoint - else "endpoints" - ) - ), - ) - ) + property='IP address' if ip else 'endpoint(s)', + flag='ip' if ip else + ('endpoint port' + if show_single_endpoint else 'endpoints'))) cluster_record = cluster_records[0] - if cluster_record["status"] != status_lib.ClusterStatus.UP: + if cluster_record['status'] != status_lib.ClusterStatus.UP: with ux_utils.print_exception_no_traceback(): - raise RuntimeError( - f'Cluster {cluster_record["name"]!r} ' "is not in UP status." - ) - handle = cluster_record["handle"] + raise RuntimeError(f'Cluster {cluster_record["name"]!r} ' + 'is not in UP status.') + handle = cluster_record['handle'] if not isinstance(handle, backends.CloudVmRayResourceHandle): with ux_utils.print_exception_no_traceback(): - raise ValueError( - "Querying IP address is not supported " "for local clusters." 
- ) + raise ValueError('Querying IP address is not supported ' + 'for local clusters.') head_ip = handle.external_ips()[0] if show_endpoints: if endpoint: - cluster_endpoint = core.endpoints( - cluster_record["name"], endpoint - ).get(endpoint, None) + cluster_endpoint = core.endpoints(cluster_record['name'], + endpoint).get( + endpoint, None) if not cluster_endpoint: raise click.Abort( - f"Endpoint {endpoint} not found for cluster " - f'{cluster_record["name"]!r}.' - ) + f'Endpoint {endpoint} not found for cluster ' + f'{cluster_record["name"]!r}.') click.echo(cluster_endpoint) else: - cluster_endpoints = core.endpoints(cluster_record["name"]) + cluster_endpoints = core.endpoints(cluster_record['name']) assert isinstance(cluster_endpoints, dict) if not cluster_endpoints: - raise click.Abort( - f"No endpoint found for cluster " - f'{cluster_record["name"]!r}.' - ) + raise click.Abort(f'No endpoint found for cluster ' + f'{cluster_record["name"]!r}.') for port, port_endpoint in cluster_endpoints.items(): click.echo( - f"{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}" - f"{colorama.Style.RESET_ALL}: " - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"{port_endpoint}{colorama.Style.RESET_ALL}" - ) + f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}' + f'{colorama.Style.RESET_ALL}: ' + f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'{port_endpoint}{colorama.Style.RESET_ALL}') return click.echo(head_ip) return @@ -1892,7 +1705,7 @@ def status( normal_clusters = [] controllers = [] for cluster_record in cluster_records: - cluster_name = cluster_record["name"] + cluster_name = cluster_record['name'] controller = controller_utils.Controllers.from_name(cluster_name) if controller is not None: controllers.append(cluster_record) @@ -1901,8 +1714,7 @@ def status( num_pending_autostop = 0 num_pending_autostop += status_utils.show_status_table( - normal_clusters + controllers, all - ) + normal_clusters + controllers, all) def _try_get_future_result(future) -> Tuple[bool, Any]: result = None @@ -1916,69 +1728,61 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: managed_jobs_query_interrupted = False if show_managed_jobs: - click.echo( - f"\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Managed jobs{colorama.Style.RESET_ALL}" - ) - with rich_utils.safe_status("[cyan]Checking managed jobs[/]"): + click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Managed jobs{colorama.Style.RESET_ALL}') + with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): managed_jobs_query_interrupted, result = _try_get_future_result( - managed_jobs_future - ) + managed_jobs_future) if managed_jobs_query_interrupted: # Set to -1, so that the controller is not considered # down, and the hint for showing sky jobs queue # will still be shown. num_in_progress_jobs = -1 - msg = "KeyboardInterrupt" + msg = 'KeyboardInterrupt' else: num_in_progress_jobs, msg = result click.echo(msg) if num_in_progress_jobs is not None: # jobs controller is UP. 
- job_info = "" + job_info = '' if num_in_progress_jobs > 0: - plural_and_verb = " is" + plural_and_verb = ' is' if num_in_progress_jobs > 1: - plural_and_verb = "s are" + plural_and_verb = 's are' job_info = ( - f"{num_in_progress_jobs} managed job{plural_and_verb} " - "in progress" - ) - if num_in_progress_jobs > _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS: + f'{num_in_progress_jobs} managed job{plural_and_verb} ' + 'in progress') + if (num_in_progress_jobs > + _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS): job_info += ( - f" ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest " - "ones shown)" - ) - job_info += ". " + f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest ' + 'ones shown)') + job_info += '. ' hints.append( - controller_utils.Controllers.JOBS_CONTROLLER.value.in_progress_hint.format( - job_info=job_info - ) - ) + controller_utils.Controllers.JOBS_CONTROLLER.value. + in_progress_hint.format(job_info=job_info)) if show_services: - click.echo( - f"\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Services{colorama.Style.RESET_ALL}" - ) + click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Services{colorama.Style.RESET_ALL}') num_services = None if managed_jobs_query_interrupted: # The pool is terminated, so we cannot run the service query. - msg = "KeyboardInterrupt" + msg = 'KeyboardInterrupt' else: - with rich_utils.safe_status("[cyan]Checking services[/]"): - interrupted, result = _try_get_future_result(services_future) + with rich_utils.safe_status('[cyan]Checking services[/]'): + interrupted, result = _try_get_future_result( + services_future) if interrupted: num_services = -1 - msg = "KeyboardInterrupt" + msg = 'KeyboardInterrupt' else: num_services, msg = result click.echo(msg) if num_services is not None: - hints.append( - controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.in_progress_hint - ) + hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER. + value.in_progress_hint) if show_managed_jobs or show_services: try: @@ -1995,28 +1799,24 @@ def _try_get_future_result(future) -> Tuple[bool, Any]: if num_pending_autostop > 0 and not refresh: # Don't print this hint if there's no pending autostop or user has # already passed --refresh. - plural_and_verb = " has" + plural_and_verb = ' has' if num_pending_autostop > 1: - plural_and_verb = "s have" - hints.append( - f"* {num_pending_autostop} cluster{plural_and_verb} " - "auto{stop,down} scheduled. Refresh statuses with: " - f"{colorama.Style.BRIGHT}sky status --refresh" - f"{colorama.Style.RESET_ALL}" - ) + plural_and_verb = 's have' + hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} ' + 'auto{stop,down} scheduled. Refresh statuses with: ' + f'{colorama.Style.BRIGHT}sky status --refresh' + f'{colorama.Style.RESET_ALL}') if hints: - click.echo("\n" + "\n".join(hints)) + click.echo('\n' + '\n'.join(hints)) @cli.command() -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') @usage_lib.entrypoint def cost_report(all: bool): # pylint: disable=redefined-builtin # NOTE(dev): Keep the docstring consistent between the Python API and CLI. 
@@ -2042,7 +1842,7 @@ def cost_report(all: bool):  # pylint: disable=redefined-builtin
     normal_cluster_records = []
     controllers = dict()
     for cluster_record in cluster_records:
-        cluster_name = cluster_record["name"]
+        cluster_name = cluster_record['name']
         controller = controller_utils.Controllers.from_name(cluster_name)
         if controller is not None:
             controller_name = controller.value.name
@@ -2054,139 +1854,111 @@ def cost_report(all: bool):  # pylint: disable=redefined-builtin
         normal_cluster_records.append(cluster_record)
 
     total_cost = status_utils.get_total_cost_of_displayed_records(
-        normal_cluster_records, all
-    )
+        normal_cluster_records, all)
 
     status_utils.show_cost_report_table(normal_cluster_records, all)
 
     for controller_name, cluster_record in controllers.items():
         status_utils.show_cost_report_table(
-            [cluster_record], all, controller_name=controller_name.capitalize()
-        )
-        total_cost += cluster_record["total_cost"]
+            [cluster_record], all, controller_name=controller_name.capitalize())
+        total_cost += cluster_record['total_cost']
 
-    click.echo(
-        f"\n{colorama.Style.BRIGHT}"
-        f"Total Cost: ${total_cost:.2f}{colorama.Style.RESET_ALL}"
-    )
+    click.echo(f'\n{colorama.Style.BRIGHT}'
+               f'Total Cost: ${total_cost:.2f}{colorama.Style.RESET_ALL}')
 
     if not all:
         click.secho(
-            f"Showing up to {status_utils.NUM_COST_REPORT_LINES} "
-            "most recent clusters. "
-            "To see all clusters in history, "
-            "pass the --all flag.",
-            fg="yellow",
-        )
+            f'Showing up to {status_utils.NUM_COST_REPORT_LINES} '
+            'most recent clusters. '
+            'To see all clusters in history, '
+            'pass the --all flag.',
+            fg='yellow')
         click.secho(
-            "This feature is experimental. "
-            "Costs for clusters with auto{stop,down} "
-            "scheduled may not be accurate.",
-            fg="yellow",
-        )
+            'This feature is experimental. '
+            'Costs for clusters with auto{stop,down} '
+            'scheduled may not be accurate.',
+            fg='yellow')
 
 
 @cli.command()
-@click.option(
-    "--all-users",
-    "-a",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Show all users' information in full.",
-)
-@click.option(
-    "--skip-finished",
-    "-s",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Show only pending/running jobs' information.",
-)
-@click.argument(
-    "clusters",
-    required=False,
-    type=str,
-    nargs=-1,
-    **_get_shell_complete_args(_complete_cluster_name),
-)
+@click.option('--all-users',
+              '-a',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Show all users\' information in full.')
+@click.option('--skip-finished',
+              '-s',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Show only pending/running jobs\' information.')
+@click.argument('clusters',
+                required=False,
+                type=str,
+                nargs=-1,
+                **_get_shell_complete_args(_complete_cluster_name))
 @usage_lib.entrypoint
 def queue(clusters: List[str], skip_finished: bool, all_users: bool):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
"""Show the job queue for cluster(s).""" - click.secho("Fetching and parsing job queue...", fg="yellow") + click.secho('Fetching and parsing job queue...', fg='yellow') if clusters: clusters = _get_glob_clusters(clusters) else: cluster_infos = global_user_state.get_clusters() - clusters = [c["name"] for c in cluster_infos] + clusters = [c['name'] for c in cluster_infos] unsupported_clusters = [] for cluster in clusters: try: job_table = core.queue(cluster, skip_finished, all_users) - except ( - exceptions.CommandError, - ValueError, - exceptions.NotSupportedError, - exceptions.ClusterNotUpError, - exceptions.CloudUserIdentityError, - exceptions.ClusterOwnerIdentityMismatchError, - ) as e: + except (exceptions.CommandError, ValueError, + exceptions.NotSupportedError, exceptions.ClusterNotUpError, + exceptions.CloudUserIdentityError, + exceptions.ClusterOwnerIdentityMismatchError) as e: if isinstance(e, exceptions.NotSupportedError): unsupported_clusters.append(cluster) - click.echo( - f"{colorama.Fore.YELLOW}Failed to get the job queue for " - f"cluster {cluster!r}.{colorama.Style.RESET_ALL}\n" - f" {common_utils.format_exception(e)}" - ) + click.echo(f'{colorama.Fore.YELLOW}Failed to get the job queue for ' + f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n' + f' {common_utils.format_exception(e)}') continue job_table = job_lib.format_job_queue(job_table) - click.echo(f"\nJob queue of cluster {cluster}\n{job_table}") + click.echo(f'\nJob queue of cluster {cluster}\n{job_table}') if unsupported_clusters: click.secho( - f"Note: Job queues are not supported on clusters: " + f'Note: Job queues are not supported on clusters: ' f'{", ".join(unsupported_clusters)}', - fg="yellow", - ) + fg='yellow') @cli.command() @click.option( - "--sync-down", - "-s", + '--sync-down', + '-s', is_flag=True, default=False, - help="Sync down the logs of a job to the local machine. For a distributed" - " job, a separate log file from each worker will be downloaded.", -) + help='Sync down the logs of a job to the local machine. For a distributed' + ' job, a separate log file from each worker will be downloaded.') @click.option( - "--status", + '--status', is_flag=True, default=False, - help=( - "If specified, do not show logs but exit with a status code for the " - "job's status: 0 for succeeded, or 1 for all other statuses." - ), -) + help=('If specified, do not show logs but exit with a status code for the ' + 'job\'s status: 0 for succeeded, or 1 for all other statuses.')) @click.option( - "--follow/--no-follow", + '--follow/--no-follow', is_flag=True, default=True, - help=( - "Follow the logs of a job. " - "If --no-follow is specified, print the log so far and exit. " - "[default: --follow]" - ), -) -@click.argument( - "cluster", - required=True, - type=str, - **_get_shell_complete_args(_complete_cluster_name), -) -@click.argument("job_ids", type=str, nargs=-1) + help=('Follow the logs of a job. ' + 'If --no-follow is specified, print the log so far and exit. ' + '[default: --follow]')) +@click.argument('cluster', + required=True, + type=str, + **_get_shell_complete_args(_complete_cluster_name)) +@click.argument('job_ids', type=str, nargs=-1) # TODO(zhwu): support logs by job name @usage_lib.entrypoint def logs( @@ -2214,15 +1986,13 @@ def logs( """ if sync_down and status: raise click.UsageError( - "Both --sync_down and --status are specified " - "(ambiguous). To fix: specify at most one of them." - ) + 'Both --sync_down and --status are specified ' + '(ambiguous). 
To fix: specify at most one of them.')
 
     if len(job_ids) > 1 and not sync_down:
         raise click.UsageError(
             f'Cannot stream logs of multiple jobs (IDs: {", ".join(job_ids)}).'
-            "\nPass -s/--sync-down to download the logs instead."
-        )
+            '\nPass -s/--sync-down to download the logs instead.')
 
     job_ids = None if not job_ids else job_ids
@@ -2238,9 +2008,8 @@ def logs(
         # in core.tail_logs.
         job_id = job_ids[0]
         if not job_id.isdigit():
-            raise click.UsageError(
-                f"Invalid job ID {job_id}. " "Job ID must be integers."
-            )
+            raise click.UsageError(f'Invalid job ID {job_id}. '
+                                   'Job ID must be an integer.')
         job_ids_to_query = [int(job_id)]
     else:
         # job_ids is either None or empty list, so it is safe to cast it here.
@@ -2251,50 +2020,42 @@ def logs(
         # If job_ids is None and no job has been submitted to the cluster,
         # it will return {None: None}.
         if job_id is None:
-            click.secho(f"No job found on cluster {cluster!r}.", fg="red")
+            click.secho(f'No job found on cluster {cluster!r}.', fg='red')
             sys.exit(1)
         job_status = list(job_statuses.values())[0]
-        job_status_str = job_status.value if job_status is not None else "None"
-        click.echo(f"Job {job_id}: {job_status_str}")
+        job_status_str = job_status.value if job_status is not None else 'None'
+        click.echo(f'Job {job_id}: {job_status_str}')
         if job_status == job_lib.JobStatus.SUCCEEDED:
             return
         else:
             if job_status is None:
-                id_str = "" if job_id is None else f"{job_id} "
-                click.secho(f"Job {id_str}not found", fg="red")
+                id_str = '' if job_id is None else f'{job_id} '
+                click.secho(f'Job {id_str}not found', fg='red')
             sys.exit(1)
 
     core.tail_logs(cluster, job_id, follow)
 
 
 @cli.command()
-@click.argument(
-    "cluster",
-    required=True,
-    type=str,
-    **_get_shell_complete_args(_complete_cluster_name),
-)
-@click.option(
-    "--all",
-    "-a",
-    default=False,
-    is_flag=True,
-    required=False,
-    help="Cancel all jobs on the specified cluster.",
-)
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
-@click.argument("jobs", required=False, type=int, nargs=-1)
+@click.argument('cluster',
+                required=True,
+                type=str,
+                **_get_shell_complete_args(_complete_cluster_name))
+@click.option('--all',
+              '-a',
+              default=False,
+              is_flag=True,
+              required=False,
+              help='Cancel all jobs on the specified cluster.')
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
+@click.argument('jobs', required=False, type=int, nargs=-1)
 @usage_lib.entrypoint
-def cancel(
-    cluster: str, all: bool, jobs: List[int], yes: bool
-):  # pylint: disable=redefined-builtin, redefined-outer-name
+def cancel(cluster: str, all: bool, jobs: List[int], yes: bool):  # pylint: disable=redefined-builtin, redefined-outer-name
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Cancel job(s).
@@ -2318,30 +2079,26 @@ def cancel(
     job_identity_str = None
     job_ids_to_cancel = None
     if not jobs and not all:
-        click.echo(
-            f"{colorama.Fore.YELLOW}No job IDs or --all provided; "
-            "cancelling the latest running job."
-            f"{colorama.Style.RESET_ALL}"
-        )
-        job_identity_str = "the latest running job"
+        click.echo(f'{colorama.Fore.YELLOW}No job IDs or --all provided; '
+                   'cancelling the latest running job.'
+                   f'{colorama.Style.RESET_ALL}')
+        job_identity_str = 'the latest running job'
     else:
         # Cancelling specific jobs or --all.
- job_ids = " ".join(map(str, jobs)) - plural = "s" if len(job_ids) > 1 else "" - job_identity_str = f"job{plural} {job_ids}" + job_ids = ' '.join(map(str, jobs)) + plural = 's' if len(job_ids) > 1 else '' + job_identity_str = f'job{plural} {job_ids}' job_ids_to_cancel = jobs if all: - job_identity_str = "all jobs" + job_identity_str = 'all jobs' job_ids_to_cancel = None - job_identity_str += f" on cluster {cluster!r}" + job_identity_str += f' on cluster {cluster!r}' if not yes: - click.confirm( - f"Cancelling {job_identity_str}. Proceed?", - default=True, - abort=True, - show_default=True, - ) + click.confirm(f'Cancelling {job_identity_str}. Proceed?', + default=True, + abort=True, + show_default=True) try: core.cancel(cluster, all=all, job_ids=job_ids_to_cancel) @@ -2358,23 +2115,21 @@ def cancel( @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "clusters", - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name), -) -@click.option( - "--all", "-a", default=None, is_flag=True, help="Stop all existing clusters." -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.argument('clusters', + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name)) +@click.option('--all', + '-a', + default=None, + is_flag=True, + help='Stop all existing clusters.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint def stop( clusters: List[str], @@ -2410,58 +2165,49 @@ def stop( sky stop -a """ - _down_or_stop_clusters(clusters, apply_to_all=all, down=False, no_confirm=yes) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=False, + no_confirm=yes) @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "clusters", - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name), -) +@click.argument('clusters', + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name)) +@click.option('--all', + '-a', + default=None, + is_flag=True, + help='Apply this command to all existing clusters.') +@click.option('--idle-minutes', + '-i', + type=int, + default=None, + required=False, + help=('Set the idle minutes before autostopping the cluster. ' + 'See the doc above for detailed semantics.')) @click.option( - "--all", - "-a", - default=None, - is_flag=True, - help="Apply this command to all existing clusters.", -) -@click.option( - "--idle-minutes", - "-i", - type=int, - default=None, - required=False, - help=( - "Set the idle minutes before autostopping the cluster. " - "See the doc above for detailed semantics." - ), -) -@click.option( - "--cancel", + '--cancel', default=False, is_flag=True, required=False, - help="Cancel any currently active auto{stop,down} setting for the " - "cluster. No-op if there is no active setting.", -) + help='Cancel any currently active auto{stop,down} setting for the ' + 'cluster. 
No-op if there is no active setting.') @click.option( - "--down", + '--down', default=False, is_flag=True, required=False, - help="Use autodown (tear down the cluster; non-restartable), instead " - "of autostop (restartable).", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + help='Use autodown (tear down the cluster; non-restartable), instead ' + 'of autostop (restartable).') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint def autostop( clusters: List[str], @@ -2514,108 +2260,89 @@ def autostop( """ if cancel and idle_minutes is not None: raise click.UsageError( - "Only one of --idle-minutes and --cancel should be specified. " - f"cancel: {cancel}, idle_minutes: {idle_minutes}" - ) + 'Only one of --idle-minutes and --cancel should be specified. ' + f'cancel: {cancel}, idle_minutes: {idle_minutes}') if cancel: idle_minutes = -1 elif idle_minutes is None: idle_minutes = 5 - _down_or_stop_clusters( - clusters, - apply_to_all=all, - down=down, - no_confirm=yes, - idle_minutes_to_autostop=idle_minutes, - ) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=down, + no_confirm=yes, + idle_minutes_to_autostop=idle_minutes) @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "clusters", - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name), -) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Start all existing clusters.", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.argument('clusters', + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name)) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Start all existing clusters.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @click.option( - "--idle-minutes-to-autostop", - "-i", + '--idle-minutes-to-autostop', + '-i', default=None, type=int, required=False, - help=( - "Automatically stop the cluster after this many minutes " - "of idleness, i.e., no running or pending jobs in the cluster's job " - "queue. Idleness gets reset whenever setting-up/running/pending jobs " - "are found in the job queue. " - "Setting this flag is equivalent to " - "running ``sky launch -d ...`` and then ``sky autostop -i ``" - ". If not set, the cluster will not be autostopped." - ), -) + help=('Automatically stop the cluster after this many minutes ' + 'of idleness, i.e., no running or pending jobs in the cluster\'s job ' + 'queue. Idleness gets reset whenever setting-up/running/pending jobs ' + 'are found in the job queue. ' + 'Setting this flag is equivalent to ' + 'running ``sky launch -d ...`` and then ``sky autostop -i ``' + '. If not set, the cluster will not be autostopped.')) @click.option( - "--down", + '--down', default=False, is_flag=True, required=False, - help=( - "Autodown the cluster: tear down the cluster after specified minutes of " - "idle time after all jobs finish (successfully or abnormally). Requires " - "--idle-minutes-to-autostop to be set." - ), + help= + ('Autodown the cluster: tear down the cluster after specified minutes of ' + 'idle time after all jobs finish (successfully or abnormally). 
Requires ' + '--idle-minutes-to-autostop to be set.'), ) @click.option( - "--retry-until-up", - "-r", + '--retry-until-up', + '-r', default=False, is_flag=True, required=False, # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. # pylint: disable=bad-docstring-quotes - help=( - "Retry provisioning infinitely until the cluster is up, " - "if we fail to start the cluster due to unavailability errors." - ), + help=('Retry provisioning infinitely until the cluster is up, ' + 'if we fail to start the cluster due to unavailability errors.'), ) @click.option( - "--force", - "-f", + '--force', + '-f', default=False, is_flag=True, required=False, - help=( - "Force start the cluster even if it is already UP. Useful for " - "upgrading the SkyPilot runtime on the cluster." - ), -) + help=('Force start the cluster even if it is already UP. Useful for ' + 'upgrading the SkyPilot runtime on the cluster.')) @usage_lib.entrypoint # pylint: disable=redefined-builtin def start( - clusters: List[str], - all: bool, - yes: bool, - idle_minutes_to_autostop: Optional[int], - down: bool, # pylint: disable=redefined-outer-name - retry_until_up: bool, - force: bool, -): + clusters: List[str], + all: bool, + yes: bool, + idle_minutes_to_autostop: Optional[int], + down: bool, # pylint: disable=redefined-outer-name + retry_until_up: bool, + force: bool): # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Restart cluster(s). @@ -2646,48 +2373,43 @@ def start( """ if down and idle_minutes_to_autostop is None: raise click.UsageError( - "--idle-minutes-to-autostop must be set if --down is set." - ) + '--idle-minutes-to-autostop must be set if --down is set.') to_start = [] if not clusters and not all: # UX: frequently users may have only 1 cluster. In this case, be smart # and default to that unique choice. - all_cluster_names = global_user_state.get_cluster_names_start_with("") + all_cluster_names = global_user_state.get_cluster_names_start_with('') if len(all_cluster_names) <= 1: clusters = all_cluster_names else: raise click.UsageError( - "`sky start` requires either a cluster name or glob " - "(see `sky status`), or the -a/--all flag." - ) + '`sky start` requires either a cluster name or glob ' + '(see `sky status`), or the -a/--all flag.') if all: if len(clusters) > 0: - click.echo( - "Both --all and cluster(s) specified for sky start. " - "Letting --all take effect." - ) + click.echo('Both --all and cluster(s) specified for sky start. ' + 'Letting --all take effect.') # Get all clusters that are not controllers. clusters = [ - cluster["name"] + cluster['name'] for cluster in global_user_state.get_clusters() - if controller_utils.Controllers.from_name(cluster["name"]) is None + if controller_utils.Controllers.from_name(cluster['name']) is None ] if not clusters: - click.echo( - "Cluster(s) not found (tip: see `sky status`). Do you " - "mean to use `sky launch` to provision a new cluster?" - ) + click.echo('Cluster(s) not found (tip: see `sky status`). 
Do you ' + 'mean to use `sky launch` to provision a new cluster?') return else: # Get GLOB cluster names clusters = _get_glob_clusters(clusters) for name in clusters: - cluster_status, _ = backend_utils.refresh_cluster_status_handle(name) + cluster_status, _ = backend_utils.refresh_cluster_status_handle( + name) # A cluster may have one of the following states: # # STOPPED - ok to restart @@ -2711,7 +2433,7 @@ def start( # INIT state cluster due to head_ip not being cached). # # This can be replicated by adding `exit 1` to Task.setup. - if not force and cluster_status == status_lib.ClusterStatus.UP: + if (not force and cluster_status == status_lib.ClusterStatus.UP): # An UP cluster; skipping 'sky start' because: # 1. For a really up cluster, this has no effects (ray up -y # --no-restart) anyway. @@ -2722,13 +2444,12 @@ def start( # zombied (remains as stopped in the cloud's UI). # # This is dangerous and unwanted behavior! - click.echo(f"Cluster {name} already has status UP.") + click.echo(f'Cluster {name} already has status UP.') continue assert force or cluster_status in ( status_lib.ClusterStatus.INIT, - status_lib.ClusterStatus.STOPPED, - ), cluster_status + status_lib.ClusterStatus.STOPPED), cluster_status to_start.append(name) if not to_start: return @@ -2742,83 +2463,74 @@ def start( normal_clusters.append(name) if controllers and normal_clusters: # Keep this behavior the same as _down_or_stop_clusters(). - raise click.UsageError( - "Starting controllers with other cluster(s) " - "is currently not supported.\n" - "Please start the former independently." - ) + raise click.UsageError('Starting controllers with other cluster(s) ' + 'is currently not supported.\n' + 'Please start the former independently.') if controllers: bold = backend_utils.BOLD reset_bold = backend_utils.RESET_BOLD if len(controllers) != 1: raise click.UsageError( - "Starting multiple controllers is currently not supported.\n" - "Please start them independently." - ) + 'Starting multiple controllers is currently not supported.\n' + 'Please start them independently.') if idle_minutes_to_autostop is not None: raise click.UsageError( - "Autostop options are currently not allowed when starting the " - "controllers. Use the default autostop settings by directly " - f'calling: {bold}sky start {" ".join(controllers)}{reset_bold}' - ) + 'Autostop options are currently not allowed when starting the ' + 'controllers. Use the default autostop settings by directly ' + f'calling: {bold}sky start {" ".join(controllers)}{reset_bold}') if not yes: - cluster_str = "clusters" if len(to_start) > 1 else "cluster" - cluster_list = ", ".join(to_start) + cluster_str = 'clusters' if len(to_start) > 1 else 'cluster' + cluster_list = ', '.join(to_start) click.confirm( - f"Restarting {len(to_start)} {cluster_str}: " f"{cluster_list}. Proceed?", + f'Restarting {len(to_start)} {cluster_str}: ' + f'{cluster_list}. 
Proceed?', default=True, abort=True, - show_default=True, - ) + show_default=True) for name in to_start: try: - core.start( - name, idle_minutes_to_autostop, retry_until_up, down=down, force=force - ) - except ( - exceptions.NotSupportedError, - exceptions.ClusterOwnerIdentityMismatchError, - ) as e: + core.start(name, + idle_minutes_to_autostop, + retry_until_up, + down=down, + force=force) + except (exceptions.NotSupportedError, + exceptions.ClusterOwnerIdentityMismatchError) as e: click.echo(str(e)) else: - click.secho(f"Cluster {name} started.", fg="green") + click.secho(f'Cluster {name} started.', fg='green') @cli.command(cls=_DocumentedCodeCommand) -@click.argument( - "clusters", - nargs=-1, - required=False, - **_get_shell_complete_args(_complete_cluster_name), -) -@click.option( - "--all", "-a", default=None, is_flag=True, help="Tear down all existing clusters." -) +@click.argument('clusters', + nargs=-1, + required=False, + **_get_shell_complete_args(_complete_cluster_name)) +@click.option('--all', + '-a', + default=None, + is_flag=True, + help='Tear down all existing clusters.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @click.option( - "--yes", - "-y", + '--purge', + '-p', is_flag=True, default=False, required=False, - help="Skip confirmation prompt.", -) -@click.option( - "--purge", - "-p", - is_flag=True, - default=False, - required=False, - help=( - "(Advanced) Forcefully remove the cluster(s) from " - "SkyPilot's cluster table, even if the actual cluster termination " - "failed on the cloud. WARNING: This flag should only be set sparingly" - " in certain manual troubleshooting scenarios; with it set, it is the" - " user's responsibility to ensure there are no leaked instances and " - "related resources." - ), -) + help=('(Advanced) Forcefully remove the cluster(s) from ' + 'SkyPilot\'s cluster table, even if the actual cluster termination ' + 'failed on the cloud. WARNING: This flag should only be set sparingly' + ' in certain manual troubleshooting scenarios; with it set, it is the' + ' user\'s responsibility to ensure there are no leaked instances and ' + 'related resources.')) @usage_lib.entrypoint def down( clusters: List[str], @@ -2854,9 +2566,11 @@ def down( sky down -a """ - _down_or_stop_clusters( - clusters, apply_to_all=all, down=True, no_confirm=yes, purge=purge - ) + _down_or_stop_clusters(clusters, + apply_to_all=all, + down=True, + no_confirm=yes, + purge=purge) def _hint_or_raise_for_down_jobs_controller(controller_name: str): @@ -2874,43 +2588,43 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str): controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name - with rich_utils.safe_status("[bold cyan]Checking for in-progress managed jobs[/]"): + with rich_utils.safe_status( + '[bold cyan]Checking for in-progress managed jobs[/]'): try: - managed_jobs_ = managed_jobs.queue(refresh=False, skip_finished=True) + managed_jobs_ = managed_jobs.queue(refresh=False, + skip_finished=True) except exceptions.ClusterNotUpError as e: if controller.value.connection_error_hint in str(e): with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - controller.value.decline_down_when_failed_to_fetch_status_hint - ) + controller.value. 
+ decline_down_when_failed_to_fetch_status_hint) if e.cluster_status is None: - click.echo("Managed jobs controller has already been torn down.") + click.echo( + 'Managed jobs controller has already been torn down.') sys.exit(0) # At this point, the managed jobs are failed to be fetched due to # the controller being STOPPED or being firstly launched, i.e., # there is no in-prgress managed jobs. managed_jobs_ = [] - msg = ( - f"{colorama.Fore.YELLOW}WARNING: Tearing down the managed " - "jobs controller. Please be aware of the following:" - f"{colorama.Style.RESET_ALL}" - "\n * All logs and status information of the managed " - "jobs (output of `sky jobs queue`) will be lost." - ) + msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed ' + 'jobs controller. Please be aware of the following:' + f'{colorama.Style.RESET_ALL}' + '\n * All logs and status information of the managed ' + 'jobs (output of `sky jobs queue`) will be lost.') click.echo(msg) if managed_jobs_: job_table = managed_jobs.format_job_table(managed_jobs_, show_all=False) msg = controller.value.decline_down_for_dirty_controller_hint # Add prefix to each line to align with the bullet point. - msg += "\n".join([" " + line for line in job_table.split("\n") if line != ""]) + msg += '\n'.join( + [' ' + line for line in job_table.split('\n') if line != '']) with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError(msg) else: - click.echo( - " * No in-progress managed jobs found. It should be safe to " - "terminate (see caveats above)." - ) + click.echo(' * No in-progress managed jobs found. It should be safe to ' + 'terminate (see caveats above).') def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): @@ -2927,17 +2641,17 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): """ controller = controller_utils.Controllers.from_name(controller_name) assert controller is not None, controller_name - with rich_utils.safe_status("[bold cyan]Checking for live services[/]"): + with rich_utils.safe_status('[bold cyan]Checking for live services[/]'): try: services = serve_lib.status() except exceptions.ClusterNotUpError as e: if controller.value.connection_error_hint in str(e): with ux_utils.print_exception_no_traceback(): raise exceptions.NotSupportedError( - controller.value.decline_down_when_failed_to_fetch_status_hint - ) + controller.value. + decline_down_when_failed_to_fetch_status_hint) if e.cluster_status is None: - click.echo("Serve controller has already been torn down.") + click.echo('Serve controller has already been torn down.') sys.exit(0) # At this point, the services are failed to be fetched due to the # controller being STOPPED or being firstly launched, i.e., there is @@ -2945,34 +2659,31 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str): services = [] if services: - service_names = [service["name"] for service in services] + service_names = [service['name'] for service in services] with ux_utils.print_exception_no_traceback(): - msg = controller.value.decline_down_for_dirty_controller_hint.format( - service_names=", ".join(service_names) - ) + msg = ( + controller.value.decline_down_for_dirty_controller_hint.format( + service_names=', '.join(service_names))) raise exceptions.NotSupportedError(msg) # Do nothing for STOPPED state, as it is safe to terminate the cluster. 
- click.echo(f"Terminate sky serve controller: {controller_name}.") + click.echo(f'Terminate sky serve controller: {controller_name}.') _CONTROLLER_TO_HINT_OR_RAISE = { - controller_utils.Controllers.JOBS_CONTROLLER: ( - _hint_or_raise_for_down_jobs_controller - ), - controller_utils.Controllers.SKY_SERVE_CONTROLLER: ( - _hint_or_raise_for_down_sky_serve_controller - ), + controller_utils.Controllers.JOBS_CONTROLLER: + (_hint_or_raise_for_down_jobs_controller), + controller_utils.Controllers.SKY_SERVE_CONTROLLER: + (_hint_or_raise_for_down_sky_serve_controller), } def _down_or_stop_clusters( - names: List[str], - apply_to_all: Optional[bool], - down: bool, # pylint: disable=redefined-outer-name - no_confirm: bool, - purge: bool = False, - idle_minutes_to_autostop: Optional[int] = None, -) -> None: + names: List[str], + apply_to_all: Optional[bool], + down: bool, # pylint: disable=redefined-outer-name + no_confirm: bool, + purge: bool = False, + idle_minutes_to_autostop: Optional[int] = None) -> None: """Tears down or (auto-)stops a cluster (or all clusters). Controllers (jobs controller and sky serve controller) can only be @@ -2980,43 +2691,40 @@ def _down_or_stop_clusters( via glob). """ if down: - command = "down" + command = 'down' elif idle_minutes_to_autostop is not None: - command = "autostop" + command = 'autostop' else: - command = "stop" + command = 'stop' if not names and apply_to_all is None: # UX: frequently users may have only 1 cluster. In this case, 'sky # stop/down' without args should be smart and default to that unique # choice. - all_cluster_names = global_user_state.get_cluster_names_start_with("") + all_cluster_names = global_user_state.get_cluster_names_start_with('') if len(all_cluster_names) <= 1: names = all_cluster_names else: raise click.UsageError( - f"`sky {command}` requires either a cluster name or glob " - "(see `sky status`), or the -a/--all flag." - ) + f'`sky {command}` requires either a cluster name or glob ' + '(see `sky status`), or the -a/--all flag.') - operation = "Terminating" if down else "Stopping" + operation = 'Terminating' if down else 'Stopping' if idle_minutes_to_autostop is not None: is_cancel = idle_minutes_to_autostop < 0 - verb = "Cancelling" if is_cancel else "Scheduling" - option_str = "down" if down else "stop" + verb = 'Cancelling' if is_cancel else 'Scheduling' + option_str = 'down' if down else 'stop' if is_cancel: - option_str = "{stop,down}" - operation = f"{verb} auto{option_str} on" + option_str = '{stop,down}' + operation = f'{verb} auto{option_str} on' if len(names) > 0: controllers = [ - name - for name in names + name for name in names if controller_utils.Controllers.from_name(name) is not None ] - controllers_str = ", ".join(map(repr, controllers)) + controllers_str = ', '.join(map(repr, controllers)) names = [ - name - for name in _get_glob_clusters(names) + name for name in _get_glob_clusters(names) if controller_utils.Controllers.from_name(name) is None ] @@ -3024,27 +2732,25 @@ def _down_or_stop_clusters( # normal clusters. if controllers: if len(names) != 0: - names_str = ", ".join(map(repr, names)) + names_str = ', '.join(map(repr, names)) raise click.UsageError( - f"{operation} controller(s) " - f"{controllers_str} with other cluster(s) " - f"{names_str} is currently not supported.\n" - f"Please omit the controller(s) {controllers}." 
- ) + f'{operation} controller(s) ' + f'{controllers_str} with other cluster(s) ' + f'{names_str} is currently not supported.\n' + f'Please omit the controller(s) {controllers}.') if len(controllers) > 1: raise click.UsageError( - f"{operation} multiple controllers " - f"{controllers_str} is currently not supported.\n" - f"Please specify only one controller." - ) + f'{operation} multiple controllers ' + f'{controllers_str} is currently not supported.\n' + f'Please specify only one controller.') controller_name = controllers[0] if not down: raise click.UsageError( - f"{operation} controller(s) " - f"{controllers_str} is currently not supported." - ) + f'{operation} controller(s) ' + f'{controllers_str} is currently not supported.') else: - controller = controller_utils.Controllers.from_name(controller_name) + controller = controller_utils.Controllers.from_name( + controller_name) assert controller is not None hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller] try: @@ -3056,30 +2762,21 @@ def _down_or_stop_clusters( # managed job or service. We should make this check atomic # with the termination. hint_or_raise(controller_name) - except ( - exceptions.ClusterOwnerIdentityMismatchError, - RuntimeError, - ) as e: + except (exceptions.ClusterOwnerIdentityMismatchError, + RuntimeError) as e: if purge: click.echo(common_utils.format_exception(e)) else: raise - confirm_str = "delete" - input_prefix = ( - ( - "Since --purge is set, errors will be ignored " - "and controller will be removed from " - "local state.\n" - ) - if purge - else "" - ) + confirm_str = 'delete' + input_prefix = ('Since --purge is set, errors will be ignored ' + 'and controller will be removed from ' + 'local state.\n') if purge else '' user_input = click.prompt( - f"{input_prefix}" - f"To proceed, please type {colorama.Style.BRIGHT}" - f"{confirm_str!r}{colorama.Style.RESET_ALL}", - type=str, - ) + f'{input_prefix}' + f'To proceed, please type {colorama.Style.BRIGHT}' + f'{confirm_str!r}{colorama.Style.RESET_ALL}', + type=str) if user_input != confirm_str: raise click.Abort() no_confirm = True @@ -3089,15 +2786,14 @@ def _down_or_stop_clusters( all_clusters = global_user_state.get_clusters() if len(names) > 0: click.echo( - f"Both --all and cluster(s) specified for `sky {command}`. " - "Letting --all take effect." - ) + f'Both --all and cluster(s) specified for `sky {command}`. ' + 'Letting --all take effect.') # We should not remove controllers when --all is specified. # Otherwise, it would be very easy to accidentally delete a controller. names = [ - record["name"] + record['name'] for record in all_clusters - if controller_utils.Controllers.from_name(record["name"]) is None + if controller_utils.Controllers.from_name(record['name']) is None ] clusters = [] @@ -3112,54 +2808,51 @@ def _down_or_stop_clusters( usage_lib.record_cluster_name_for_current_operation(clusters) if not clusters: - click.echo("Cluster(s) not found (tip: see `sky status`).") + click.echo('Cluster(s) not found (tip: see `sky status`).') return if not no_confirm and len(clusters) > 0: - cluster_str = "clusters" if len(clusters) > 1 else "cluster" - cluster_list = ", ".join(clusters) + cluster_str = 'clusters' if len(clusters) > 1 else 'cluster' + cluster_list = ', '.join(clusters) click.confirm( - f"{operation} {len(clusters)} {cluster_str}: " f"{cluster_list}. Proceed?", + f'{operation} {len(clusters)} {cluster_str}: ' + f'{cluster_list}. 
Proceed?', default=True, abort=True, - show_default=True, - ) + show_default=True) - plural = "s" if len(clusters) > 1 else "" - progress = rich_progress.Progress( - transient=True, redirect_stdout=False, redirect_stderr=False - ) + plural = 's' if len(clusters) > 1 else '' + progress = rich_progress.Progress(transient=True, + redirect_stdout=False, + redirect_stderr=False) task = progress.add_task( - f"[bold cyan]{operation} {len(clusters)} cluster{plural}[/]", - total=len(clusters), - ) + f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]', + total=len(clusters)) def _down_or_stop(name: str): success_progress = False if idle_minutes_to_autostop is not None: try: core.autostop(name, idle_minutes_to_autostop, down) - except (exceptions.NotSupportedError, exceptions.ClusterNotUpError) as e: + except (exceptions.NotSupportedError, + exceptions.ClusterNotUpError) as e: message = str(e) else: # no exception raised success_progress = True - message = ( - f"{colorama.Fore.GREEN}{operation} " - f"cluster {name!r}...done{colorama.Style.RESET_ALL}" - ) + message = (f'{colorama.Fore.GREEN}{operation} ' + f'cluster {name!r}...done{colorama.Style.RESET_ALL}') if idle_minutes_to_autostop >= 0: - option_str = "down" if down else "stop" - passive_str = "downed" if down else "stopped" - plural = "s" if idle_minutes_to_autostop != 1 else "" + option_str = 'down' if down else 'stop' + passive_str = 'downed' if down else 'stopped' + plural = 's' if idle_minutes_to_autostop != 1 else '' message += ( - f"\n The cluster will be auto{passive_str} after " - f"{idle_minutes_to_autostop} minute{plural} of " - "idleness." - f"\n To cancel the auto{option_str}, run: " - f"{colorama.Style.BRIGHT}" - f"sky autostop {name} --cancel" - f"{colorama.Style.RESET_ALL}" - ) + f'\n The cluster will be auto{passive_str} after ' + f'{idle_minutes_to_autostop} minute{plural} of ' + 'idleness.' + f'\n To cancel the auto{option_str}, run: ' + f'{colorama.Style.BRIGHT}' + f'sky autostop {name} --cancel' + f'{colorama.Style.RESET_ALL}') else: try: if down: @@ -3168,26 +2861,20 @@ def _down_or_stop(name: str): core.stop(name, purge=purge) except RuntimeError as e: message = ( - f"{colorama.Fore.RED}{operation} cluster {name}...failed. " - f"{colorama.Style.RESET_ALL}" - f"\nReason: {common_utils.format_exception(e)}." - ) - except ( - exceptions.NotSupportedError, - exceptions.ClusterOwnerIdentityMismatchError, - ) as e: + f'{colorama.Fore.RED}{operation} cluster {name}...failed. ' + f'{colorama.Style.RESET_ALL}' + f'\nReason: {common_utils.format_exception(e)}.') + except (exceptions.NotSupportedError, + exceptions.ClusterOwnerIdentityMismatchError) as e: message = str(e) else: # no exception raised message = ( - f"{colorama.Fore.GREEN}{operation} cluster {name}...done." - f"{colorama.Style.RESET_ALL}" - ) + f'{colorama.Fore.GREEN}{operation} cluster {name}...done.' 
+ f'{colorama.Style.RESET_ALL}') if not down: - message += ( - "\n To restart the cluster, run: " - f"{colorama.Style.BRIGHT}sky start {name}" - f"{colorama.Style.RESET_ALL}" - ) + message += ('\n To restart the cluster, run: ' + f'{colorama.Style.BRIGHT}sky start {name}' + f'{colorama.Style.RESET_ALL}') success_progress = True progress.stop() @@ -3204,14 +2891,12 @@ def _down_or_stop(name: str): @cli.command(cls=_DocumentedCodeCommand) -@click.argument("clouds", required=False, type=str, nargs=-1) -@click.option( - "--verbose", - "-v", - is_flag=True, - default=False, - help="Show the activated account for each cloud.", -) +@click.argument('clouds', required=False, type=str, nargs=-1) +@click.option('--verbose', + '-v', + is_flag=True, + default=False, + help='Show the activated account for each cloud.') @usage_lib.entrypoint def check(clouds: Tuple[str], verbose: bool): """Check which clouds are available to use. @@ -3237,40 +2922,41 @@ def check(clouds: Tuple[str], verbose: bool): """ clouds_arg = clouds if len(clouds) > 0 else None sky_check.check(verbose=verbose, clouds=clouds_arg) - - -@cli.command() -@click.argument("accelerator_str", required=False) -@click.option( - "--all", - "-a", - is_flag=True, - default=False, - help="Show details of all GPU/TPU/accelerator offerings.", -) -@click.option("--cloud", default=None, type=str, help="Cloud provider to query.") + + +@cli.command() +@click.argument('accelerator_str', required=False) +@click.option('--all', + '-a', + is_flag=True, + default=False, + help='Show details of all GPU/TPU/accelerator offerings.') +@click.option('--cloud', + default=None, + type=str, + help='Cloud provider to query.') @click.option( - "--region", + '--region', required=False, type=str, - help=("The region to use. If not specified, shows accelerators from all regions."), + help= + ('The region to use. If not specified, shows accelerators from all regions.' + ), ) @click.option( - "--all-regions", + '--all-regions', is_flag=True, default=False, - help="Show pricing and instance details for a specified accelerator across " - "all regions and clouds.", -) + help='Show pricing and instance details for a specified accelerator across ' + 'all regions and clouds.') @service_catalog.fallback_to_default_catalog @usage_lib.entrypoint def show_gpus( - accelerator_str: Optional[str], - all: bool, # pylint: disable=redefined-builtin - cloud: Optional[str], - region: Optional[str], - all_regions: Optional[bool], -): + accelerator_str: Optional[str], + all: bool, # pylint: disable=redefined-builtin + cloud: Optional[str], + region: Optional[str], + all_regions: Optional[bool]): """Show supported GPU/TPU/accelerators and their prices. The names and counts shown can be set in the ``accelerators`` field in task @@ -3316,110 +3002,102 @@ def show_gpus( # validation for the --region flag if region is not None and cloud is None: raise click.UsageError( - "The --region flag is only valid when the --cloud flag is set." - ) + 'The --region flag is only valid when the --cloud flag is set.') # validation for the --all-regions flag if all_regions and accelerator_str is None: raise click.UsageError( - "The --all-regions flag is only valid when an accelerator " "is specified." - ) + 'The --all-regions flag is only valid when an accelerator ' + 'is specified.') if all_regions and region is not None: raise click.UsageError( - "--all-regions and --region flags cannot be used simultaneously." 
- ) + '--all-regions and --region flags cannot be used simultaneously.') # This will validate 'cloud' and raise if not found. cloud_obj = sky_clouds.CLOUD_REGISTRY.from_str(cloud) service_catalog.validate_region_zone(region, None, clouds=cloud) show_all = all if show_all and accelerator_str is not None: - raise click.UsageError("--all is only allowed without a GPU name.") + raise click.UsageError('--all is only allowed without a GPU name.') # Kubernetes specific bools cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None kubernetes_is_enabled = sky_clouds.cloud_in_iterable( - sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds() - ) + sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) if cloud_is_kubernetes and region is not None: raise click.UsageError( - "The --region flag cannot be set with --cloud kubernetes." - ) + 'The --region flag cannot be set with --cloud kubernetes.') def _list_to_str(lst): - return ", ".join([str(e) for e in lst]) + return ', '.join([str(e) for e in lst]) def _get_kubernetes_realtime_gpu_table( - name_filter: Optional[str] = None, quantity_filter: Optional[int] = None - ): + name_filter: Optional[str] = None, + quantity_filter: Optional[int] = None): if quantity_filter: - qty_header = "QTY_FILTER" - free_header = "FILTERED_FREE_GPUS" + qty_header = 'QTY_FILTER' + free_header = 'FILTERED_FREE_GPUS' else: - qty_header = "QTY_PER_NODE" - free_header = "TOTAL_FREE_GPUS" + qty_header = 'QTY_PER_NODE' + free_header = 'TOTAL_FREE_GPUS' realtime_gpu_table = log_utils.create_table( - ["GPU", qty_header, "TOTAL_GPUS", free_header] - ) + ['GPU', qty_header, 'TOTAL_GPUS', free_header]) counts, capacity, available = service_catalog.list_accelerator_realtime( gpus_only=True, - clouds="kubernetes", + clouds='kubernetes', name_filter=name_filter, region_filter=region, quantity_filter=quantity_filter, - case_sensitive=False, - ) - assert set(counts.keys()) == set(capacity.keys()) == set(available.keys()), ( - f"Keys of counts ({list(counts.keys())}), " - f"capacity ({list(capacity.keys())}), " - f"and available ({list(available.keys())}) " - "must be same." - ) + case_sensitive=False) + assert (set(counts.keys()) == set(capacity.keys()) == set( + available.keys())), (f'Keys of counts ({list(counts.keys())}), ' + f'capacity ({list(capacity.keys())}), ' + f'and available ({list(available.keys())}) ' + 'must be same.') if len(counts) == 0: - err_msg = "No GPUs found in Kubernetes cluster. " - debug_msg = "To further debug, run: sky check " + err_msg = 'No GPUs found in Kubernetes cluster. ' + debug_msg = 'To further debug, run: sky check ' if name_filter is not None: - gpu_info_msg = f" {name_filter!r}" + gpu_info_msg = f' {name_filter!r}' if quantity_filter is not None: - gpu_info_msg += " with requested quantity" f" {quantity_filter}" - err_msg = ( - f"Resources{gpu_info_msg} not found " "in Kubernetes cluster. " - ) - debug_msg = ( - "To show available accelerators on kubernetes," - " run: sky show-gpus --cloud kubernetes " - ) - full_err_msg = err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + debug_msg + gpu_info_msg += (' with requested quantity' + f' {quantity_filter}') + err_msg = (f'Resources{gpu_info_msg} not found ' + 'in Kubernetes cluster. 
') + debug_msg = ('To show available accelerators on kubernetes,' + ' run: sky show-gpus --cloud kubernetes ') + full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + + debug_msg) raise ValueError(full_err_msg) for gpu, _ in sorted(counts.items()): - realtime_gpu_table.add_row( - [gpu, _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]] - ) + realtime_gpu_table.add_row([ + gpu, + _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu] + ]) return realtime_gpu_table def _get_kubernetes_node_info_table(): node_table = log_utils.create_table( - ["NODE_NAME", "GPU_NAME", "TOTAL_GPUS", "FREE_GPUS"] - ) + ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS']) node_info_dict = kubernetes_utils.get_kubernetes_node_info() for node_name, node_info in node_info_dict.items(): - node_table.add_row( - [ - node_name, - node_info.gpu_type, - node_info.total["nvidia.com/gpu"], - node_info.free["nvidia.com/gpu"], - ] - ) + node_table.add_row([ + node_name, node_info.gpu_type, + node_info.total['nvidia.com/gpu'], + node_info.free['nvidia.com/gpu'] + ]) return node_table def _output(): - gpu_table = log_utils.create_table(["COMMON_GPU", "AVAILABLE_QUANTITIES"]) - tpu_table = log_utils.create_table(["GOOGLE_TPU", "AVAILABLE_QUANTITIES"]) - other_table = log_utils.create_table(["OTHER_GPU", "AVAILABLE_QUANTITIES"]) + gpu_table = log_utils.create_table( + ['COMMON_GPU', 'AVAILABLE_QUANTITIES']) + tpu_table = log_utils.create_table( + ['GOOGLE_TPU', 'AVAILABLE_QUANTITIES']) + other_table = log_utils.create_table( + ['OTHER_GPU', 'AVAILABLE_QUANTITIES']) name, quantity = None, None @@ -3429,10 +3107,10 @@ def _output(): clouds_to_list = cloud if cloud is None: clouds_to_list = [ - c for c in service_catalog.ALL_CLOUDS if c != "kubernetes" + c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes' ] - k8s_messages = "" + k8s_messages = '' if accelerator_str is None: # Collect k8s related messages in k8s_messages and print them at end print_section_titles = False @@ -3446,32 +3124,27 @@ def _output(): except ValueError as e: if not cloud_is_kubernetes: # Make it a note if cloud is not kubernetes - k8s_messages += "Note: " + k8s_messages += 'Note: ' k8s_messages += str(e) else: print_section_titles = True - yield ( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Kubernetes GPUs{colorama.Style.RESET_ALL}\n" - ) + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') yield from k8s_realtime_table.get_string() k8s_node_table = _get_kubernetes_node_info_table() - yield "\n\n" - yield ( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Kubernetes per node GPU availability" - f"{colorama.Style.RESET_ALL}\n" - ) + yield '\n\n' + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes per node GPU availability' + f'{colorama.Style.RESET_ALL}\n') yield from k8s_node_table.get_string() if kubernetes_autoscaling: - k8s_messages += "\n" + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + k8s_messages += ( + '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: - yield ( - "Kubernetes is not enabled. To fix, run: " - "sky check kubernetes " - ) + yield ('Kubernetes is not enabled. To fix, run: ' + 'sky check kubernetes ') yield k8s_messages return @@ -3479,7 +3152,7 @@ def _output(): # long and the user may not scroll to the end. 
            if show_all and k8s_messages:
                yield k8s_messages
-                yield "\n\n"
+                yield '\n\n'
            result = service_catalog.list_accelerator_counts(
                gpus_only=True,
@@ -3489,11 +3162,9 @@ def _output():
            if print_section_titles:
                # If section titles were printed above, print again here
-                yield "\n\n"
-                yield (
-                    f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}"
-                    f"Cloud GPUs{colorama.Style.RESET_ALL}\n"
-                )
+                yield '\n\n'
+                yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                       f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
            # "Common" GPUs
            for gpu in service_catalog.get_common_gpus():
@@ -3506,101 +3177,91 @@ def _output():
                if tpu in result:
                    tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
            if len(tpu_table.get_string()) > 0:
-                yield "\n\n"
+                yield '\n\n'
                yield from tpu_table.get_string()
            # Handle Other GPUs
            if show_all or cloud is not None:
-                yield "\n\n"
+                yield '\n\n'
                for gpu, qty in sorted(result.items()):
                    other_table.add_row([gpu, _list_to_str(qty)])
                yield from other_table.get_string()
-                yield "\n\n"
+                yield '\n\n'
            # Handle hints and messages
            if not show_all:
                if cloud is None:
-                    yield (
-                        "\n\nHint: use -a/--all to see all accelerators "
-                        "(including non-common ones) and pricing."
-                    )
+                    yield ('\n\nHint: use -a/--all to see all accelerators '
+                           '(including non-common ones) and pricing.')
                    # Handle k8 messages if present
                    if k8s_messages:
-                        yield "\n"
+                        yield '\n'
                        yield k8s_messages
                    return
                else:
-                    # Return if we're not showing all or if a specific cloud was queried
-                    yield ("Hint: use -a/--all to see all accelerators " "and pricing.")
+                    # Return if not showing all or a specific cloud was queried
+                    yield ('Hint: use -a/--all to see all accelerators '
+                           'and pricing.')
                    return
        else:
            # Parse accelerator string
-            accelerator_split = accelerator_str.split(":")
+            accelerator_split = accelerator_str.split(':')
            if len(accelerator_split) > 2:
                raise click.UsageError(
-                    f"Invalid accelerator string {accelerator_str}. "
-                    "Expected format: <name>[:<quantity>]."
-                )
+                    f'Invalid accelerator string {accelerator_str}. '
+                    'Expected format: <name>[:<quantity>].')
            if len(accelerator_split) == 2:
                name = accelerator_split[0]
                # Check if quantity is valid
                try:
                    quantity = int(accelerator_split[1])
                    if quantity <= 0:
-                        raise ValueError("Quantity cannot be non-positive integer.")
+                        raise ValueError(
+                            'Quantity must be a positive integer.')
                except ValueError as invalid_quantity:
                    raise click.UsageError(
-                        f"Invalid accelerator quantity {accelerator_split[1]}. "
-                        "Expected a positive integer."
-                    ) from invalid_quantity
+                        f'Invalid accelerator quantity {accelerator_split[1]}. 
' + 'Expected a positive integer.') from invalid_quantity else: name, quantity = accelerator_str, None print_section_titles = False - if ( - kubernetes_is_enabled - and (cloud is None or cloud_is_kubernetes) - and not show_all - ): + if (kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and + not show_all): # Print section title if not showing all and instead a specific # accelerator is requested print_section_titles = True - yield ( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Kubernetes GPUs{colorama.Style.RESET_ALL}\n" - ) + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') # TODO(romilb): Show filtered per node GPU availability here as well try: k8s_realtime_table = _get_kubernetes_realtime_gpu_table( - name_filter=name, quantity_filter=quantity - ) + name_filter=name, quantity_filter=quantity) yield from k8s_realtime_table.get_string() except ValueError as e: # In the case of a specific accelerator, show the error message # immediately (e.g., "Resources H100 not found ...") yield str(e) if kubernetes_autoscaling: - k8s_messages += "\n" + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + k8s_messages += ('\n' + + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) yield k8s_messages if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: - yield ( - "Kubernetes is not enabled. To fix, run: " "sky check kubernetes " - ) + yield ('Kubernetes is not enabled. To fix, run: ' + 'sky check kubernetes ') return # For clouds other than Kubernetes, get the accelerator details # Case-sensitive - result = service_catalog.list_accelerators( - gpus_only=True, - name_filter=name, - quantity_filter=quantity, - region_filter=region, - clouds=clouds_to_list, - case_sensitive=False, - all_regions=all_regions, - ) + result = service_catalog.list_accelerators(gpus_only=True, + name_filter=name, + quantity_filter=quantity, + region_filter=region, + clouds=clouds_to_list, + case_sensitive=False, + all_regions=all_regions) # Import here to save module load speed. # pylint: disable=import-outside-toplevel,line-too-long from sky.clouds.service_catalog import common @@ -3613,79 +3274,73 @@ def _output(): for i, (gpu, items) in enumerate(result.items()): df = pd.DataFrame([t._asdict() for t in items]) # Determine the minimum prices for each cloud. - min_price_df = df.groupby("cloud").agg( - min_price=("price", "min"), min_spot_price=("spot_price", "min") - ) - df = df.merge(min_price_df, on="cloud") + min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'), + min_spot_price=('spot_price', + 'min')) + df = df.merge(min_price_df, on='cloud') # Sort within each cloud by price. - df = df.groupby("cloud", group_keys=False).apply( - lambda x: x.sort_values(by=["price", "spot_price"]) - ) + df = df.groupby('cloud', group_keys=False).apply( + lambda x: x.sort_values(by=['price', 'spot_price'])) # Sort across groups (clouds). 
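For readers skimming the pandas logic above: the per-cloud minimum prices order the clouds against each other, while each cloud's own offerings stay sorted by price. A hedged, toy-data illustration of the same idiom (the column names mirror the real ones; the values are made up):

import pandas as pd

df = pd.DataFrame({
    'cloud': ['aws', 'aws', 'gcp'],
    'price': [3.0, 2.0, 2.5],
    'spot_price': [1.0, 0.7, 0.8],
})
# Per-cloud minimums decide the order across clouds...
min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
                                       min_spot_price=('spot_price', 'min'))
df = df.merge(min_price_df, on='cloud')
# ...while rows within each cloud are sorted by their own price.
df = df.groupby('cloud', group_keys=False).apply(
    lambda x: x.sort_values(by=['price', 'spot_price']))
df = df.sort_values(by=['min_price', 'min_spot_price'])
print(df.drop(columns=['min_price', 'min_spot_price']))
# aws rows (min price 2.0) print before gcp rows (min price 2.5).
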
- df = df.sort_values(by=["min_price", "min_spot_price"]) - df = df.drop(columns=["min_price", "min_spot_price"]) + df = df.sort_values(by=['min_price', 'min_spot_price']) + df = df.drop(columns=['min_price', 'min_spot_price']) sorted_dataclasses = [ - common.InstanceTypeInfo(*row) for row in df.to_records(index=False) + common.InstanceTypeInfo(*row) + for row in df.to_records(index=False) ] new_result[gpu] = sorted_dataclasses result = new_result if print_section_titles and not show_all: - yield "\n\n" - yield ( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Cloud GPUs{colorama.Style.RESET_ALL}\n" - ) + yield '\n\n' + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Cloud GPUs{colorama.Style.RESET_ALL}\n') if len(result) == 0: - quantity_str = f" with requested quantity {quantity}" if quantity else "" - cloud_str = f" on {cloud_obj}." if cloud else " in cloud catalogs." - yield f"Resources '{name}'{quantity_str} not found{cloud_str} " - yield "To show available accelerators, run: sky show-gpus --all" + quantity_str = (f' with requested quantity {quantity}' + if quantity else '') + cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.' + yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} ' + yield 'To show available accelerators, run: sky show-gpus --all' return for i, (gpu, items) in enumerate(result.items()): accelerator_table_headers = [ - "GPU", - "QTY", - "CLOUD", - "INSTANCE_TYPE", - "DEVICE_MEM", - "vCPUs", - "HOST_MEM", - "HOURLY_PRICE", - "HOURLY_SPOT_PRICE", + 'GPU', + 'QTY', + 'CLOUD', + 'INSTANCE_TYPE', + 'DEVICE_MEM', + 'vCPUs', + 'HOST_MEM', + 'HOURLY_PRICE', + 'HOURLY_SPOT_PRICE', ] if not show_all: - accelerator_table_headers.append("REGION") - accelerator_table = log_utils.create_table(accelerator_table_headers) + accelerator_table_headers.append('REGION') + accelerator_table = log_utils.create_table( + accelerator_table_headers) for item in items: - instance_type_str = ( - item.instance_type - if not pd.isna(item.instance_type) - else "(attachable)" - ) + instance_type_str = item.instance_type if not pd.isna( + item.instance_type) else '(attachable)' cpu_count = item.cpu_count - if not pd.isna(cpu_count) and isinstance(cpu_count, (float, int)): + if not pd.isna(cpu_count) and isinstance( + cpu_count, (float, int)): if int(cpu_count) == cpu_count: cpu_str = str(int(cpu_count)) else: - cpu_str = f"{cpu_count:.1f}" + cpu_str = f'{cpu_count:.1f}' else: - cpu_str = "-" - device_memory_str = ( - f"{item.device_memory:.0f}GB" - if not pd.isna(item.device_memory) - else "-" - ) - host_memory_str = ( - f"{item.memory:.0f}GB" if not pd.isna(item.memory) else "-" - ) - price_str = f"$ {item.price:.3f}" if not pd.isna(item.price) else "-" - spot_price_str = ( - f"$ {item.spot_price:.3f}" if not pd.isna(item.spot_price) else "-" - ) - region_str = item.region if not pd.isna(item.region) else "-" + cpu_str = '-' + device_memory_str = (f'{item.device_memory:.0f}GB' if + not pd.isna(item.device_memory) else '-') + host_memory_str = f'{item.memory:.0f}GB' if not pd.isna( + item.memory) else '-' + price_str = f'$ {item.price:.3f}' if not pd.isna( + item.price) else '-' + spot_price_str = f'$ {item.spot_price:.3f}' if not pd.isna( + item.spot_price) else '-' + region_str = item.region if not pd.isna(item.region) else '-' accelerator_table_vals = [ item.accelerator_name, item.accelerator_count, @@ -3702,7 +3357,7 @@ def _output(): accelerator_table.add_row(accelerator_table_vals) if i != 0: - yield "\n\n" + yield '\n\n' yield from 
accelerator_table.get_string() if show_all: @@ -3719,15 +3374,13 @@ def storage(): pass -@storage.command("ls", cls=_DocumentedCodeCommand) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) +@storage.command('ls', cls=_DocumentedCodeCommand) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') @usage_lib.entrypoint # pylint: disable=redefined-builtin def storage_ls(all: bool): @@ -3737,34 +3390,26 @@ def storage_ls(all: bool): click.echo(storage_table) -@storage.command("delete", cls=_DocumentedCodeCommand) -@click.argument( - "names", - required=False, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_storage_name), -) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Delete all storage objects.", -) -@click.option( - "--yes", - "-y", - default=False, - is_flag=True, - required=False, - help="Skip confirmation prompt.", -) +@storage.command('delete', cls=_DocumentedCodeCommand) +@click.argument('names', + required=False, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_storage_name)) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Delete all storage objects.') +@click.option('--yes', + '-y', + default=False, + is_flag=True, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint -def storage_delete( - names: List[str], all: bool, yes: bool -): # pylint: disable=redefined-builtin +def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin """Delete storage objects. Examples: @@ -3781,25 +3426,25 @@ def storage_delete( sky storage delete -a """ if sum([len(names) > 0, all]) != 1: - raise click.UsageError("Either --all or a name must be specified.") + raise click.UsageError('Either --all or a name must be specified.') if all: storages = sky.storage_ls() if not storages: - click.echo("No storage(s) to delete.") + click.echo('No storage(s) to delete.') return - names = [s["name"] for s in storages] + names = [s['name'] for s in storages] else: names = _get_glob_storages(names) if names: if not yes: - storage_names = ", ".join(names) - storage_str = "storages" if len(names) > 1 else "storage" + storage_names = ', '.join(names) + storage_str = 'storages' if len(names) > 1 else 'storage' click.confirm( - f"Deleting {len(names)} {storage_str}: " f"{storage_names}. Proceed?", + f'Deleting {len(names)} {storage_str}: ' + f'{storage_names}. Proceed?', default=True, abort=True, - show_default=True, - ) + show_default=True) subprocess_utils.run_in_parallel(sky.storage_delete, names) @@ -3816,62 +3461,49 @@ def jobs(): pass -@jobs.command("launch", cls=_DocumentedCodeCommand) -@click.argument( - "entrypoint", - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) +@jobs.command('launch', cls=_DocumentedCodeCommand) +@click.argument('entrypoint', + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) # TODO(zhwu): Add --dryrun option to test the launch command. 
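The decorator stacks below reuse shared option lists such as `_TASK_OPTIONS_WITH_NAME` through `_add_click_options`. A minimal sketch of that shared-options pattern (the helper and option names here are illustrative, not this file's actual implementation):

import click

_COMMON_OPTIONS = [
    click.option('--name', '-n', default=None, type=str, help='Task name.'),
    click.option('--yes', '-y', is_flag=True, help='Skip confirmation.'),
]

def _add_options(options):
    """Return a decorator that applies a list of click options."""
    def _apply(func):
        # Reversed so options appear in --help in their list order.
        for option in reversed(options):
            func = option(func)
        return func
    return _apply

@click.command()
@_add_options(_COMMON_OPTIONS)
def launch(name, yes):
    click.echo(f'Launching {name!r} (yes={yes})')
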
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS) +@click.option('--cluster', + '-c', + default=None, + type=str, + hidden=True, + help=('Alias for --name, the name of the spot job.')) +@click.option('--job-recovery', + default=None, + type=str, + help='Recovery strategy to use for managed jobs.') @click.option( - "--cluster", - "-c", - default=None, - type=str, - hidden=True, - help=("Alias for --name, the name of the spot job."), -) -@click.option( - "--job-recovery", - default=None, - type=str, - help="Recovery strategy to use for managed jobs.", -) -@click.option( - "--detach-run", - "-d", + '--detach-run', + '-d', default=False, is_flag=True, - help=( - "If True, as soon as a job is submitted, return from this call " - "and do not stream execution logs." - ), -) + help=('If True, as soon as a job is submitted, return from this call ' + 'and do not stream execution logs.')) @click.option( - "--retry-until-up/--no-retry-until-up", - "-r/-no-r", + '--retry-until-up/--no-retry-until-up', + '-r/-no-r', default=None, is_flag=True, required=False, help=( - "(Default: True; this flag is deprecated and will be removed in a " - "future release.) Whether to retry provisioning infinitely until the " - "cluster is up, if unavailability errors are encountered. This " # pylint: disable=bad-docstring-quotes - "applies to launching all managed jobs (both the initial and " - "any recovery attempts), not the jobs controller." - ), -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + '(Default: True; this flag is deprecated and will be removed in a ' + 'future release.) Whether to retry provisioning infinitely until the ' + 'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes + 'applies to launching all managed jobs (both the initial and ' + 'any recovery attempts), not the jobs controller.')) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @timeline.event @usage_lib.entrypoint def jobs_launch( @@ -3915,10 +3547,8 @@ def jobs_launch( """ if cluster is not None: if name is not None and name != cluster: - raise click.UsageError( - "Cannot specify both --name and --cluster. " - "Use one of the flags as they are alias." - ) + raise click.UsageError('Cannot specify both --name and --cluster. ' + 'Use one of the flags as they are alias.') name = cluster env = _merge_env_vars(env_file, env) task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides( @@ -3944,15 +3574,14 @@ def jobs_launch( # Deprecation. We set the default behavior to be retry until up, and the # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0. if retry_until_up is not None: - flag_str = "--retry-until-up" + flag_str = '--retry-until-up' if not retry_until_up: - flag_str = "--no-retry-until-up" + flag_str = '--no-retry-until-up' click.secho( - f"Flag {flag_str} is deprecated and will be removed in a " - "future release (managed jobs will always be retried). " - "Please file an issue if this does not work for you.", - fg="yellow", - ) + f'Flag {flag_str} is deprecated and will be removed in a ' + 'future release (managed jobs will always be retried). 
' + 'Please file an issue if this does not work for you.', + fg='yellow') else: retry_until_up = True @@ -3970,46 +3599,44 @@ def jobs_launch( dag_utils.maybe_infer_and_fill_dag_and_task_names(dag) dag_utils.fill_default_config_in_dag_for_job_launch(dag) - click.secho( - f"Managed job {dag.name!r} will be launched on (estimated):", fg="yellow" - ) + click.secho(f'Managed job {dag.name!r} will be launched on (estimated):', + fg='yellow') dag = sky.optimize(dag) if not yes: - prompt = f"Launching a managed job {dag.name!r}. Proceed?" + prompt = f'Launching a managed job {dag.name!r}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) common_utils.check_cluster_name_is_valid(name) - managed_jobs.launch(dag, name, detach_run=detach_run, retry_until_up=retry_until_up) + managed_jobs.launch(dag, + name, + detach_run=detach_run, + retry_until_up=retry_until_up) -@jobs.command("queue", cls=_DocumentedCodeCommand) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) -@click.option( - "--refresh", - "-r", - default=False, - is_flag=True, - required=False, - help="Query the latest statuses, restarting the jobs controller if stopped.", -) +@jobs.command('queue', cls=_DocumentedCodeCommand) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') @click.option( - "--skip-finished", - "-s", + '--refresh', + '-r', default=False, is_flag=True, required=False, - help="Show only pending/running jobs' information.", + help='Query the latest statuses, restarting the jobs controller if stopped.' ) +@click.option('--skip-finished', + '-s', + default=False, + is_flag=True, + required=False, + help='Show only pending/running jobs\' information.') @usage_lib.entrypoint # pylint: disable=redefined-builtin def jobs_queue(all: bool, refresh: bool, skip_finished: bool): @@ -4066,46 +3693,40 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool): watch -n60 sky jobs queue """ - click.secho("Fetching managed job statuses...", fg="yellow") - with rich_utils.safe_status("[cyan]Checking managed jobs[/]"): - _, msg = _get_managed_jobs( - refresh=refresh, - skip_finished=skip_finished, - show_all=all, - is_called_by_user=True, - ) + click.secho('Fetching managed job statuses...', fg='yellow') + with rich_utils.safe_status('[cyan]Checking managed jobs[/]'): + _, msg = _get_managed_jobs(refresh=refresh, + skip_finished=skip_finished, + show_all=all, + is_called_by_user=True) if not skip_finished: - in_progress_only_hint = "" + in_progress_only_hint = '' else: - in_progress_only_hint = " (showing in-progress jobs only)" - click.echo( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Managed jobs{colorama.Style.RESET_ALL}" - f"{in_progress_only_hint}\n{msg}" - ) - - -@jobs.command("cancel", cls=_DocumentedCodeCommand) -@click.option( - "--name", "-n", required=False, type=str, help="Managed job name to cancel." 
-) -@click.argument("job_ids", default=None, type=int, required=False, nargs=-1) -@click.option( - "--all", - "-a", - is_flag=True, - default=False, - required=False, - help="Cancel all managed jobs.", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + in_progress_only_hint = ' (showing in-progress jobs only)' + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Managed jobs{colorama.Style.RESET_ALL}' + f'{in_progress_only_hint}\n{msg}') + + +@jobs.command('cancel', cls=_DocumentedCodeCommand) +@click.option('--name', + '-n', + required=False, + type=str, + help='Managed job name to cancel.') +@click.argument('job_ids', default=None, type=int, required=False, nargs=-1) +@click.option('--all', + '-a', + is_flag=True, + default=False, + required=False, + help='Cancel all managed jobs.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint # pylint: disable=redefined-builtin def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): @@ -4126,83 +3747,73 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool): """ backend_utils.is_controller_accessible( controller=controller_utils.Controllers.JOBS_CONTROLLER, - stopped_message="All managed jobs should have finished.", - exit_if_not_accessible=True, - ) + stopped_message='All managed jobs should have finished.', + exit_if_not_accessible=True) - job_id_str = ",".join(map(str, job_ids)) + job_id_str = ','.join(map(str, job_ids)) if sum([len(job_ids) > 0, name is not None, all]) != 1: - argument_str = f"--job-ids {job_id_str}" if len(job_ids) > 0 else "" - argument_str += f" --name {name}" if name is not None else "" - argument_str += " --all" if all else "" + argument_str = f'--job-ids {job_id_str}' if len(job_ids) > 0 else '' + argument_str += f' --name {name}' if name is not None else '' + argument_str += ' --all' if all else '' raise click.UsageError( - "Can only specify one of JOB_IDS or --name or --all. " - f"Provided {argument_str!r}." - ) + 'Can only specify one of JOB_IDS or --name or --all. ' + f'Provided {argument_str!r}.') if not yes: - job_identity_str = ( - f"managed jobs with IDs {job_id_str}" if job_ids else repr(name) - ) + job_identity_str = (f'managed jobs with IDs {job_id_str}' + if job_ids else repr(name)) if all: - job_identity_str = "all managed jobs" - click.confirm( - f"Cancelling {job_identity_str}. Proceed?", - default=True, - abort=True, - show_default=True, - ) + job_identity_str = 'all managed jobs' + click.confirm(f'Cancelling {job_identity_str}. Proceed?', + default=True, + abort=True, + show_default=True) managed_jobs.cancel(job_ids=job_ids, name=name, all=all) -@jobs.command("logs", cls=_DocumentedCodeCommand) -@click.option("--name", "-n", required=False, type=str, help="Managed job name.") +@jobs.command('logs', cls=_DocumentedCodeCommand) +@click.option('--name', + '-n', + required=False, + type=str, + help='Managed job name.') @click.option( - "--follow/--no-follow", + '--follow/--no-follow', is_flag=True, default=True, - help=( - "Follow the logs of the job. [default: --follow] " - "If --no-follow is specified, print the log so far and exit." - ), -) + help=('Follow the logs of the job. 
[default: --follow] ' + 'If --no-follow is specified, print the log so far and exit.')) @click.option( - "--controller", + '--controller', is_flag=True, default=False, - help=( - "Show the controller logs of this job; useful for debugging " - "launching/recoveries, etc." - ), -) -@click.argument("job_id", required=False, type=int) + help=('Show the controller logs of this job; useful for debugging ' + 'launching/recoveries, etc.')) +@click.argument('job_id', required=False, type=int) @usage_lib.entrypoint -def jobs_logs( - name: Optional[str], job_id: Optional[int], follow: bool, controller: bool -): +def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool, + controller: bool): """Tail the log of a managed job.""" try: - managed_jobs.tail_logs( - name=name, job_id=job_id, follow=follow, controller=controller - ) + managed_jobs.tail_logs(name=name, + job_id=job_id, + follow=follow, + controller=controller) except exceptions.ClusterNotUpError: with ux_utils.print_exception_no_traceback(): raise -@jobs.command("dashboard", cls=_DocumentedCodeCommand) +@jobs.command('dashboard', cls=_DocumentedCodeCommand) @click.option( - "--port", - "-p", + '--port', + '-p', default=None, type=int, required=False, - help=( - "Local port to use for the dashboard. If None, a free port is " - "automatically chosen." - ), -) + help=('Local port to use for the dashboard. If None, a free port is ' + 'automatically chosen.')) @usage_lib.entrypoint def jobs_dashboard(port: Optional[int]): """Opens a dashboard for managed jobs (needs controller to be UP).""" @@ -4211,17 +3822,14 @@ def jobs_dashboard(port: Optional[int]): # see if the controller is UP first, which is slow; (2) not have to run SSH # port forwarding first (we'd just launch a local dashboard which would make # REST API calls to the controller dashboard server). - click.secho("Checking if jobs controller is up...", fg="yellow") - hint = ( - "Dashboard is not available if jobs controller is not up. Run a " - "managed job first." - ) + click.secho('Checking if jobs controller is up...', fg='yellow') + hint = ('Dashboard is not available if jobs controller is not up. Run a ' + 'managed job first.') backend_utils.is_controller_accessible( controller=controller_utils.Controllers.JOBS_CONTROLLER, stopped_message=hint, non_existent_message=hint, - exit_if_not_accessible=True, - ) + exit_if_not_accessible=True) # SSH forward a free local port to remote's dashboard port. remote_port = constants.SPOT_DASHBOARD_REMOTE_PORT @@ -4230,20 +3838,18 @@ def jobs_dashboard(port: Optional[int]): else: free_port = port ssh_command = ( - f"ssh -qNL {free_port}:localhost:{remote_port} " - f"{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}" - ) - click.echo("Forwarding port: ", nl=False) - click.secho(f"{ssh_command}", dim=True) + f'ssh -qNL {free_port}:localhost:{remote_port} ' + f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}') + click.echo('Forwarding port: ', nl=False) + click.secho(f'{ssh_command}', dim=True) - with subprocess.Popen( - ssh_command, shell=True, start_new_session=True - ) as ssh_process: + with subprocess.Popen(ssh_command, shell=True, + start_new_session=True) as ssh_process: time.sleep(3) # Added delay for ssh_command to initialize. 
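A self-contained sketch of the forward-then-open flow that `sky jobs dashboard` implements above. The controller name is a placeholder, and the bind-to-port-0 trick for picking a free port has a small race window (the real CLI obtains the port differently):

import socket
import subprocess
import time
import webbrowser

def open_dashboard(remote_port: int,
                   cluster: str = 'sky-jobs-controller'):  # placeholder name
    # Ask the OS for a free local port; racy, but fine for a sketch.
    with socket.socket() as sock:
        sock.bind(('', 0))
        free_port = sock.getsockname()[1]
    ssh_command = f'ssh -qNL {free_port}:localhost:{remote_port} {cluster}'
    with subprocess.Popen(ssh_command, shell=True,
                          start_new_session=True) as ssh_process:
        time.sleep(3)  # Give the tunnel a moment to come up.
        webbrowser.open(f'http://localhost:{free_port}')
        try:
            ssh_process.wait()
        except KeyboardInterrupt:
            ssh_process.terminate()  # Ctrl-C tears the tunnel down.
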
- webbrowser.open(f"http://localhost:{free_port}") + webbrowser.open(f'http://localhost:{free_port}') click.secho( - f"Dashboard is now available at: http://127.0.0.1:{free_port}", fg="green" - ) + f'Dashboard is now available at: http://127.0.0.1:{free_port}', + fg='green') try: ssh_process.wait() except KeyboardInterrupt: @@ -4255,7 +3861,7 @@ def jobs_dashboard(port: Optional[int]): # This happens if jobs controller is auto-stopped. pass finally: - click.echo("Exiting.") + click.echo('Exiting.') # TODO(zhwu): Backward compatibility for the old `sky spot launch` command. @@ -4267,9 +3873,10 @@ def spot(): pass -_add_command_alias( - jobs, jobs_launch, new_group=spot, override_command_argument={"use_spot": True} -) +_add_command_alias(jobs, + jobs_launch, + new_group=spot, + override_command_argument={'use_spot': True}) _add_command_alias(jobs, jobs_queue, new_group=spot) _add_command_alias(jobs, jobs_logs, new_group=spot) _add_command_alias(jobs, jobs_cancel, new_group=spot) @@ -4304,9 +3911,9 @@ def _generate_task_with_service( not_supported_cmd: str, ) -> sky.Task: """Generate a task with service section from a service YAML file.""" - is_yaml, _ = _check_yaml("".join(service_yaml_args)) + is_yaml, _ = _check_yaml(''.join(service_yaml_args)) if not is_yaml: - raise click.UsageError("SERVICE_YAML must be a valid YAML file.") + raise click.UsageError('SERVICE_YAML must be a valid YAML file.') env = _merge_env_vars(env_file, env) # We keep nargs=-1 in service_yaml argument to reuse this function. task = _make_task_or_dag_from_entrypoint_with_overrides( @@ -4328,36 +3935,31 @@ def _generate_task_with_service( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - entrypoint_name="Service", + entrypoint_name='Service', ) if isinstance(task, sky.Dag): raise click.UsageError( - _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd) - ) + _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd)) if task.service is None: with ux_utils.print_exception_no_traceback(): - raise ValueError( - "Service section not found in the YAML file. " - "To fix, add a valid `service` field." - ) + raise ValueError('Service section not found in the YAML file. ' + 'To fix, add a valid `service` field.') service_port: Optional[int] = None for requested_resources in list(task.resources): - if requested_resources.ports is None or len(requested_resources.ports) != 1: + if requested_resources.ports is None or len( + requested_resources.ports) != 1: with ux_utils.print_exception_no_traceback(): raise ValueError( - "Must only specify one port in resources. Each replica " - "will use the port specified as application ingress port." - ) + 'Must only specify one port in resources. Each replica ' + 'will use the port specified as application ingress port.') service_port_str = requested_resources.ports[0] if not service_port_str.isdigit(): # For the case when the user specified a port range like 10000-10010 with ux_utils.print_exception_no_traceback(): - raise ValueError( - f"Port {service_port_str!r} is not a valid " - "port number. Please specify a single port " - f"instead. Got: {service_port_str!r}" - ) + raise ValueError(f'Port {service_port_str!r} is not a valid ' + 'port number. Please specify a single port ' + f'instead. Got: {service_port_str!r}') # We request all the replicas using the same port for now, but it # should be fine to allow different replicas to use different ports # in the future. 
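The validation loop above enforces a single, shared ingress port across every resource candidate of a service. Distilled into a hedged standalone sketch (the function name and input shape are illustrative, not SkyServe's actual API):

from typing import Optional, Sequence

def validate_single_service_port(
        ports_per_resource: Sequence[Optional[Sequence[str]]]) -> int:
    """Return the one ingress port shared by all resources, or raise."""
    service_port: Optional[int] = None
    for ports in ports_per_resource:
        if ports is None or len(ports) != 1:
            raise ValueError('Must only specify one port in resources.')
        if not ports[0].isdigit():
            # Rejects port ranges such as '10000-10010'.
            raise ValueError(f'Port {ports[0]!r} is not a valid port number.')
        resource_port = int(ports[0])
        if service_port is None:
            service_port = resource_port
        elif service_port != resource_port:
            raise ValueError(f'Got multiple ports: {service_port} and '
                             f'{resource_port} in different resources.')
    if service_port is None:
        raise ValueError('No resources were given.')
    return service_port

# e.g. validate_single_service_port([['8080'], ['8080']]) -> 8080
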
@@ -4366,39 +3968,31 @@ def _generate_task_with_service( service_port = resource_port if service_port != resource_port: with ux_utils.print_exception_no_traceback(): - raise ValueError( - f"Got multiple ports: {service_port} and " - f"{resource_port} in different resources. " - "Please specify single port instead." - ) + raise ValueError(f'Got multiple ports: {service_port} and ' + f'{resource_port} in different resources. ' + 'Please specify single port instead.') return task -@serve.command("up", cls=_DocumentedCodeCommand) -@click.argument( - "service_yaml", - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) -@click.option( - "--service-name", - "-n", - default=None, - type=str, - help="A service name. Unique for each service. If not provided, " - "a unique name is autogenerated.", -) +@serve.command('up', cls=_DocumentedCodeCommand) +@click.argument('service_yaml', + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) +@click.option('--service-name', + '-n', + default=None, + type=str, + help='A service name. Unique for each service. If not provided, ' + 'a unique name is autogenerated.') @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @timeline.event @usage_lib.entrypoint def serve_up( @@ -4472,18 +4066,19 @@ def serve_up( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - not_supported_cmd="sky serve up", + not_supported_cmd='sky serve up', ) - click.secho("Service Spec:", fg="cyan") + click.secho('Service Spec:', fg='cyan') click.echo(task.service) - click.secho("Each replica will use the following resources (estimated):", fg="cyan") + click.secho('Each replica will use the following resources (estimated):', + fg='cyan') with sky.Dag() as dag: dag.add(task) sky.optimize(dag) if not yes: - prompt = f"Launching a new service {service_name!r}. Proceed?" + prompt = f'Launching a new service {service_name!r}. Proceed?' if prompt is not None: click.confirm(prompt, default=True, abort=True, show_default=True) @@ -4493,35 +4088,28 @@ def serve_up( # TODO(MaoZiming): Update Doc. # TODO(MaoZiming): Expose mix replica traffic option to user. # Currently, we do not mix traffic from old and new replicas. -@serve.command("update", cls=_DocumentedCodeCommand) -@click.argument("service_name", required=True, type=str) -@click.argument( - "service_yaml", - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) +@serve.command('update', cls=_DocumentedCodeCommand) +@click.argument('service_name', required=True, type=str) +@click.argument('service_yaml', + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS) -@click.option( - "--mode", - default=serve_lib.DEFAULT_UPDATE_MODE.value, - type=click.Choice([m.value for m in serve_lib.UpdateMode], case_sensitive=False), - required=False, - help=( - 'Update mode. If "rolling", SkyServe will update the ' - 'service with rolling update. If "blue_green", SkyServe ' - "will update the service with blue-green update. 
" - ), -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.option('--mode', + default=serve_lib.DEFAULT_UPDATE_MODE.value, + type=click.Choice([m.value for m in serve_lib.UpdateMode], + case_sensitive=False), + required=False, + help=('Update mode. If "rolling", SkyServe will update the ' + 'service with rolling update. If "blue_green", SkyServe ' + 'will update the service with blue-green update. ')) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @timeline.event @usage_lib.entrypoint def serve_update( @@ -4594,44 +4182,39 @@ def serve_update( disk_size=disk_size, disk_tier=disk_tier, ports=ports, - not_supported_cmd="sky serve update", + not_supported_cmd='sky serve update', ) - click.secho("Service Spec:", fg="cyan") + click.secho('Service Spec:', fg='cyan') click.echo(task.service) - click.secho("New replica will use the following resources (estimated):", fg="cyan") + click.secho('New replica will use the following resources (estimated):', + fg='cyan') with sky.Dag() as dag: dag.add(task) sky.optimize(dag) if not yes: - click.confirm( - f"Updating service {service_name!r}. Proceed?", - default=True, - abort=True, - show_default=True, - ) + click.confirm(f'Updating service {service_name!r}. Proceed?', + default=True, + abort=True, + show_default=True) serve_lib.update(task, service_name, mode=serve_lib.UpdateMode(mode)) -@serve.command("status", cls=_DocumentedCodeCommand) -@click.option( - "--all", - "-a", - default=False, - is_flag=True, - required=False, - help="Show all information in full.", -) -@click.option( - "--endpoint", - default=False, - is_flag=True, - required=False, - help="Show service endpoint.", -) -@click.argument("service_names", required=False, type=str, nargs=-1) +@serve.command('status', cls=_DocumentedCodeCommand) +@click.option('--all', + '-a', + default=False, + is_flag=True, + required=False, + help='Show all information in full.') +@click.option('--endpoint', + default=False, + is_flag=True, + required=False, + help='Show service endpoint.') +@click.argument('service_names', required=False, type=str, nargs=-1) @usage_lib.entrypoint # pylint: disable=redefined-builtin def serve_status(all: bool, endpoint: bool, service_names: List[str]): @@ -4727,39 +4310,36 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]): sky serve status my-service """ # This won't pollute the output of --endpoint. - with rich_utils.safe_status("[cyan]Checking services[/]"): - _, msg = _get_services( - service_names, show_all=all, show_endpoint=endpoint, is_called_by_user=True - ) + with rich_utils.safe_status('[cyan]Checking services[/]'): + _, msg = _get_services(service_names, + show_all=all, + show_endpoint=endpoint, + is_called_by_user=True) if not endpoint: - click.echo( - f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT}" - f"Services{colorama.Style.RESET_ALL}" - ) + click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Services{colorama.Style.RESET_ALL}') click.echo(msg) -@serve.command("down", cls=_DocumentedCodeCommand) -@click.argument("service_names", required=False, type=str, nargs=-1) -@click.option( - "--all", "-a", default=False, is_flag=True, help="Tear down all services." 
-) -@click.option( - "--purge", - "-p", - default=False, - is_flag=True, - help="Tear down services in failed status.", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@serve.command('down', cls=_DocumentedCodeCommand) +@click.argument('service_names', required=False, type=str, nargs=-1) +@click.option('--all', + '-a', + default=False, + is_flag=True, + help='Tear down all services.') +@click.option('--purge', + '-p', + default=False, + is_flag=True, + help='Tear down services in failed status.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') # pylint: disable=redefined-builtin def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): """Teardown service(s). @@ -4790,62 +4370,50 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool): sky serve down failed-service --purge """ if sum([len(service_names) > 0, all]) != 1: - argument_str = ( - f'SERVICE_NAMES={",".join(service_names)}' if len(service_names) > 0 else "" - ) - argument_str += " --all" if all else "" + argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len( + service_names) > 0 else '' + argument_str += ' --all' if all else '' raise click.UsageError( - "Can only specify one of SERVICE_NAMES or --all. " - f"Provided {argument_str!r}." - ) + 'Can only specify one of SERVICE_NAMES or --all. ' + f'Provided {argument_str!r}.') backend_utils.is_controller_accessible( controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER, - stopped_message="All services should have been terminated.", - exit_if_not_accessible=True, - ) + stopped_message='All services should have been terminated.', + exit_if_not_accessible=True) if not yes: - quoted_service_names = [f"{name!r}" for name in service_names] + quoted_service_names = [f'{name!r}' for name in service_names] service_identity_str = f'service(s) {", ".join(quoted_service_names)}' if all: - service_identity_str = "all services" - click.confirm( - f"Terminating {service_identity_str}. Proceed?", - default=True, - abort=True, - show_default=True, - ) + service_identity_str = 'all services' + click.confirm(f'Terminating {service_identity_str}. Proceed?', + default=True, + abort=True, + show_default=True) serve_lib.down(service_names=service_names, all=all, purge=purge) -@serve.command("logs", cls=_DocumentedCodeCommand) +@serve.command('logs', cls=_DocumentedCodeCommand) @click.option( - "--follow/--no-follow", + '--follow/--no-follow', is_flag=True, default=True, - help=( - "Follow the logs of the job. [default: --follow] " - "If --no-follow is specified, print the log so far and exit." - ), -) -@click.option( - "--controller", - is_flag=True, - default=False, - required=False, - help="Show the controller logs of this service.", -) -@click.option( - "--load-balancer", - is_flag=True, - default=False, - required=False, - help="Show the load balancer logs of this service.", -) -@click.argument("service_name", required=True, type=str) -@click.argument("replica_id", required=False, type=int) + help=('Follow the logs of the job. 
[default: --follow] ' + 'If --no-follow is specified, print the log so far and exit.')) +@click.option('--controller', + is_flag=True, + default=False, + required=False, + help='Show the controller logs of this service.') +@click.option('--load-balancer', + is_flag=True, + default=False, + required=False, + help='Show the load balancer logs of this service.') +@click.argument('service_name', required=True, type=str) +@click.argument('replica_id', required=False, type=int) @usage_lib.entrypoint # TODO(tian): Add default argument for this CLI if none of the flags are # specified. @@ -4872,16 +4440,13 @@ def serve_logs( sky serve logs [SERVICE_NAME] 1 """ have_replica_id = replica_id is not None - num_flags = controller + load_balancer + have_replica_id + num_flags = (controller + load_balancer + have_replica_id) if num_flags > 1: - raise click.UsageError( - "At most one of --controller, --load-balancer, " - "[REPLICA_ID] can be specified." - ) + raise click.UsageError('At most one of --controller, --load-balancer, ' + '[REPLICA_ID] can be specified.') if num_flags == 0: - raise click.UsageError( - "One of --controller, --load-balancer, " "[REPLICA_ID] must be specified." - ) + raise click.UsageError('One of --controller, --load-balancer, ' + '[REPLICA_ID] must be specified.') if controller: target_component = serve_lib.ServiceComponent.CONTROLLER elif load_balancer: @@ -4891,9 +4456,10 @@ def serve_logs( assert replica_id is not None target_component = serve_lib.ServiceComponent.REPLICA try: - serve_lib.tail_logs( - service_name, target=target_component, replica_id=replica_id, follow=follow - ) + serve_lib.tail_logs(service_name, + target=target_component, + replica_id=replica_id, + follow=follow) except exceptions.ClusterNotUpError: with ux_utils.print_exception_no_traceback(): raise @@ -4921,84 +4487,71 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]: """ config = common_utils.read_yaml(os.path.expanduser(yaml_path)) if not isinstance(config, dict): - raise ValueError( - f"Invalid YAML file: {yaml_path}. " - "The YAML file should be parsed into a dictionary." - ) - if config.get("resources") is None: + raise ValueError(f'Invalid YAML file: {yaml_path}. ' + 'The YAML file should be parsed into a dictionary.') + if config.get('resources') is None: return None - resources = config["resources"] + resources = config['resources'] if not isinstance(resources, dict): - raise ValueError( - f"Invalid resources configuration in {yaml_path}. " - "Resources must be a dictionary." - ) - if resources.get("candidates") is None: + raise ValueError(f'Invalid resources configuration in {yaml_path}. 
' + 'Resources must be a dictionary.') + if resources.get('candidates') is None: return None - candidates = resources["candidates"] + candidates = resources['candidates'] if not isinstance(candidates, list): - raise ValueError("Resource candidates must be a list of dictionaries.") + raise ValueError('Resource candidates must be a list of dictionaries.') for candidate in candidates: if not isinstance(candidate, dict): - raise ValueError("Each resource candidate must be a dictionary.") + raise ValueError('Each resource candidate must be a dictionary.') return candidates -@bench.command("launch", cls=_DocumentedCodeCommand) -@click.argument( - "entrypoint", - required=True, - type=str, - nargs=-1, - **_get_shell_complete_args(_complete_file_name), -) -@click.option("--benchmark", "-b", required=True, type=str, help="Benchmark name.") +@bench.command('launch', cls=_DocumentedCodeCommand) +@click.argument('entrypoint', + required=True, + type=str, + nargs=-1, + **_get_shell_complete_args(_complete_file_name)) +@click.option('--benchmark', + '-b', + required=True, + type=str, + help='Benchmark name.') @_add_click_options(_TASK_OPTIONS_WITH_NAME) +@click.option('--gpus', + required=False, + type=str, + help=('Comma-separated list of GPUs to run benchmark on. ' + 'Example values: "T4:4,V100:8" (without blank spaces).')) @click.option( - "--gpus", - required=False, - type=str, - help=( - "Comma-separated list of GPUs to run benchmark on. " - 'Example values: "T4:4,V100:8" (without blank spaces).' - ), -) -@click.option( - "--ports", + '--ports', required=False, type=str, multiple=True, - help=( - "Ports to open on the cluster. " - 'If specified, overrides the "ports" config in the YAML. ' - ), + help=('Ports to open on the cluster. ' + 'If specified, overrides the "ports" config in the YAML. '), ) @click.option( - "--idle-minutes-to-autostop", - "-i", + '--idle-minutes-to-autostop', + '-i', default=None, type=int, required=False, - help=( - "Automatically stop the cluster after this many minutes " - "of idleness after setup/file_mounts. This is equivalent to " - "running `sky launch -d ...` and then `sky autostop -i `. " - "If not set, the cluster will not be autostopped." - ), -) + help=('Automatically stop the cluster after this many minutes ' + 'of idleness after setup/file_mounts. This is equivalent to ' + 'running `sky launch -d ...` and then `sky autostop -i `. ' + 'If not set, the cluster will not be autostopped.')) # Disabling quote check here, as there seems to be a bug in pylint, # which incorrectly recognizes the help string as a docstring. # pylint: disable=bad-docstring-quotes -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint def benchmark_launch( entrypoint: str, @@ -5032,71 +4585,66 @@ def benchmark_launch( env = _merge_env_vars(env_file, env) record = benchmark_state.get_benchmark_from_name(benchmark) if record is not None: - raise click.BadParameter( - f"Benchmark {benchmark} already exists. " - "To delete the previous benchmark result, " - f"run `sky bench delete {benchmark}`." - ) + raise click.BadParameter(f'Benchmark {benchmark} already exists. 
' + 'To delete the previous benchmark result, ' + f'run `sky bench delete {benchmark}`.') - entrypoint = " ".join(entrypoint) + entrypoint = ' '.join(entrypoint) if not entrypoint: - raise click.BadParameter("Please specify a task yaml to benchmark.") + raise click.BadParameter('Please specify a task yaml to benchmark.') is_yaml, config = _check_yaml(entrypoint) if not is_yaml: raise click.BadParameter( - "Sky Benchmark does not support command line tasks. " - "Please provide a YAML file." - ) + 'Sky Benchmark does not support command line tasks. ' + 'Please provide a YAML file.') assert config is not None, (is_yaml, config) - click.secho("Benchmarking a task from YAML spec: ", fg="yellow", nl=False) + click.secho('Benchmarking a task from YAML spec: ', fg='yellow', nl=False) click.secho(entrypoint, bold=True) candidates = _get_candidate_configs(entrypoint) # Check if the candidate configs are specified in both CLI and YAML. if candidates is not None: - message = ( - "is specified in both CLI and resources.candidates " - "in the YAML. Please specify only one of them." - ) + message = ('is specified in both CLI and resources.candidates ' + 'in the YAML. Please specify only one of them.') if cloud is not None: - if any("cloud" in candidate for candidate in candidates): - raise click.BadParameter(f"cloud {message}") + if any('cloud' in candidate for candidate in candidates): + raise click.BadParameter(f'cloud {message}') if region is not None: - if any("region" in candidate for candidate in candidates): - raise click.BadParameter(f"region {message}") + if any('region' in candidate for candidate in candidates): + raise click.BadParameter(f'region {message}') if zone is not None: - if any("zone" in candidate for candidate in candidates): - raise click.BadParameter(f"zone {message}") + if any('zone' in candidate for candidate in candidates): + raise click.BadParameter(f'zone {message}') if gpus is not None: - if any("accelerators" in candidate for candidate in candidates): - raise click.BadParameter(f"gpus (accelerators) {message}") + if any('accelerators' in candidate for candidate in candidates): + raise click.BadParameter(f'gpus (accelerators) {message}') if use_spot is not None: - if any("use_spot" in candidate for candidate in candidates): - raise click.BadParameter(f"use_spot {message}") + if any('use_spot' in candidate for candidate in candidates): + raise click.BadParameter(f'use_spot {message}') if image_id is not None: - if any("image_id" in candidate for candidate in candidates): - raise click.BadParameter(f"image_id {message}") + if any('image_id' in candidate for candidate in candidates): + raise click.BadParameter(f'image_id {message}') if disk_size is not None: - if any("disk_size" in candidate for candidate in candidates): - raise click.BadParameter(f"disk_size {message}") + if any('disk_size' in candidate for candidate in candidates): + raise click.BadParameter(f'disk_size {message}') if disk_tier is not None: - if any("disk_tier" in candidate for candidate in candidates): - raise click.BadParameter(f"disk_tier {message}") + if any('disk_tier' in candidate for candidate in candidates): + raise click.BadParameter(f'disk_tier {message}') if ports: - if any("ports" in candidate for candidate in candidates): - raise click.BadParameter(f"ports {message}") + if any('ports' in candidate for candidate in candidates): + raise click.BadParameter(f'ports {message}') # The user can specify the benchmark candidates in either of the two ways: # 1. 
By specifying resources.candidates in the YAML. # 2. By specifying gpu types as a command line argument (--gpus). override_gpu = None if gpus is not None: - gpu_list = gpus.split(",") + gpu_list = gpus.split(',') gpu_list = [gpu.strip() for gpu in gpu_list] - if " " in gpus: - raise click.BadParameter("Remove blanks in --gpus.") + if ' ' in gpus: + raise click.BadParameter('Remove blanks in --gpus.') if len(gpu_list) == 1: override_gpu = gpu_list[0] @@ -5104,73 +4652,66 @@ def benchmark_launch( # If len(gpu_list) > 1, gpus is interpreted # as a list of benchmark candidates. if candidates is None: - candidates = [{"accelerators": gpu} for gpu in gpu_list] + candidates = [{'accelerators': gpu} for gpu in gpu_list] override_gpu = None else: - raise ValueError( - "Provide benchmark candidates in either " - "--gpus or resources.candidates in the YAML." - ) + raise ValueError('Provide benchmark candidates in either ' + '--gpus or resources.candidates in the YAML.') if candidates is None: candidates = [{}] - if "resources" not in config: - config["resources"] = {} - resources_config = config["resources"] + if 'resources' not in config: + config['resources'] = {} + resources_config = config['resources'] # Override the yaml config with the command line arguments. if name is not None: - config["name"] = name + config['name'] = name if workdir is not None: - config["workdir"] = workdir + config['workdir'] = workdir if num_nodes is not None: - config["num_nodes"] = num_nodes - override_params = _parse_override_params( - cloud=cloud, - region=region, - zone=zone, - gpus=override_gpu, - cpus=cpus, - memory=memory, - use_spot=use_spot, - image_id=image_id, - disk_size=disk_size, - disk_tier=disk_tier, - ports=ports, - ) + config['num_nodes'] = num_nodes + override_params = _parse_override_params(cloud=cloud, + region=region, + zone=zone, + gpus=override_gpu, + cpus=cpus, + memory=memory, + use_spot=use_spot, + image_id=image_id, + disk_size=disk_size, + disk_tier=disk_tier, + ports=ports) _pop_and_ignore_fields_in_override_params( - override_params, field_to_ignore=["cpus", "memory"] - ) + override_params, field_to_ignore=['cpus', 'memory']) resources_config.update(override_params) - if "cloud" in resources_config: - cloud = resources_config.pop("cloud") + if 'cloud' in resources_config: + cloud = resources_config.pop('cloud') if cloud is not None: - resources_config["cloud"] = str(cloud) - if "region" in resources_config: - if resources_config["region"] is None: - resources_config.pop("region") - if "zone" in resources_config: - if resources_config["zone"] is None: - resources_config.pop("zone") - if "accelerators" in resources_config: - if resources_config["accelerators"] is None: - resources_config.pop("accelerators") - if "image_id" in resources_config: - if resources_config["image_id"] is None: - resources_config.pop("image_id") + resources_config['cloud'] = str(cloud) + if 'region' in resources_config: + if resources_config['region'] is None: + resources_config.pop('region') + if 'zone' in resources_config: + if resources_config['zone'] is None: + resources_config.pop('zone') + if 'accelerators' in resources_config: + if resources_config['accelerators'] is None: + resources_config.pop('accelerators') + if 'image_id' in resources_config: + if resources_config['image_id'] is None: + resources_config.pop('image_id') # Fully generate the benchmark candidate configs. 
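To make the candidate semantics concrete: each candidate dict overrides the task's shared `resources` section, yielding one config (and one cluster) per candidate. A hedged sketch of that merge; `generate_benchmark_configs` itself lives in `benchmark_utils` and may differ in detail:

import copy

def expand_candidates(config: dict, candidates: list) -> list:
    """Fan a base task config out into one config per candidate."""
    configs = []
    for candidate in candidates:
        candidate_config = copy.deepcopy(config)
        candidate_config.setdefault('resources', {}).update(candidate)
        configs.append(candidate_config)
    return configs

# expand_candidates({'resources': {'cloud': 'aws'}},
#                   [{'accelerators': 'T4:4'}, {'accelerators': 'V100:8'}])
# -> two configs differing only in the accelerator choice.
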
clusters, candidate_configs = benchmark_utils.generate_benchmark_configs( - benchmark, config, candidates - ) + benchmark, config, candidates) # Show the benchmarking VM instances selected by the optimizer. # This also detects the case where the user requested infeasible resources. - benchmark_utils.print_benchmark_clusters( - benchmark, clusters, config, candidate_configs - ) + benchmark_utils.print_benchmark_clusters(benchmark, clusters, config, + candidate_configs) if not yes: - plural = "s" if len(candidates) > 1 else "" - prompt = f"Launching {len(candidates)} cluster{plural}. Proceed?" + plural = 's' if len(candidates) > 1 else '' + prompt = f'Launching {len(candidates)} cluster{plural}. Proceed?' click.confirm(prompt, default=True, abort=True, show_default=True) # Configs that are only accepted by the CLI. @@ -5179,96 +4720,96 @@ def benchmark_launch( # the serverless execution. if idle_minutes_to_autostop is None: idle_minutes_to_autostop = 5 - commandline_args["idle-minutes-to-autostop"] = idle_minutes_to_autostop + commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop if len(env) > 0: - commandline_args["env"] = [f"{k}={v}" for k, v in env] + commandline_args['env'] = [f'{k}={v}' for k, v in env] # Launch the benchmarking clusters in detach mode in parallel. benchmark_created = benchmark_utils.launch_benchmark_clusters( - benchmark, clusters, candidate_configs, commandline_args - ) + benchmark, clusters, candidate_configs, commandline_args) # If at least one cluster is created, print the following messages. if benchmark_created: logger.info( - f"\n{colorama.Fore.CYAN}Benchmark name: " - f"{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}" - "\nTo see the benchmark results: " - f"{backend_utils.BOLD}sky bench show " - f"{benchmark}{backend_utils.RESET_BOLD}" - "\nTo teardown the clusters: " - f"{backend_utils.BOLD}sky bench down " - f"{benchmark}{backend_utils.RESET_BOLD}" - ) - subprocess_utils.run("sky bench ls") + f'\n{colorama.Fore.CYAN}Benchmark name: ' + f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}' + '\nTo see the benchmark results: ' + f'{backend_utils.BOLD}sky bench show ' + f'{benchmark}{backend_utils.RESET_BOLD}' + '\nTo teardown the clusters: ' + f'{backend_utils.BOLD}sky bench down ' + f'{benchmark}{backend_utils.RESET_BOLD}') + subprocess_utils.run('sky bench ls') else: - logger.error("No benchmarking clusters are created.") - subprocess_utils.run("sky status") + logger.error('No benchmarking clusters are created.') + subprocess_utils.run('sky status') -@bench.command("ls", cls=_DocumentedCodeCommand) +@bench.command('ls', cls=_DocumentedCodeCommand) @usage_lib.entrypoint def benchmark_ls() -> None: """List the benchmark history.""" benchmarks = benchmark_state.get_benchmarks() columns = [ - "BENCHMARK", - "TASK", - "LAUNCHED", + 'BENCHMARK', + 'TASK', + 'LAUNCHED', ] max_num_candidates = 1 for benchmark in benchmarks: - benchmark_results = benchmark_state.get_benchmark_results(benchmark["name"]) + benchmark_results = benchmark_state.get_benchmark_results( + benchmark['name']) num_candidates = len(benchmark_results) if num_candidates > max_num_candidates: max_num_candidates = num_candidates if max_num_candidates == 1: - columns += ["CANDIDATE"] + columns += ['CANDIDATE'] else: - columns += [f"CANDIDATE {i}" for i in range(1, max_num_candidates + 1)] + columns += [f'CANDIDATE {i}' for i in range(1, max_num_candidates + 1)] benchmark_table = log_utils.create_table(columns) for benchmark in benchmarks: - if 
benchmark["task"] is not None: - task = benchmark["task"] + if benchmark['task'] is not None: + task = benchmark['task'] else: - task = "-" + task = '-' row = [ # BENCHMARK - benchmark["name"], + benchmark['name'], # TASK task, # LAUNCHED - datetime.datetime.fromtimestamp(benchmark["launched_at"]), + datetime.datetime.fromtimestamp(benchmark['launched_at']), ] - benchmark_results = benchmark_state.get_benchmark_results(benchmark["name"]) + benchmark_results = benchmark_state.get_benchmark_results( + benchmark['name']) # RESOURCES for b in benchmark_results: - num_nodes = b["num_nodes"] - resources = b["resources"] - postfix_spot = "[Spot]" if resources.use_spot else "" + num_nodes = b['num_nodes'] + resources = b['resources'] + postfix_spot = '[Spot]' if resources.use_spot else '' instance_type = resources.instance_type + postfix_spot if resources.accelerators is None: - accelerators = "" + accelerators = '' else: accelerator, count = list(resources.accelerators.items())[0] - accelerators = f" ({accelerator}:{count})" + accelerators = f' ({accelerator}:{count})' # For brevity, skip the cloud names. - resources_str = f"{num_nodes}x {instance_type}{accelerators}" + resources_str = f'{num_nodes}x {instance_type}{accelerators}' row.append(resources_str) - row += [""] * (max_num_candidates - len(benchmark_results)) + row += [''] * (max_num_candidates - len(benchmark_results)) benchmark_table.add_row(row) if benchmarks: click.echo(benchmark_table) else: - click.echo("No benchmark history found.") + click.echo('No benchmark history found.') -@bench.command("show", cls=_DocumentedCodeCommand) -@click.argument("benchmark", required=True, type=str) +@bench.command('show', cls=_DocumentedCodeCommand) +@click.argument('benchmark', required=True, type=str) # TODO(woosuk): Add --all option to show all the collected information # (e.g., setup time, warmup steps, total steps, etc.). @usage_lib.entrypoint @@ -5276,81 +4817,79 @@ def benchmark_show(benchmark: str) -> None: """Show a benchmark report.""" record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: - raise click.BadParameter(f"Benchmark {benchmark} does not exist.") + raise click.BadParameter(f'Benchmark {benchmark} does not exist.') benchmark_utils.update_benchmark_state(benchmark) click.echo( - textwrap.dedent( - """\ + textwrap.dedent("""\ Legend: - #STEPS: Number of steps taken. - SEC/STEP, $/STEP: Average time (cost) per step. - EST(hr), EST($): Estimated total time (cost) to complete the benchmark. 
- """ - ) - ) + """)) columns = [ - "CLUSTER", - "RESOURCES", - "STATUS", - "DURATION", - "SPENT($)", - "#STEPS", - "SEC/STEP", - "$/STEP", - "EST(hr)", - "EST($)", + 'CLUSTER', + 'RESOURCES', + 'STATUS', + 'DURATION', + 'SPENT($)', + '#STEPS', + 'SEC/STEP', + '$/STEP', + 'EST(hr)', + 'EST($)', ] cluster_table = log_utils.create_table(columns) rows = [] benchmark_results = benchmark_state.get_benchmark_results(benchmark) for result in benchmark_results: - num_nodes = result["num_nodes"] - resources = result["resources"] + num_nodes = result['num_nodes'] + resources = result['resources'] row = [ # CLUSTER - result["cluster"], + result['cluster'], # RESOURCES - f"{num_nodes}x {resources}", + f'{num_nodes}x {resources}', # STATUS - result["status"].value, + result['status'].value, ] - record = result["record"] - if record is None or record.start_time is None or record.last_time is None: - row += ["-"] * (len(columns) - len(row)) + record = result['record'] + if (record is None or record.start_time is None or + record.last_time is None): + row += ['-'] * (len(columns) - len(row)) rows.append(row) continue - duration_str = log_utils.readable_time_duration( - record.start_time, record.last_time, absolute=True - ) + duration_str = log_utils.readable_time_duration(record.start_time, + record.last_time, + absolute=True) duration = record.last_time - record.start_time spent = num_nodes * resources.get_cost(duration) - spent_str = f"{spent:.4f}" + spent_str = f'{spent:.4f}' num_steps = record.num_steps_so_far if num_steps is None: - num_steps = "-" + num_steps = '-' seconds_per_step = record.seconds_per_step if seconds_per_step is None: - seconds_per_step_str = "-" - cost_per_step_str = "-" + seconds_per_step_str = '-' + cost_per_step_str = '-' else: - seconds_per_step_str = f"{seconds_per_step:.4f}" + seconds_per_step_str = f'{seconds_per_step:.4f}' cost_per_step = num_nodes * resources.get_cost(seconds_per_step) - cost_per_step_str = f"{cost_per_step:.6f}" + cost_per_step_str = f'{cost_per_step:.6f}' total_time = record.estimated_total_seconds if total_time is None: - total_time_str = "-" - total_cost_str = "-" + total_time_str = '-' + total_cost_str = '-' else: - total_time_str = f"{total_time / 3600:.2f}" + total_time_str = f'{total_time / 3600:.2f}' total_cost = num_nodes * resources.get_cost(total_time) - total_cost_str = f"{total_cost:.2f}" + total_cost_str = f'{total_cost:.2f}' row += [ # DURATION @@ -5374,51 +4913,45 @@ def benchmark_show(benchmark: str) -> None: click.echo(cluster_table) finished = [ - row for row in rows if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value + row for row in rows + if row[2] == benchmark_state.BenchmarkStatus.FINISHED.value ] - if any(row[5] == "-" for row in finished): + if any(row[5] == '-' for row in finished): # No #STEPS. SkyCallback was unused. click.secho( - "SkyCallback logs are not found in this benchmark. " - "Consider using SkyCallback to get more detailed information " - "in real time.", - fg="yellow", - ) - elif any(row[6] != "-" and row[-1] == "-" for row in rows): + 'SkyCallback logs are not found in this benchmark. ' + 'Consider using SkyCallback to get more detailed information ' + 'in real time.', + fg='yellow') + elif any(row[6] != '-' and row[-1] == '-' for row in rows): # No EST($). total_steps is not specified and cannot be inferred. click.secho( - "Cannot estimate total time and cost because " - "the total number of steps cannot be inferred by SkyCallback. 
" - "To get the estimation, specify the total number of steps in " - "either `sky_callback.init` or `Sky*Callback`.", - fg="yellow", - ) + 'Cannot estimate total time and cost because ' + 'the total number of steps cannot be inferred by SkyCallback. ' + 'To get the estimation, specify the total number of steps in ' + 'either `sky_callback.init` or `Sky*Callback`.', + fg='yellow') -@bench.command("down", cls=_DocumentedCodeCommand) -@click.argument("benchmark", required=True, type=str) +@bench.command('down', cls=_DocumentedCodeCommand) +@click.argument('benchmark', required=True, type=str) @click.option( - "--exclude", - "-e", - "clusters_to_exclude", + '--exclude', + '-e', + 'clusters_to_exclude', required=False, type=str, multiple=True, - help=( - "Cluster name(s) to exclude from termination. " - "Typically, you might want to see the benchmark results in " - '`sky bench show` and exclude a "winner" cluster from termination ' - "to finish the running task." - ), -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + help=('Cluster name(s) to exclude from termination. ' + 'Typically, you might want to see the benchmark results in ' + '`sky bench show` and exclude a "winner" cluster from termination ' + 'to finish the running task.')) +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint def benchmark_down( benchmark: str, @@ -5428,7 +4961,7 @@ def benchmark_down( """Tear down all clusters belonging to a benchmark.""" record = benchmark_state.get_benchmark_from_name(benchmark) if record is None: - raise click.BadParameter(f"Benchmark {benchmark} does not exist.") + raise click.BadParameter(f'Benchmark {benchmark} does not exist.') clusters = benchmark_state.get_benchmark_clusters(benchmark) to_stop: List[str] = [] @@ -5439,71 +4972,66 @@ def benchmark_down( continue to_stop.append(cluster) - _down_or_stop_clusters(to_stop, apply_to_all=False, down=True, no_confirm=yes) - - -@bench.command("delete", cls=_DocumentedCodeCommand) -@click.argument("benchmarks", required=False, type=str, nargs=-1) -@click.option( - "--all", - "-a", - default=None, - is_flag=True, - help="Delete all benchmark reports from the history.", -) -@click.option( - "--yes", - "-y", - is_flag=True, - default=False, - required=False, - help="Skip confirmation prompt.", -) + _down_or_stop_clusters(to_stop, + apply_to_all=False, + down=True, + no_confirm=yes) + + +@bench.command('delete', cls=_DocumentedCodeCommand) +@click.argument('benchmarks', required=False, type=str, nargs=-1) +@click.option('--all', + '-a', + default=None, + is_flag=True, + help='Delete all benchmark reports from the history.') +@click.option('--yes', + '-y', + is_flag=True, + default=False, + required=False, + help='Skip confirmation prompt.') @usage_lib.entrypoint # pylint: disable=redefined-builtin -def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], yes: bool) -> None: +def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], + yes: bool) -> None: """Delete benchmark reports from the history.""" if not benchmarks and all is None: raise click.BadParameter( - "Either specify benchmarks or use --all to delete all benchmarks." 

-    _down_or_stop_clusters(to_stop, apply_to_all=False, down=True, no_confirm=yes)
-
-
-@bench.command("delete", cls=_DocumentedCodeCommand)
-@click.argument("benchmarks", required=False, type=str, nargs=-1)
-@click.option(
-    "--all",
-    "-a",
-    default=None,
-    is_flag=True,
-    help="Delete all benchmark reports from the history.",
-)
-@click.option(
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    required=False,
-    help="Skip confirmation prompt.",
-)
+    _down_or_stop_clusters(to_stop,
+                           apply_to_all=False,
+                           down=True,
+                           no_confirm=yes)
+
+
+@bench.command('delete', cls=_DocumentedCodeCommand)
+@click.argument('benchmarks', required=False, type=str, nargs=-1)
+@click.option('--all',
+              '-a',
+              default=None,
+              is_flag=True,
+              help='Delete all benchmark reports from the history.')
+@click.option('--yes',
+              '-y',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Skip confirmation prompt.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool], yes: bool) -> None:
+def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
+                     yes: bool) -> None:
     """Delete benchmark reports from the history."""
     if not benchmarks and all is None:
         raise click.BadParameter(
-            "Either specify benchmarks or use --all to delete all benchmarks."
-        )
+            'Either specify benchmarks or use --all to delete all benchmarks.')
     to_delete = []
     if len(benchmarks) > 0:
         for benchmark in benchmarks:
             record = benchmark_state.get_benchmark_from_name(benchmark)
             if record is None:
-                print(f"Benchmark {benchmark} not found.")
+                print(f'Benchmark {benchmark} not found.')
             else:
                 to_delete.append(record)
     if all:
         to_delete = benchmark_state.get_benchmarks()
         if len(benchmarks) > 0:
-            print(
-                "Both --all and benchmark(s) specified "
-                "for sky bench delete. Letting --all take effect."
-            )
+            print('Both --all and benchmark(s) specified '
+                  'for sky bench delete. Letting --all take effect.')

-    to_delete = [r["name"] for r in to_delete]
+    to_delete = [r['name'] for r in to_delete]
     if not to_delete:
         return

-    benchmark_list = ", ".join(to_delete)
-    plural = "s" if len(to_delete) > 1 else ""
+    benchmark_list = ', '.join(to_delete)
+    plural = 's' if len(to_delete) > 1 else ''
     if not yes:
         click.confirm(
-            f"Deleting the benchmark{plural}: {benchmark_list}. Proceed?",
+            f'Deleting the benchmark{plural}: {benchmark_list}. Proceed?',
             default=True,
             abort=True,
-            show_default=True,
-        )
+            show_default=True)

-    progress = rich_progress.Progress(
-        transient=True, redirect_stdout=False, redirect_stderr=False
-    )
+    progress = rich_progress.Progress(transient=True,
+                                      redirect_stdout=False,
+                                      redirect_stderr=False)
     task = progress.add_task(
-        f"[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ",
-        total=len(to_delete),
-    )
+        f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ',
+        total=len(to_delete))
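The transient progress bar set up above is stock `rich`; a self-contained sketch of the same pattern, with a sleep standing in for the per-benchmark deletion work:

import time

import rich.progress as rich_progress

names = ['bench-0', 'bench-1', 'bench-2']
progress = rich_progress.Progress(transient=True,
                                  redirect_stdout=False,
                                  redirect_stderr=False)
task = progress.add_task(f'[bold cyan]Deleting {len(names)} benchmarks: ',
                         total=len(names))
with progress:
    for name in names:
        time.sleep(0.1)  # stand-in for the actual deletion work
        progress.update(task, advance=1)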

     def _delete_benchmark(benchmark: str) -> None:
         clusters = benchmark_state.get_benchmark_clusters(benchmark)
@@ -5514,27 +5042,25 @@ def _delete_benchmark(benchmark: str) -> None:
         num_clusters = len([r for r in records if r is not None])

         if num_clusters > 0:
-            plural = "s" if num_clusters > 1 else ""
-            message = (
-                f"{colorama.Fore.YELLOW}Benchmark {benchmark} "
-                f"has {num_clusters} un-terminated cluster{plural}. "
-                f"Terminate the cluster{plural} with "
-                f"{backend_utils.BOLD} sky bench down {benchmark} "
-                f"{backend_utils.RESET_BOLD} "
-                "before deleting the benchmark report."
-            )
+            plural = 's' if num_clusters > 1 else ''
+            message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
+                       f'has {num_clusters} un-terminated cluster{plural}. '
+                       f'Terminate the cluster{plural} with '
+                       f'{backend_utils.BOLD} sky bench down {benchmark} '
+                       f'{backend_utils.RESET_BOLD} '
+                       'before deleting the benchmark report.')
             success = False
         else:
-            bucket_name = benchmark_state.get_benchmark_from_name(benchmark)["bucket"]
+            bucket_name = benchmark_state.get_benchmark_from_name(
+                benchmark)['bucket']
             handle = global_user_state.get_handle_from_storage_name(bucket_name)
             assert handle is not None, bucket_name
             bucket_type = list(handle.sky_stores.keys())[0]
-            benchmark_utils.remove_benchmark_logs(benchmark, bucket_name, bucket_type)
+            benchmark_utils.remove_benchmark_logs(benchmark, bucket_name,
+                                                  bucket_type)
             benchmark_state.delete_benchmark(benchmark)
-            message = (
-                f"{colorama.Fore.GREEN}Benchmark report for "
-                f"{benchmark} deleted.{colorama.Style.RESET_ALL}"
-            )
+            message = (f'{colorama.Fore.GREEN}Benchmark report for '
+                       f'{benchmark} deleted.{colorama.Style.RESET_ALL}')
             success = True

         progress.stop()
@@ -5555,13 +5081,12 @@ def local():
     pass


-@click.option(
-    "--gpus/--no-gpus",
-    default=True,
-    is_flag=True,
-    help="Launch cluster without GPU support even " "if GPUs are detected on the host.",
-)
-@local.command("up", cls=_DocumentedCodeCommand)
+@click.option('--gpus/--no-gpus',
+              default=True,
+              is_flag=True,
+              help='Whether to launch the cluster with GPU support; use '
+              '--no-gpus to launch without GPU support even if GPUs are '
+              'detected on the host.')
+@local.command('up', cls=_DocumentedCodeCommand)
 @usage_lib.entrypoint
 def local_up(gpus: bool):
     """Creates a local cluster."""
@@ -5572,39 +5097,36 @@ def local_up(gpus: bool):
     gpus = gpus and local_gpus_available

     # Check if ~/.kube/config exists:
-    if os.path.exists(os.path.expanduser("~/.kube/config")):
+    if os.path.exists(os.path.expanduser('~/.kube/config')):
         curr_context = kubernetes_utils.get_current_kube_config_context_name()
-        skypilot_context = "kind-skypilot"
+        skypilot_context = 'kind-skypilot'
         if curr_context is not None and curr_context != skypilot_context:
             click.echo(
-                f"Current context in kube config: {curr_context}"
-                "\nWill automatically switch to kind-skypilot after the local "
-                "cluster is created."
-            )
-    message_str = "Creating local cluster{}..."
-    message_str = message_str.format(
-        (" with GPU support (this may take up " "to 15 minutes)") if gpus else ""
-    )
+                f'Current context in kube config: {curr_context}'
+                '\nWill automatically switch to kind-skypilot after the local '
+                'cluster is created.')
+    message_str = 'Creating local cluster{}...'
+    message_str = message_str.format((' with GPU support (this may take up '
+                                      'to 15 minutes)') if gpus else '')
     path_to_package = os.path.dirname(os.path.dirname(__file__))
-    up_script_path = os.path.join(
-        path_to_package, "sky/utils/kubernetes", "create_cluster.sh"
-    )
+    up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
+                                  'create_cluster.sh')

     # Get directory of script and run it from there
     cwd = os.path.dirname(os.path.abspath(up_script_path))
-    run_command = up_script_path + " --gpus" if gpus else up_script_path
+    run_command = up_script_path + ' --gpus' if gpus else up_script_path
     run_command = shlex.split(run_command)

     # Setup logging paths
     run_timestamp = backend_utils.get_run_timestamp()
-    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, "local_up.log")
-    tail_cmd = "tail -n100 -f " + log_path
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_up.log')
+    tail_cmd = 'tail -n100 -f ' + log_path

     click.echo(message_str)
     style = colorama.Style
-    click.echo(
-        "To view detailed progress: " f"{style.BRIGHT}{tail_cmd}{style.RESET_ALL}"
-    )
+    click.echo('To view detailed progress: '
+               f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')

     returncode, _, stderr = log_lib.run_with_log(
         cmd=run_command,
@@ -5612,155 +5134,138 @@ def local_up(gpus: bool):
         require_outputs=True,
         stream_logs=False,
         line_processor=log_utils.SkyLocalUpLineProcessor(),
-        cwd=cwd,
-    )
+        cwd=cwd)
     # Kind always writes to stderr even if it succeeds.
     # If the failure happens after the cluster is created, we need
     # to strip all stderr of "No kind clusters found.", which is
     # printed when querying with kind get clusters.
-    stderr = stderr.replace("No kind clusters found.\n", "")
+    stderr = stderr.replace('No kind clusters found.\n', '')

     if returncode == 0:
         cluster_created = True
     elif returncode == 100:
-        click.echo(
-            f"{colorama.Fore.GREEN}Local cluster already "
-            f"exists.{style.RESET_ALL}\n"
-            "If you want to delete it instead, run: sky local down"
-        )
+        click.echo(f'{colorama.Fore.GREEN}Local cluster already '
+                   f'exists.{style.RESET_ALL}\n'
+                   'If you want to delete it instead, run: sky local down')
     else:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
-                "Failed to create local cluster. "
-                f"Full log: {log_path}"
-                f"\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}"
-            )
+                'Failed to create local cluster. '
+                f'Full log: {log_path}'
+                f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
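`create_cluster.sh` signals three outcomes through its exit status, as handled above: 0 means the kind cluster was created, 100 means it already exists, and anything else surfaces as an error. Distilled:

def interpret_create_cluster(returncode: int) -> str:
    if returncode == 0:
        return 'created'
    if returncode == 100:
        return 'already-exists'
    raise RuntimeError(f'create_cluster.sh failed with code {returncode}')


assert interpret_create_cluster(0) == 'created'
assert interpret_create_cluster(100) == 'already-exists'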

     # Run sky check
-    with rich_utils.safe_status("[bold cyan]Running sky check..."):
-        sky_check.check(clouds=["kubernetes"], quiet=True)
+    with rich_utils.safe_status('[bold cyan]Running sky check...'):
+        sky_check.check(clouds=['kubernetes'], quiet=True)

     if cluster_created:
         # Prepare completion message which shows CPU and GPU count
         # Get number of CPUs
         p = subprocess_utils.run(
-            "kubectl get nodes -o jsonpath='{.items[0].status.capacity.cpu}'",
-            capture_output=True,
-        )
-        num_cpus = int(p.stdout.decode("utf-8"))
+            'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
+            capture_output=True)
+        num_cpus = int(p.stdout.decode('utf-8'))

         # GPU count/type parsing
-        gpu_message = ""
-        gpu_hint = ""
+        gpu_message = ''
+        gpu_hint = ''
         if gpus:
             # Get GPU model by querying the node labels
-            label_name_escaped = "skypilot.co/accelerator".replace(".", "\\.")
-            gpu_type_cmd = f"kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels['{label_name_escaped}']}}\""  # pylint: disable=line-too-long
+            label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
+            gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
             try:
                 # Run the command and capture the output
-                gpu_count_output = subprocess.check_output(
-                    gpu_type_cmd, shell=True, text=True
-                )
-                gpu_type_str = gpu_count_output.strip() + " "
+                gpu_count_output = subprocess.check_output(gpu_type_cmd,
+                                                           shell=True,
+                                                           text=True)
+                gpu_type_str = gpu_count_output.strip() + ' '
             except subprocess.CalledProcessError as e:
-                output = str(e.output.decode("utf-8"))
-                logger.warning(f"Failed to get GPU type: {output}")
-                gpu_type_str = ""
+                output = str(e.output.decode('utf-8'))
+                logger.warning(f'Failed to get GPU type: {output}')
+                gpu_type_str = ''

             # Get number of GPUs (sum of nvidia.com/gpu resources)
-            gpu_count_command = "kubectl get nodes -o=jsonpath='{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}' | awk '{sum += $1} END {print sum}'"  # pylint: disable=line-too-long
+            gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\''  # pylint: disable=line-too-long
             try:
                 # Run the command and capture the output
-                gpu_count_output = subprocess.check_output(
-                    gpu_count_command, shell=True, text=True
-                )
-                gpu_count = gpu_count_output.strip()  # Remove any extra whitespace
-                gpu_message = f" and {gpu_count} {gpu_type_str}GPUs"
+                gpu_count_output = subprocess.check_output(gpu_count_command,
+                                                           shell=True,
+                                                           text=True)
+                gpu_count = gpu_count_output.strip(
+                )  # Remove any extra whitespace
+                gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
             except subprocess.CalledProcessError as e:
-                output = str(e.output.decode("utf-8"))
-                logger.warning(f"Failed to get GPU count: {output}")
-                gpu_message = f" with {gpu_type_str}GPU support"
+                output = str(e.output.decode('utf-8'))
+                logger.warning(f'Failed to get GPU count: {output}')
+                gpu_message = f' with {gpu_type_str}GPU support'

         gpu_hint = (
-            (
-                "\nHint: To see the list of GPUs in the cluster, "
-                "run 'sky show-gpus --cloud kubernetes'"
-            )
-            if gpus
-            else ""
-        )
+            '\nHint: To see the list of GPUs in the cluster, '
+            'run \'sky show-gpus --cloud kubernetes\'') if gpus else ''
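The GPU count above shells out to a jsonpath-plus-awk pipeline; the same sum can be taken directly in Python (assuming `kubectl` is on PATH and a cluster is reachable):

import subprocess

# Emit one allocatable nvidia.com/gpu value per node, then sum in Python
# instead of awk; nodes without GPUs produce empty lines and are skipped.
cmd = ("kubectl get nodes -o=jsonpath='{range .items[*]}"
       "{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}'")
out = subprocess.check_output(cmd, shell=True, text=True)
gpu_count = sum(int(line) for line in out.splitlines() if line.strip())
print(f'{gpu_count} allocatable GPUs')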
" - "This may cause issues with running tasks." - ) + click.echo('Warning: Local cluster has less than 2 CPUs. ' + 'This may cause issues with running tasks.') click.echo( - f"\n{colorama.Fore.GREEN}Local Kubernetes cluster created " - "successfully with " - f"{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can " - "now run tasks locally." - "\nHint: To change the number of CPUs, change your docker " - "runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info." # pylint: disable=line-too-long - f"{gpu_hint}" - ) + f'\n{colorama.Fore.GREEN}Local Kubernetes cluster created ' + 'successfully with ' + f'{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can ' + 'now run tasks locally.' + '\nHint: To change the number of CPUs, change your docker ' + 'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long + f'{gpu_hint}') -@local.command("down", cls=_DocumentedCodeCommand) +@local.command('down', cls=_DocumentedCodeCommand) @usage_lib.entrypoint def local_down(): """Deletes a local cluster.""" cluster_removed = False path_to_package = os.path.dirname(os.path.dirname(__file__)) - down_script_path = os.path.join( - path_to_package, "sky/utils/kubernetes", "delete_cluster.sh" - ) + down_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes', + 'delete_cluster.sh') cwd = os.path.dirname(os.path.abspath(down_script_path)) run_command = shlex.split(down_script_path) # Setup logging paths run_timestamp = backend_utils.get_run_timestamp() - log_path = os.path.join( - constants.SKY_LOGS_DIRECTORY, run_timestamp, "local_down.log" - ) - tail_cmd = "tail -n100 -f " + log_path + log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp, + 'local_down.log') + tail_cmd = 'tail -n100 -f ' + log_path - with rich_utils.safe_status("[bold cyan]Removing local cluster..."): + with rich_utils.safe_status('[bold cyan]Removing local cluster...'): style = colorama.Style - click.echo( - "To view detailed progress: " f"{style.BRIGHT}{tail_cmd}{style.RESET_ALL}" - ) - returncode, stdout, stderr = log_lib.run_with_log( - cmd=run_command, - log_path=log_path, - require_outputs=True, - stream_logs=False, - cwd=cwd, - ) - stderr = stderr.replace("No kind clusters found.\n", "") + click.echo('To view detailed progress: ' + f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}') + returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command, + log_path=log_path, + require_outputs=True, + stream_logs=False, + cwd=cwd) + stderr = stderr.replace('No kind clusters found.\n', '') if returncode == 0: cluster_removed = True elif returncode == 100: - click.echo("\nLocal cluster does not exist.") + click.echo('\nLocal cluster does not exist.') else: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - "Failed to create local cluster. " - f"Stdout: {stdout}" - f"\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}" - ) + 'Failed to create local cluster. 

     if cluster_removed:
         # Run sky check
-        with rich_utils.safe_status("[bold cyan]Running sky check..."):
-            sky_check.check(clouds=["kubernetes"], quiet=True)
-        click.echo(f"{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}")
+        with rich_utils.safe_status('[bold cyan]Running sky check...'):
+            sky_check.check(clouds=['kubernetes'], quiet=True)
+        click.echo(
+            f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}')


 def main():
     return cli()


-if __name__ == "__main__":
+if __name__ == '__main__':
     main()