diff --git a/docs/source/examples/auto-failover.rst b/docs/source/examples/auto-failover.rst
index 9ee47b91643..e612efcea08 100644
--- a/docs/source/examples/auto-failover.rst
+++ b/docs/source/examples/auto-failover.rst
@@ -16,11 +16,9 @@ searching for regions (or clouds) that can provide the requested resources.
 
 .. tip::
 
-   No action is required to use this feature.
-
-   Auto-failover is automatically enabled whenever a new cluster is to be
-   provisioned, such as during :code:`sky launch` or the :ref:`interactive node
-   commands <interactive-nodes>` :code:`sky {gpunode,cpunode,tpunode}`.
+   No action is required to use this feature. Auto-failover is automatically
+   enabled whenever a new cluster is to be provisioned, such as during
+   :code:`sky launch`.
 
 If specific :code:`cloud`, ``region``, or ``zone`` are requested for a task,
 auto-failover retries only within the specified location.
@@ -36,16 +34,8 @@ provisioner handles such a request:
 
 .. code-block::
 
-   $ sky gpunode -c gpu --gpus V100
-   I 02-11 21:17:43 optimizer.py:211] Defaulting estimated time to 1 hr. Call Task.set_time_estimator() to override.
-   I 02-11 21:17:43 optimizer.py:317] Optimizer - plan minimizing cost (~$3.0):
-   I 02-11 21:17:43 optimizer.py:332]
-   I 02-11 21:17:43 optimizer.py:332] TASK     BEST_RESOURCE
-   I 02-11 21:17:43 optimizer.py:332] gpunode  GCP(n1-highmem-8, {'V100': 1.0})
-   I 02-11 21:17:43 optimizer.py:332]
-   I 02-11 21:17:43 optimizer.py:285] Considered resources -> cost
-   I 02-11 21:17:43 optimizer.py:286] {AWS(p3.2xlarge): 3.06, GCP(n1-highmem-8, {'V100': 1.0}): 2.953212}
-   I 02-11 21:17:43 optimizer.py:286]
+   $ sky launch -c gpu --gpus V100
+   ...  # optimizer output
    I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Creating a new cluster: "gpu" [1x GCP(n1-highmem-8, {'V100': 1.0})].
    I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters.
    I 02-11 21:17:43 cloud_vm_ray_backend.py:614] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-11-21-17-43-171661/provision.log
@@ -78,17 +68,9 @@ AWS, where it succeeded after two regions:
 
 .. code-block::
 
-   $ sky gpunode --gpus V100:8
-   I 02-23 16:39:59 optimizer.py:213] Defaulting estimated time to 1 hr. Call Task.set_time_estimator() to override.
-   I 02-23 16:39:59 optimizer.py:323] Optimizer - plan minimizing cost (~$20.3):
-   I 02-23 16:39:59 optimizer.py:337]
-   I 02-23 16:39:59 optimizer.py:337] TASK     BEST_RESOURCE
-   I 02-23 16:39:59 optimizer.py:337] gpunode  GCP(n1-highmem-8, {'V100': 8.0})
-   I 02-23 16:39:59 optimizer.py:337]
-   I 02-23 16:39:59 optimizer.py:290] Considered resources -> cost
-   I 02-23 16:39:59 optimizer.py:292] {GCP(n1-highmem-8, {'V100': 8.0}): 20.313212, AWS(p3.16xlarge): 24.48}
-   I 02-23 16:39:59 optimizer.py:292]
-   I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Creating a new cluster: "sky-gpunode-zongheng" [1x GCP(n1-highmem-8, {'V100': 8.0})].
+   $ sky launch -c v100-8 --gpus V100:8
+   ...  # optimizer output
+   I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Creating a new cluster: "v100-8" [1x GCP(n1-highmem-8, {'V100': 8.0})].
    I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters.
    I 02-23 16:39:59 cloud_vm_ray_backend.py:658] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log
    I 02-23 16:39:59 cloud_vm_ray_backend.py:668]
@@ -112,14 +94,7 @@
    E 02-23 16:41:50 cloud_vm_ray_backend.py:746] Failed to acquire resources in all regions/zones (requested GCP(n1-highmem-8, {'V100': 8.0})). Try changing resource requirements or use another cloud.
    W 02-23 16:41:50 cloud_vm_ray_backend.py:891]
    W 02-23 16:41:50 cloud_vm_ray_backend.py:891] Provision failed for GCP(n1-highmem-8, {'V100': 8.0}). Trying other launchable resources (if any)...
-   I 02-23 16:41:50 optimizer.py:213] Defaulting estimated time to 1 hr. Call Task.set_time_estimator() to override.
-   I 02-23 16:41:50 optimizer.py:323] Optimizer - plan minimizing cost (~$24.5):
-   I 02-23 16:41:50 optimizer.py:337]
-   I 02-23 16:41:50 optimizer.py:337] TASK     BEST_RESOURCE
-   I 02-23 16:41:50 optimizer.py:337] gpunode  AWS(p3.16xlarge)
-   I 02-23 16:41:50 optimizer.py:337]
-   I 02-23 16:41:50 cloud_vm_ray_backend.py:658] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log
-   I 02-23 16:41:50 cloud_vm_ray_backend.py:668]
+   ...
    I 02-23 16:41:50 cloud_vm_ray_backend.py:668] Launching on AWS us-east-1 (us-east-1a,us-east-1b,us-east-1c,us-east-1d,us-east-1e,us-east-1f)
    W 02-23 16:42:15 cloud_vm_ray_backend.py:477] Got error(s) in all zones of us-east-1:
    W 02-23 16:42:15 cloud_vm_ray_backend.py:479] create_instances: Attempt failed with An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 0): We currently do not have sufficient p3.16xlarge capacity in the Availability Zone you requested (us-east-1a). Our system will be working on provisioning additional capacity. You can currently get p3.16xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-1b, us-east-1d, us-east-1f., retrying.
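The retry scoping described above (failover stays within any requested location) can be exercised directly from the CLI. A minimal sketch using the ``--cloud``/``--region`` flags of ``sky launch``; the cluster name and region here are illustrative:

.. code-block:: console

   $ # Failover may search all clouds and regions for a V100 machine:
   $ sky launch -c gpu --gpus V100

   $ # Restrict failover to GCP; regions within GCP are still retried:
   $ sky launch -c gpu --gpus V100 --cloud gcp

   $ # Pin a region; only zones inside it are then retried:
   $ sky launch -c gpu --gpus V100 --cloud gcp --region us-central1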
diff --git a/docs/source/examples/gpu-jupyter.rst b/docs/source/examples/gpu-jupyter.rst
index 014d529ff13..7f4c593e02f 100644
--- a/docs/source/examples/gpu-jupyter.rst
+++ b/docs/source/examples/gpu-jupyter.rst
@@ -5,22 +5,20 @@ Jupyter notebooks are a useful tool for interactive development, debugging, and
 visualization. SkyPilot makes the process of running a GPU-backed Jupyter
 notebook simple by automatically managing provisioning and port forwarding.
 
-To get a machine with a GPU attached, we recommend using an interactive **GPU node**.
-You can read more about interactive nodes :ref:`here <interactive-nodes>`.
+To get a machine with a GPU attached, use:
 
 .. code-block:: bash
 
    # Launch a VM with 1 NVIDIA GPU and forward port 8888 to localhost
-   sky gpunode -p 8888 -c jupyter-vm --gpus K80:1
+   sky launch -c jupyter-vm --gpus K80:1
+   ssh -L 8888:localhost:8888 jupyter-vm
 
 .. note::
 
    View the supported GPUs with the :code:`sky show-gpus` command.
 
-The above command will automatically log in to the cluster once the cluster is provisioned (or re-use an existing one).
-
-Inside the VM, you can run the following commands to start a Jupyter session:
+Use ``ssh jupyter-vm`` to SSH into the VM. Inside the VM, you can run the
+following commands to start a Jupyter session:
 
 .. code-block:: bash
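The hunk above cuts off before the notebook commands themselves. For context, a typical session inside the VM looks like the following; this is a sketch, not the file's verbatim contents, and package setup may differ:

.. code-block:: console

   $ pip install jupyter
   $ jupyter notebook --port 8888
   $ # Open http://localhost:8888 locally; the port is reachable through the
   $ # `ssh -L 8888:localhost:8888 jupyter-vm` tunnel set up earlier.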
diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst
index 38f1b1ab129..b9720f4b383 100644
--- a/docs/source/getting-started/quickstart.rst
+++ b/docs/source/getting-started/quickstart.rst
@@ -123,7 +123,7 @@ This may show multiple clusters, if you have created several:
 
 .. code-block::
 
    NAME       LAUNCHED    RESOURCES             COMMAND                            STATUS
-   gcp        1 day ago   1x GCP(n1-highmem-8)  sky cpunode -c gcp --cloud gcp     STOPPED
+   mygcp      1 day ago   1x GCP(n1-highmem-8)  sky launch -c mygcp --cloud gcp    STOPPED
    mycluster  4 mins ago  1x AWS(p3.2xlarge)    sky exec mycluster hello_sky.yaml  UP
 
@@ -152,6 +152,9 @@ Simply run :code:`ssh <cluster_name>` to log into a cluster:
 
 The above are achieved by adding appropriate entries to ``~/.ssh/config``.
 
+Because SkyPilot exposes SSH access to clusters, they can be easily used inside
+tools such as `Visual Studio Code Remote <https://code.visualstudio.com/docs/remote/remote-overview>`_.
+
 Transfer files
 ===============
 
@@ -178,6 +181,16 @@ To terminate a cluster instead, run :code:`sky down`:
 
    $ sky down mycluster
 
+.. note::
+
+   Stopping a cluster does not lose data on the attached disks (billing for the
+   instances will stop, while the disks will still be charged). Those disks
+   will be reattached when restarting the cluster.
+
+   Terminating a cluster will delete all associated resources (all billing
+   stops), and any data on the attached disks will be lost. Terminated
+   clusters cannot be restarted.
+
 Find more commands that manage the lifecycle of clusters in the :ref:`CLI reference `.
 
 Scaling out
 ===========
@@ -186,7 +199,7 @@ So far, we have used SkyPilot's CLI to submit work to and interact with a single
 cluster. When you are ready to scale out (e.g., run 10s or 100s of jobs),
 SkyPilot supports two options:
 
-- Queue jobs on one or more clusters with ``sky exec`` (see :ref:`Job Queue `); or
+- Queue many jobs on your cluster(s) with ``sky exec`` (see :ref:`Job Queue `);
 - Use :ref:`Managed Spot Jobs ` to run on auto-managed spot instances (users need not interact with the underlying clusters)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e6e48e8a0eb..a4cfcdb6319 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -132,7 +132,6 @@ Documentation
    examples/docker-containers
    examples/ports
    reference/tpu
-   reference/interactive-nodes
    reference/logging
    reference/faq
diff --git a/docs/source/reference/cli.rst b/docs/source/reference/cli.rst
index 3533e3c9524..2f399643c41 100644
--- a/docs/source/reference/cli.rst
+++ b/docs/source/reference/cli.rst
@@ -69,23 +69,6 @@ Managed Spot Jobs CLI
    :prog: sky spot logs
    :nested: full
 
-Interactive Node CLI
------------------------
-
-.. click:: sky.cli:cpunode
-   :prog: sky cpunode
-   :nested: full
-
-.. _sky-gpunode:
-.. click:: sky.cli:gpunode
-   :prog: sky gpunode
-   :nested: full
-
-.. click:: sky.cli:tpunode
-   :prog: sky tpunode
-   :nested: full
-
-
 Storage CLI
 ------------
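The stop/terminate note added to the quickstart above maps onto three lifecycle commands, all of which appear elsewhere in this diff. A usage sketch with the quickstart's ``mycluster``:

.. code-block:: console

   $ sky stop mycluster    # Instance billing stops; attached disks persist (and are still billed).
   $ sky start mycluster   # Restart later; data on the disks is intact.
   $ sky down mycluster    # Terminate: all billing stops, and disk data is lost.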
diff --git a/docs/source/reference/interactive-nodes.rst b/docs/source/reference/interactive-nodes.rst
deleted file mode 100644
index f92347abba5..00000000000
--- a/docs/source/reference/interactive-nodes.rst
+++ /dev/null
@@ -1,128 +0,0 @@
-.. _interactive-nodes:
-
-Interactive Nodes
-=================
-
-SkyPilot provides **interactive nodes**, the user's *personal work servers* in the
-clouds. These are single-node VMs that can be quickly accessed by convenient
-CLI commands:
-
-- :code:`sky gpunode`
-- :code:`sky cpunode`
-- :code:`sky tpunode`
-
-Interactive nodes are normal SkyPilot clusters. They allow fast access to instances
-without requiring a task YAML specification.
-
-Workflow
---------
-
-Use :code:`sky gpunode` to get a node with GPU(s):
-
-.. code-block:: console
-
-   $ # Create and log in to a cluster with the
-   $ # default name, "sky-gpunode-<username>".
-   $ sky gpunode
-
-   $ # Or, use -c to set a custom name to manage multiple clusters:
-   $ # sky gpunode -c node0
-
-Use :code:`--gpus` to change the type and the number of GPUs:
-
-.. code-block:: console
-
-   $ sky gpunode  # By default, use 1 K80 GPU.
-   $ sky gpunode --gpus V100
-   $ sky gpunode --gpus V100:8
-
-   $ # To see available GPU names:
-   $ # sky show-gpus
-
-Directly set a cloud and an instance type, if required:
-
-.. code-block:: console
-
-   $ sky gpunode --cloud aws --instance-type p2.16xlarge
-
-See all available options and short keys:
-
-.. code-block:: console
-
-   $ sky gpunode --help
-
-SkyPilot also provides :code:`sky cpunode` for CPU-only instances and :code:`sky
-tpunode` for TPU instances (only available on Google Cloud Platform).
-
-To log in to an interactive node, either re-type the CLI command or use :code:`ssh`:
-
-.. code-block:: console
-
-   $ # If the cluster with the default name exists, this will directly log in.
-   $ sky gpunode
-
-   $ # Equivalently:
-   $ ssh sky-gpunode-<username>
-
-   $ # Use -c to refer to different interactive nodes.
-   $ # sky gpunode -c node0
-   $ # ssh node0
-
-Because SkyPilot exposes SSH access to clusters, this means clusters can be easily added into
-tools such as `Visual Studio Code Remote <https://code.visualstudio.com/docs/remote/remote-overview>`_.
-
-Since interactive nodes are just normal SkyPilot clusters, :code:`sky exec` can be used to submit jobs to them.
-
-Interactive nodes can be stopped, restarted, and terminated, like any other cluster:
-
-.. code-block:: console
-
-   $ # Stop at the end of the work day:
-   $ sky stop sky-gpunode-<username>
-
-   $ # Restart it the next morning:
-   $ sky start sky-gpunode-<username>
-
-   $ # Terminate entirely:
-   $ sky down sky-gpunode-<username>
-
-.. note::
-
-   Stopping a cluster does not lose data on the attached disks (billing for the
-   instances will stop while the disks will still be charged). Those disks
-   will be reattached when restarting the cluster.
-
-   Terminating a cluster will delete all associated resources (all billing
-   stops), and any data on the attached disks will be lost. Terminated
-   clusters cannot be restarted.
-
-.. note::
-
-   Since :code:`sky start` restarts a stopped cluster, :ref:`auto-failover
-   provisioning ` is disabled---the cluster will be restarted on
-   the same cloud and region where it was originally provisioned.
-
-
-Getting multiple nodes
-----------------------
-By default, interactive clusters are a single node. If you require a cluster
-with multiple nodes, use ``sky launch`` directly:
-
-.. code-block:: console
-
-   $ sky launch -c my-cluster --num-nodes 16 --gpus V100:8
-
-The same can be achieved with a YAML spec:
-
-.. code-block:: yaml
-
-   # multi_node.yaml
-   num_nodes: 16
-   resources:
-     accelerators: V100:8
-
-.. code-block:: console
-
-   $ sky launch -c my-cluster multi_node.yaml
-
-You can then :ref:`SSH into any node ` of the cluster by name.
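For readers of the deleted page, the old interactive-node commands translate to plain ``sky launch`` calls, per the deprecation hints added to ``sky/cli.py`` later in this diff; the cluster names below are illustrative:

.. code-block:: console

   $ # sky cpunode  ->  a CPU-only cluster:
   $ sky launch -c cpu-vm

   $ # sky gpunode --gpus V100:8  ->
   $ sky launch -c v100-8 --gpus V100:8

   $ # sky tpunode  ->  a TPU-backed cluster:
   $ sky launch -c tpu-vm --gpus tpu-v2-8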
diff --git a/docs/source/reference/tpu.rst b/docs/source/reference/tpu.rst
index dc56ff58a41..d723747c62c 100644
--- a/docs/source/reference/tpu.rst
+++ b/docs/source/reference/tpu.rst
@@ -16,15 +16,17 @@ ML researchers and students are encouraged to apply for free TPU access through
 
 Getting TPUs in one command
 ===========================
 
-Like :ref:`GPUs `, SkyPilot provides a simple command to quickly get TPUs for development:
+Use one command to quickly get TPU nodes for development:
 
 .. code-block:: bash
 
-   sky tpunode                                # By default TPU v2-8 is used
-   sky tpunode --use-spot                     # Preemptible TPUs
-   sky tpunode --tpus tpu-v3-8                # Change TPU type to tpu-v3-8
-   sky tpunode --instance-type n1-highmem-16  # Change the host VM type to n1-highmem-16
-   sky tpunode --tpu-vm                       # Use TPU VM (instead of TPU Node)
+   sky launch --gpus tpu-v2-8
+   # Preemptible TPUs:
+   sky launch --gpus tpu-v2-8 --use-spot
+   # Change TPU type to tpu-v3-8:
+   sky launch --gpus tpu-v3-8
+   # Change the host VM type to n1-highmem-16:
+   sky launch --gpus tpu-v3-8 -t n1-highmem-16
 
 After the command finishes, you will be dropped into a TPU host VM and can start developing code right away.
@@ -48,7 +50,7 @@ More details can be found on GCP `documentation `_
 
-   accelerators: ...  # Pods have > 8 cores (the last number)
+   accelerators: tpu-v2-32  # Pods have > 8 cores (the last number)
    accelerator_args:
      runtime_version: tpu-vm-base
    tpu_vm: True
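To use the pod snippet above, save it in a task YAML and pass it to ``sky launch``; the file and cluster names below are illustrative:

.. code-block:: console

   $ # tpu_pod.yaml holds the resources block shown in the hunk above.
   $ sky launch -c tpu-pod tpu_pod.yaml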
diff --git a/sky/cli.py b/sky/cli.py
index 1c40099ea90..fe69fe29587 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -1205,6 +1205,18 @@ def _add_command_alias_to_group(group, command, name, hidden):
     group.add_command(new_command, name=name)
 
 
+def _deprecate_and_hide_command(group, command_to_deprecate,
+                                alternative_command):
+    """Hide a command and show a deprecation note, hinting the alternative."""
+    command_to_deprecate.hidden = True
+    if group is not None:
+        orig = f'sky {group.name} {command_to_deprecate.name}'
+    else:
+        orig = f'sky {command_to_deprecate.name}'
+    command_to_deprecate.invoke = _with_deprecation_warning(
+        command_to_deprecate.invoke, alternative_command, orig)
+
+
 @click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS)
 @click.option('--install-shell-completion',
               type=click.Choice(['bash', 'zsh', 'fish', 'auto']),
@@ -1386,7 +1398,7 @@ def launch(
     no_setup: bool,
     clone_disk_from: Optional[str],
 ):
-    """Launch a task from a YAML or a command (rerun setup if cluster exists).
+    """Launch a cluster or task.
 
     If ENTRYPOINT points to a valid YAML file, it is read in as the task
     specification. Otherwise, it is interpreted as a bash command.
@@ -1505,7 +1517,7 @@ def exec(
     env: List[Tuple[str, str]],
 ):
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Execute a task or a command on a cluster (skip setup).
+    """Execute a task or command on an existing cluster.
 
     If ENTRYPOINT points to a valid YAML file, it is read in as the task
     specification. Otherwise, it is interpreted as a bash command.
@@ -3669,7 +3681,9 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
     subprocess_utils.run_in_parallel(sky.storage_delete, names)
 
 
-@cli.group(cls=_NaturalOrderGroup)
+# TODO(skypilot): remove all code related to the deprecated `sky admin` code
+# path.
+@cli.group(cls=_NaturalOrderGroup, hidden=True)
 def admin():
     """SkyPilot On-prem administrator CLI."""
     pass
@@ -3745,9 +3759,15 @@ def admin_deploy(clusterspec_yaml: str):
              fg='green')
 
 
+@cli.group(cls=_NaturalOrderGroup)
+def bench():
+    """SkyPilot Benchmark CLI."""
+    pass
+
+
 @cli.group(cls=_NaturalOrderGroup)
 def spot():
-    """Managed Spot commands (spot instances with auto-recovery)."""
+    """Managed Spot CLI (spot instances with auto-recovery)."""
     pass
@@ -4022,9 +4042,6 @@ def spot_queue(all: bool, refresh: bool, skip_finished: bool):
                f'{in_progress_only_hint}\n{msg}')
 
 
-_add_command_alias_to_group(spot, spot_queue, 'status', hidden=True)
-
-
 @spot.command('cancel', cls=_DocumentedCodeCommand)
 @click.option('--name',
               '-n',
@@ -4187,7 +4204,7 @@ def spot_dashboard(port: Optional[int]):
 
 @cli.group(cls=_NaturalOrderGroup)
 def serve():
-    """SkyServe commands CLI."""
+    """SkyServe CLI (multi-region, multi-cloud serving)."""
     pass
@@ -4590,12 +4607,6 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
         return candidates
 
 
-@cli.group(cls=_NaturalOrderGroup)
-def bench():
-    """SkyPilot Benchmark CLI."""
-    pass
-
-
 @bench.command('launch', cls=_DocumentedCodeCommand)
 @click.argument('entrypoint',
                 required=True,
@@ -5261,6 +5272,19 @@ def local_down():
     click.echo('Local cluster removed.')
 
 
+# TODO(skypilot): remove the below in v0.5.
+_add_command_alias_to_group(spot, spot_queue, 'status', hidden=True)
+_deprecate_and_hide_command(group=None,
+                            command_to_deprecate=cpunode,
+                            alternative_command='sky launch')
+_deprecate_and_hide_command(group=None,
+                            command_to_deprecate=gpunode,
+                            alternative_command='sky launch --gpus ')
+_deprecate_and_hide_command(group=None,
+                            command_to_deprecate=tpunode,
+                            alternative_command='sky launch --gpus ')
+
+
 def main():
     return cli()
diff --git a/sky/execution.py b/sky/execution.py
index 2612b634d8d..affe3c1e90d 100644
--- a/sky/execution.py
+++ b/sky/execution.py
@@ -410,7 +410,7 @@ def launch(
     _is_launched_by_sky_serve_controller: bool = False,
 ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Launch a task.
+    """Launch a cluster or task.
 
     The task's setup and run commands are executed under the task's workdir
     (when specified, it is synced to remote cluster). The task undergoes job
diff --git a/sky/task.py b/sky/task.py
index 8d45679c707..b2af3b1ae24 100644
--- a/sky/task.py
+++ b/sky/task.py
@@ -188,7 +188,7 @@ def __init__(
 
         Optionally, call ``Task.set_resources()`` to set the resource
         requirements for this task. If not set, a default CPU-only requirement
-        is assumed (the same as ``sky cpunode``).
+        is assumed (the same as ``sky launch``).
 
         All setters of this class, ``Task.set_*()``, return ``self``, i.e., they
         are fluent APIs and can be chained together.
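The revised ``launch``/``exec`` docstrings divide the CLI's labor: ``sky launch`` provisions a cluster if needed and (re)runs setup, while ``sky exec`` assumes an existing cluster and skips setup. A usage sketch (the task file name is illustrative):

.. code-block:: console

   $ sky launch -c mycluster task.yaml   # Provision if needed, run setup, then run the task.
   $ sky exec mycluster task.yaml        # Skip setup; submit the task to the existing cluster's job queue.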