diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css index 4bb1b67f9e3..d5bbdd6cb51 100644 --- a/docs/source/_static/custom.css +++ b/docs/source/_static/custom.css @@ -115,7 +115,7 @@ html[data-theme="dark"] { padding: 2px 5px; /* Reduced padding for a more compact label */ margin-left: 6px; /* Space between the text and the label */ - vertical-align: middle; + vertical-align: text-bottom; line-height: 1; /* Adjust line height to ensure vertical alignment */ } diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index e0de1b50d51..31b3692d8b8 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -27,8 +27,9 @@ document.addEventListener('DOMContentLoaded', () => { const newItems = [ { selector: '.caption-text', text: 'SkyServe: Model Serving' }, { selector: '.toctree-l1 > a', text: 'Managed Jobs' }, - { selector: '.toctree-l1 > a', text: 'Running on Kubernetes' }, { selector: '.toctree-l1 > a', text: 'Llama-3.1 (Meta)' }, + { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, + { selector: '.toctree-l1 > a', text: 'Reserved, Capacity Blocks, DWS' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst new file mode 100644 index 00000000000..a6b76e8a53c --- /dev/null +++ b/docs/source/developers/index.rst @@ -0,0 +1,8 @@ +Developer Guides +================= + +.. toctree:: + :maxdepth: 1 + + ../developers/CONTRIBUTING + Guide: Adding a New Cloud diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index f8d329a1431..dbb9a32780d 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -129,8 +129,8 @@ Read the research: ../getting-started/installation ../getting-started/quickstart - ../getting-started/tutorial ../examples/interactive-development + ../getting-started/tutorial .. 
toctree:: @@ -141,8 +141,16 @@ Read the research: ../examples/managed-jobs ../reference/job-queue ../examples/auto-failover - ../reference/kubernetes/index ../running-jobs/distributed-jobs + ../running-jobs/many-jobs + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Reserved & Existing Clusters + + ../reservations/reservations + ../reference/kubernetes/index .. toctree:: :hidden: @@ -184,14 +192,6 @@ Read the research: SkyPilot vs. Other Systems <../reference/comparison> -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Developer Guides - - ../developers/CONTRIBUTING - Guide: Adding a New Cloud - .. toctree:: :hidden: :maxdepth: 1 @@ -210,4 +210,5 @@ Read the research: ../reference/cli ../reference/api ../reference/config + ../developers/index diff --git a/docs/source/examples/docker-containers.rst b/docs/source/examples/docker-containers.rst index 582db94ee79..408a53a6185 100644 --- a/docs/source/examples/docker-containers.rst +++ b/docs/source/examples/docker-containers.rst @@ -161,6 +161,15 @@ Any GPUs assigned to the task will be automatically mapped to your Docker contai 2. The container image must grant sudo permissions without requiring password authentication for the user. Having a root user is also acceptable. +.. note:: + + Using a container with a customized entrypoint as a runtime environment is + supported, with the container's entrypoint being overridden by :code:`/bin/bash`. + Specific commands can be executed in the :code:`setup` and :code:`run` sections + of the task YAML file. However, this approach is not compatible with RunPod due + to limitations in the RunPod API, so ensure that you choose a container with a + default entrypoint (i.e. :code:`/bin/bash`). 
+ Private Registries ^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index bfc6fd17e05..cdef2335dd7 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -219,7 +219,7 @@ Congratulations! In this quickstart, you have launched a cluster, run a task, a Next steps: -- Adapt :ref:`Tutorial: DNN Training ` to start running your own project on SkyPilot! +- Adapt :ref:`Tutorial: AI Training ` to start running your own project on SkyPilot! - See the :ref:`Task YAML reference `, :ref:`CLI reference `, and `more examples `_ - To learn more, try out `SkyPilot Tutorials `_ in Jupyter notebooks diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index 92ef6f68b2f..175f1391a6d 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -1,6 +1,6 @@ -.. _dnn-training: +.. _ai-training: -Tutorial: DNN Training +Tutorial: AI Training ====================== This example uses SkyPilot to train a Transformer-based language model from HuggingFace. diff --git a/docs/source/reference/faq.rst b/docs/source/reference/faq.rst index 5569c6ec145..5a966a0014f 100644 --- a/docs/source/reference/faq.rst +++ b/docs/source/reference/faq.rst @@ -213,20 +213,3 @@ To launch a VS Code tunnel using a SkyPilot task definition, you can use the fol Note that you'll be prompted to authenticate with your GitHub account to launch a VS Code tunnel. -PyTorch 2.2.0 failed on SkyPilot clusters. What should I do? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The latest PyTorch release (2.2.0) has a version conflict with the default cuDNN version on SkyPilot clusters, which may raise a segmentation fault when you run the job. - -To fix this, you can choose one of the following solutions: - -1. Use older version of PyTorch (like 2.1.0) instead of 2.2.0, i.e. 
:code:`pip install "torch<2.2"`; -2. Remove the cuDNN from the cluster's :code:`LD_LIBRARY_PATH` by adding the following line to your task: - -.. code-block:: yaml - - run: | - export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's|:/usr/local/cuda/lib64||g; s|/usr/local/cuda/lib64:||g; s|/usr/local/cuda/lib64||g') - # Other commands using PyTorch 2.2.0 - ... - diff --git a/docs/source/reference/job-queue.rst b/docs/source/reference/job-queue.rst index 6397c7bbbb6..c0016c4d6da 100644 --- a/docs/source/reference/job-queue.rst +++ b/docs/source/reference/job-queue.rst @@ -160,7 +160,7 @@ SkyPilot's scheduler serves two goals: 2. **Minimizing resource idleness**: If a resource is idle, SkyPilot will schedule a queued job that can utilize that resource. -We illustrate the scheduling behavior by revisiting :ref:`Tutorial: DNN Training `. +We illustrate the scheduling behavior by revisiting :ref:`Tutorial: AI Training `. In that tutorial, we have a task YAML that specifies these resource requirements: .. code-block:: yaml diff --git a/docs/source/reference/kubernetes/index.rst b/docs/source/reference/kubernetes/index.rst index bde97615e80..86e153bd8fc 100644 --- a/docs/source/reference/kubernetes/index.rst +++ b/docs/source/reference/kubernetes/index.rst @@ -1,7 +1,7 @@ .. _kubernetes-overview: -Running on Kubernetes -============================= +Using Kubernetes +================ SkyPilot tasks can be run on your private on-prem or cloud Kubernetes clusters. The Kubernetes cluster gets added to the list of "clouds" in SkyPilot and SkyPilot @@ -116,4 +116,4 @@ Kubernetes support is under active development. 
Some features are in progress an * Multi-node tasks - ✅ Available * Custom images - ✅ Available * Opening ports and exposing services - ✅ Available -* Multiple Kubernetes Clusters - 🚧 In progress \ No newline at end of file +* Multiple Kubernetes Clusters - 🚧 In progress diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 0354d3d0395..228cbd7c88f 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -113,12 +113,14 @@ Available fields: disk_size: 256 # Disk tier to use for OS (optional). - # Could be one of 'low', 'medium', 'high' or 'best' (default: 'medium'). + # Could be one of 'low', 'medium', 'high', 'ultra' or 'best' (default: 'medium'). # if 'best' is specified, use the best disk tier enabled. # Rough performance estimate: - # low: 500 IOPS; read 20MB/s; write 40 MB/s - # medium: 3000 IOPS; read 220 MB/s; write 200 MB/s - # high: 6000 IOPS; 340 MB/s; write 250 MB/s + # low: 1000 IOPS; read 90 MB/s; write 90 MB/s + # medium: 3000 IOPS; read 220 MB/s; write 220 MB/s + # high: 6000 IOPS; read 400 MB/s; write 400 MB/s + # ultra: 60000 IOPS; read 4000 MB/s; write 3000 MB/s + # Measured by examples/perf/storage_rawperf.yaml disk_tier: medium # Ports to expose (optional). @@ -335,8 +337,8 @@ Available fields: .. _task-yaml-experimental: -Experimental ------------- +Experimental Configurations +--------------------------- .. note:: diff --git a/docs/source/reservations/reservations.rst b/docs/source/reservations/reservations.rst new file mode 100644 index 00000000000..8d0625846f7 --- /dev/null +++ b/docs/source/reservations/reservations.rst @@ -0,0 +1,208 @@ + +.. _reservation: + +Reserved, Capacity Blocks, DWS +=================================== + + +With the recent GPU shortage, reservations from cloud providers have become a common way to ensure GPU availability for a specific duration. 
These reservations can be short-term (e.g., 1-30 days) capacity guarantees, or long-term (e.g., 1-3 years) contracts. + +This guide shows how to use SkyPilot to request resources from reservations and even combine them with on-demand/spot resources to fully +utilize the capacity in your cloud accounts. + +.. image:: https://i.imgur.com/FA0BT0E.png + :width: 95% + :align: center + + +AWS Capacity Reservations & Capacity Blocks +-------------------------------------------- + +AWS **capacity reservations** and **capacity blocks** are ways to reserve a certain amount of compute capacity for a period of time. The latter is for high-end GPUs, such as A100s (P4d instances) and H100s (P5d instances), while the former is for all other instance types. +Instead of committing to a 1-3 year long contract, you can get a capacity reservation or capacity block for as short as 1 second or 1 day, respectively. + + +To request capacity reservations/blocks, see the official docs: + +* `AWS Capacity Reservations `_ +* `AWS Capacity Blocks `_ + +Once you have successfully created a reservation/block, you will get an ID of the reservation/block, such as ``cr-012345678``. + +To use the reservation/block, you can specify two fields in ``~/.sky/config.yaml``: + +* ``aws.prioritize_reservations``: whether to prioritize launching clusters from capacity reservations in any region/zone over on-demand/spot clusters. This is useful to fully utilize your reserved capacity created with ``Instance eligibility: open``. +* ``aws.specific_reservations``: a list of reservation IDs that can be used by SkyPilot. This is useful if you have multiple capacity reservations or blocks with ``Instance eligibility: targeted`` for different instance types in multiple regions/zones. + + +Example: + +.. 
code-block:: yaml + + aws: + prioritize_reservations: true + specific_reservations: + # 1x H100 capacity block in us-east-1 + - "cr-0123456789" + # 2x A100 reservation in us-east-2 + - "cr-123456789a" + # 2x A100 reservation in us-west-2 + - "cr-23456789ab" + # 2x M5a.16xlarge reservation in us-east-1 + - "cr-3456789abc" + +For more details of the fields, see :ref:`config-yaml`. + +.. note:: + + If any of the fields are specified, SkyPilot optimizer may take around 30 seconds to retrieve the latest reservation/block status on all regions and zones from your AWS account. + + +.. _utilizing-reservations: + +Utilizing Reservations +~~~~~~~~~~~~~~~~~~~~~~ + +By specifying configurations above, SkyPilot will prioritize using any available capacity in reservation/block (i.e., consider them as zero cost) whenever you launch a cluster/job. + +Specifically, SkyPilot's behavior is as follows: + +1. Query reservations/blocks across AWS regions and zones to find all available capacity. (If the task specifies specific regions or zones to use, only those are queried.) +2. For each zone, calculate its cost: any available reserved capacity is considered as zero cost, and if any on-demand/spot resource is needed to supplement the available reserved capacity to fully satisfy the request, their on-demand/spot price is included. +3. :ref:`Automatically failover ` through these zones in increasing per-zone cost order until the requested resources are provisioned. + + +For example, if you are launching a cluster with the following SkyPilot YAML: + +.. code-block:: yaml + + resources: + cloud: aws + accelerators: A100:8 + + num_nodes: 2 + + +SkyPilot will utilize the capacity reservation/block as follows: + +1. Query reservations/blocks in ``us-east-2`` and ``us-west-2`` in reservation ``cr-123456789a`` and ``cr-23456789ab``, respectively. Assume the results are: + + - 1 A100 instance capacity is available in ``us-east-2``, + - No available capacity in ``us-west-2``. +2. 
SkyPilot calculates the pricing for all zones as described above. The result is ``us-east-2`` zones are cheaper than all other zones, because the former's costs are 1 on-demand node's cost for 2 nodes (by satisfying 1 node using the reserved capacity). +3. SkyPilot will thus try to provision an on-demand A100 instance in ``us-east-2``. On unavailability, SkyPilot will continue to :ref:`automatically failover ` to other clouds/regions/zones for normal on-demand/spot instances. + + +.. hint:: + + If you have a capacity block with a starting time in the future, you can run ``sky jobs launch --region us-east-1 --gpus H100:8 task.yaml`` to let SkyPilot automatically wait until the starting time is reached. Namely, you don't have to wake up at 4:30am PDT to launch your job on a newly available capacity block. + + +GCP reservations +----------------- + +GCP reservations are similar to AWS capacity reservations, where you can reserve a certain amount of compute capacity for any period of time. + +To get a reservation, see the `GCP official docs `__. + +Like AWS, you can specify two fields in ``~/.sky/config.yaml``: + +* ``gcp.prioritize_reservations``: whether to prioritize launching clusters from reservations in any region/zone over on-demand/spot clusters. This is useful to fully utilize your `automatically consumed reservations `__. +* ``gcp.specific_reservations``: a list of reservation IDs that can be used by SkyPilot. This is useful if you have multiple `specific reservations `__ for different instance types in multiple regions/zones. + +Example: + +.. code-block:: yaml + + gcp: + prioritize_reservations: true + specific_reservations: + - projects/my-project/reservations/my-reservation1 + - projects/my-project/reservations/my-reservation2 + + +SkyPilot will utilize the reservations similar to AWS reservations as described in :ref:`utilizing-reservations`. 
+ + +GCP Dynamic Workload Scheduler (DWS) +------------------------------------- + +GCP `Dynamic Workload Scheduler (DWS) `__ is a resource management service that (1) receives a GPU capacity request, (2) automatically provisions the requested resources when they become available, and (3) keeps the resources running for a specified duration. + +.. tip:: + + It has been observed that using DWS can significantly increase the chance of getting a high-end GPU resource, such as A100s and H100s, compared to using on-demand or spot instances. + + +Using DWS for VMs +~~~~~~~~~~~~~~~~~ + +SkyPilot allows you to launch resources via DWS by specifying the ``gcp.managed_instance_group`` field in ``~/.sky/config.yaml``: + +.. code-block:: yaml + + gcp: + managed_instance_group: + run_duration: 3600 + provision_timeout: 900 + + +1. ``run_duration``: duration for a created instance to be kept alive (in seconds, required). +2. ``provision_timeout``: timeout for provisioning an instance with DWS (in seconds, optional). If the timeout is reached without requested resources being provisioned, SkyPilot will automatically :ref:`failover ` to other clouds/regions/zones to get the resources. + +See :ref:`config-yaml` for more details. + +In case you want to specify the DWS configuration for each job/cluster, you can also specify the configuration in the SkyPilot task YAML (see :ref:`here `): + +.. code-block:: yaml + + experimental: + config_overrides: + gcp: + managed_instance_group: + run_duration: 3600 + provision_timeout: 900 + + resources: + cloud: gcp + accelerators: A100:8 + + num_nodes: 4 + +Using DWS on GKE with Kueue +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DWS is also supported on Google Kubernetes Engine (GKE) with Kueue. To enable DWS on GKE, you need to set up your GKE cluster with Kueue and DWS; see the `GCP official docs `__. + +To launch a SkyPilot cluster or job on GKE with DWS, you can specify the DWS configuration in the SkyPilot task YAML: + +.. 
code-block:: yaml + + experimental: + config_overrides: + kubernetes: + pod_config: + metadata: + annotations: + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "3600" + provision_timeout: 900 + + resources: + cloud: kubernetes + accelerators: A100:8 + labels: + kueue.x-k8s.io/queue-name: dws-local-queue + +1. ``kueue.x-k8s.io/queue-name``: name of the Kueue queue to submit your resource request to. +2. ``provreq.kueue.x-k8s.io/maxRunDurationSeconds``: maximum duration for a created instance to be kept alive (in seconds, required). +3. ``provision_timeout``: timeout for provisioning an instance with DWS (in seconds, optional). If the timeout is reached without getting the requested resources, SkyPilot will automatically :ref:`failover ` to other clouds/regions/zones to get the resources. + +Long-term reservations +---------------------- + +Unlike short-term reservations above, long-term reservations are typically more than one month long and can be viewed as a type of *on-prem cluster*. + +SkyPilot supports long-term reservations and on-premise clusters through Kubernetes, i.e., you can set up a Kubernetes cluster on top of your reserved resources and interact with them through SkyPilot. + +See the simple steps to set up a Kubernetes cluster on existing machines in :ref:`kubernetes-overview`. + diff --git a/docs/source/running-jobs/distributed-jobs.rst b/docs/source/running-jobs/distributed-jobs.rst index 9eb590c10bc..da3ddd8e94f 100644 --- a/docs/source/running-jobs/distributed-jobs.rst +++ b/docs/source/running-jobs/distributed-jobs.rst @@ -1,6 +1,6 @@ .. _dist-jobs: -Distributed Jobs on Many Nodes +Distributed Multi-Node Jobs ================================================ SkyPilot supports multi-node cluster diff --git a/docs/source/running-jobs/many-jobs.rst b/docs/source/running-jobs/many-jobs.rst new file mode 100644 index 00000000000..3d26d74e794 --- /dev/null +++ b/docs/source/running-jobs/many-jobs.rst @@ -0,0 +1,346 @@ + +..
_many-jobs: + +Many Parallel Jobs +====================== + +SkyPilot allows you to easily **run many jobs in parallel** and manage them in a single system. This is useful for hyperparameter tuning sweeps, data processing, and other batch jobs. + +This guide shows a typical workflow for running many jobs with SkyPilot. + + +.. image:: https://i.imgur.com/tvxeNyR.png + :width: 90% + :align: center +.. TODO: Show the components in a GIF. + + +Why Use SkyPilot to Run Many Jobs +------------------------------------- + +- **Unified**: Use any or multiple of your own infrastructure (Kubernetes, cloud VMs, reservations, etc.). +- **Elastic**: Scale up and down based on demands. +- **Cost-effective**: Only pay for the cheapest resources. +- **Robust**: Automatically recover jobs from failures. +- **Observable**: Monitor and manage all jobs in a single pane of glass. + +Write a YAML for One Job +----------------------------------- + +Before scaling up to many jobs, write a SkyPilot YAML for a single job first and ensure it runs correctly. This can save time by avoiding debugging many jobs at once. + +Here is the same example YAML as in :ref:`Tutorial: AI Training `: + +.. raw:: html + +
+ Click to expand: train.yaml + +.. code-block:: yaml + + # train.yaml + name: huggingface + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --max_steps 50 \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 + + +.. raw:: html + +
+ + +First, launch the job to check it successfully launches and runs correctly: + +.. code-block:: bash + + sky launch -c train train.yaml + + +If there is any error, you can fix the code and/or the YAML, and launch the job again on the same cluster: + +.. code-block:: bash + + # Cancel the latest job. + sky cancel train -y + # Run the job again on the same cluster. + sky launch -c train train.yaml + + +Sometimes, it may be more efficient to log into the cluster and interactively debug the job. You can do so by directly :ref:`ssh'ing into the cluster or using VSCode's remote ssh `. + +.. code-block:: bash + + # Log into the cluster. + ssh train + + + +Next, after confirming the job is working correctly, **add (hyper)parameters** to the job YAML so that all job variants can be specified. + +1. Add Hyperparameters +~~~~~~~~~~~~~~~~~~~~~~ + +To launch jobs with different hyperparameters, add them as :ref:`environment variables ` to the SkyPilot YAML, and make your main program read these environment variables: + +.. raw:: html + +
+ Updated SkyPilot YAML: train-template.yaml + +.. code-block:: yaml + :emphasize-lines: 4-6,28-29 + + # train-template.yaml + name: huggingface + + envs: + LR: 2e-5 + MAX_STEPS: 50 + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate ${LR} \ + --max_steps ${MAX_STEPS} \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 + +.. raw:: html + +
 + +You can now use ``--env`` to launch a job with different hyperparameters: + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env LR=1e-5 \ + --env MAX_STEPS=100 + +Alternatively, store the environment variable values in a dotenv file and use ``--env-file`` to launch: + +.. code-block:: bash + + # configs/job1 + LR=1e-5 + MAX_STEPS=100 + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env-file configs/job1 + + + +2. Logging Job Outputs +~~~~~~~~~~~~~~~~~~~~~~~ + +When running many jobs, it is useful to log the outputs of all jobs. You can use tools like `W&B `__ for this purpose: + +.. raw:: html + +
+ SkyPilot YAML with W&B: train-template.yaml + +.. code-block:: yaml + :emphasize-lines: 7-7,19-19,34-34 + + # train-template.yaml + name: huggingface + + envs: + LR: 2e-5 + MAX_STEPS: 50 + WANDB_API_KEY: # Empty field means this field is required when launching the job. + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + pip install wandb + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate ${LR} \ + --max_steps ${MAX_STEPS} \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 \ + --report_to wandb + +.. raw:: html + +
 + +You can now launch the job with the following command (``WANDB_API_KEY`` should exist in your local environment variables). + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env-file configs/job1 \ + --env WANDB_API_KEY + + + +Scale Out to Many Jobs +----------------------- + +With the above setup, you can now scale out to run many jobs in parallel. You +can either use SkyPilot CLI with many config files or use SkyPilot Python API. + +With CLI and Config Files +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can run many jobs in parallel by (1) creating multiple config files and (2) +submitting them as :ref:`SkyPilot managed jobs `. + +First, create a config file for each job (for example, in a ``configs`` directory): + +.. code-block:: bash + + # configs/job-1 + LR=1e-5 + MAX_STEPS=100 + + # configs/job-2 + LR=2e-5 + MAX_STEPS=200 + + ... + +.. raw:: html + +
+ An example Python script to generate config files + +.. code-block:: python + + import os + + CONFIG_PATH = 'configs' + LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0] + MAX_STEPS_CANDIDATES = [100, 300, 1000] + + os.makedirs(CONFIG_PATH, exist_ok=True) + + job_idx = 1 + for lr in LR_CANDIDATES: + for max_steps in MAX_STEPS_CANDIDATES: + config_file = f'{CONFIG_PATH}/job-{job_idx}' + with open(config_file, 'w') as f: + print(f'LR={lr}', file=f) + print(f'MAX_STEPS={max_steps}', file=f) + job_idx += 1 + +.. raw:: html + +
+ +Then, submit all jobs by iterating over the config files and calling ``sky jobs launch`` on each: + +.. code-block:: bash + + for config_file in configs/*; do + job_name=$(basename $config_file) + # -y: yes to all prompts. + # -d: detach from the job's logging, so the next job can be submitted + # without waiting for the previous job to finish. + sky jobs launch -n train-$job_name -y -d train-template.yaml \ + --env-file $config_file \ + --env WANDB_API_KEY + done + + +Job statuses can be checked via ``sky jobs queue``: + +.. code-block:: console + + $ sky jobs queue + + Fetching managed job statuses... + Managed jobs + In progress tasks: 10 RUNNING + ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS + 10 - train-job10 1x[V100:4] 5 mins ago 5m 5s 1m 12s 0 RUNNING + 9 - train-job9 1x[V100:4] 6 mins ago 6m 11s 2m 23s 0 RUNNING + 8 - train-job8 1x[V100:4] 7 mins ago 7m 15s 3m 31s 0 RUNNING + ... + + +With Python API +~~~~~~~~~~~~~~~ + +To have more customized control over generation of job variants, you can also use SkyPilot Python API to launch the jobs. + +.. 
code-block:: python + + import os + import sky + + LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0] + MAX_STEPS_CANDIDATES = [100, 300, 1000] + task = sky.Task.from_yaml('train-template.yaml') + + job_idx = 1 + for lr in LR_CANDIDATES: + for max_steps in MAX_STEPS_CANDIDATES: + task.update_envs({'LR': lr, 'MAX_STEPS': max_steps}) + sky.jobs.launch( + task, + name=f'train-job{job_idx}', + detach_run=True, + retry_until_up=True, + ) + job_idx += 1 diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 9986f93275a..e316a1380bb 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -882,6 +882,11 @@ def write_cluster_config( f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\'' ) + # We disable conda auto-activation if the user has specified a docker image + # to use, which is likely to already have a conda environment activated. + conda_auto_activate = ('true' if to_provision.extract_docker_image() is None + else 'false') + # Use a tmp file path to avoid incomplete YAML file being re-used in the # future. tmp_yaml_path = yaml_path + '.tmp' @@ -916,10 +921,11 @@ def write_cluster_config( 'specific_reservations': specific_reservations, # Conda setup - 'conda_installation_commands': - constants.CONDA_INSTALLATION_COMMANDS, # We should not use `.format`, as it contains '{}' as the bash # syntax. 
+ 'conda_installation_commands': + constants.CONDA_INSTALLATION_COMMANDS.replace( + '{conda_auto_activate}', conda_auto_activate), 'ray_skypilot_installation_commands': (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace( '{sky_wheel_hash}', diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 9545436f05c..ca18f44f6da 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -82,7 +82,7 @@ clouds.AWS: 90, clouds.Azure: 90, clouds.GCP: 240, - clouds.Lambda: 150, + clouds.Lambda: 300, clouds.IBM: 160, clouds.OCI: 300, clouds.Paperspace: 600, @@ -1933,7 +1933,7 @@ def provision_with_retries( while True: if (isinstance(to_provision.cloud, clouds.Azure) and to_provision.accelerators is not None and - 'A10' in to_provision.accelerators): + 'A10' in to_provision.accelerators and prev_handle is None): logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch ' 'an A10 cluster on Azure. This may take ~20 ' 'minutes due to driver installation.' 
@@ -3112,7 +3112,8 @@ def _setup_node(node_id: int) -> None: setup_script = log_lib.make_task_bash_script(setup, env_vars=setup_envs) encoded_script = shlex.quote(setup_script) - if detach_setup or _is_command_length_over_limit(encoded_script): + + def _dump_setup_script(setup_script: str) -> None: with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f: f.write(setup_script) f.flush() @@ -3121,6 +3122,9 @@ def _setup_node(node_id: int) -> None: target=remote_setup_file_name, up=True, stream_logs=False) + + if detach_setup or _is_command_length_over_limit(encoded_script): + _dump_setup_script(setup_script) create_script_code = 'true' else: create_script_code = (f'{{ echo {encoded_script} > ' @@ -3128,20 +3132,42 @@ def _setup_node(node_id: int) -> None: if detach_setup: return + setup_log_path = os.path.join(self.log_dir, f'setup-{runner.node_id}.log') - returncode = runner.run( - f'{create_script_code} && {setup_cmd}', - log_path=setup_log_path, - process_stream=False, - # We do not source bashrc for setup, since bashrc is sourced - # in the script already. - # Skip an empty line and two lines due to the /bin/bash -i and - # source ~/.bashrc in the setup_cmd. - # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long - # bash: no job control in this shell - skip_lines=3, - ) + + def _run_setup(setup_cmd: str) -> int: + returncode = runner.run( + setup_cmd, + log_path=setup_log_path, + process_stream=False, + # We do not source bashrc for setup, since bashrc is sourced + # in the script already. + # Skip an empty line and two lines due to the /bin/bash -i + # and source ~/.bashrc in the setup_cmd. 
+ # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long + # bash: no job control in this shell + skip_lines=3) + return returncode + + returncode = _run_setup(f'{create_script_code} && {setup_cmd}',) + if returncode == 255: + is_message_too_long = False + with open(setup_log_path, 'r', encoding='utf-8') as f: + if 'too long' in f.read(): + is_message_too_long = True + + if is_message_too_long: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. We use a + # general length limit check before but it could be + # inaccurate on some systems. + logger.debug( + 'Failed to run setup command inline due to ' + 'command length limit. Dumping setup script to ' + 'file and running it with SSH.') + _dump_setup_script(setup_script) + returncode = _run_setup(setup_cmd) def error_message() -> str: # Use the function to avoid tailing the file in success case @@ -3223,7 +3249,8 @@ def _exec_code_on_head( code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd) job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code]) - if _is_command_length_over_limit(job_submit_cmd): + + def _dump_code_to_file(codegen: str) -> None: runners = handle.get_command_runners() head_runner = runners[0] with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp: @@ -3238,6 +3265,9 @@ def _exec_code_on_head( target=script_path, up=True, stream_logs=False) + + if _is_command_length_over_limit(job_submit_cmd): + _dump_code_to_file(codegen) job_submit_cmd = f'{mkdir_code} && {code}' if managed_job_dag is not None: @@ -3263,6 +3293,16 @@ def _exec_code_on_head( job_submit_cmd, stream_logs=False, require_outputs=True) + if returncode == 255 and 'too long' in stdout + stderr: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. 
We use a general + # length limit check before but it could be inaccurate on some + # systems. + _dump_code_to_file(codegen) + returncode, stdout, stderr = self.run_on_head(handle, + job_submit_cmd, + stream_logs=False, + require_outputs=True) # Happens when someone calls `sky exec` but remote is outdated # necessitating calling `sky launch`. diff --git a/sky/cli.py b/sky/cli.py index e50aca011a6..eb0267f7ced 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -29,6 +29,7 @@ import multiprocessing import os import shlex +import shutil import signal import subprocess import sys @@ -368,7 +369,9 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter, echo "{bashrc_diff}" >> ~/.bashrc' cmd = (f'(grep -q "SkyPilot" ~/.bashrc) || ' - f'[[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd})') + f'([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || ' + f'(echo "Bash must be version 4 or above." && exit 1))') + reload_cmd = _RELOAD_BASH_CMD elif value == 'fish': @@ -390,7 +393,10 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter, ctx.exit() try: - subprocess.run(cmd, shell=True, check=True, executable='/bin/bash') + subprocess.run(cmd, + shell=True, + check=True, + executable=shutil.which('bash')) click.secho(f'Shell completion installed for {value}', fg='green') click.echo( 'Completion will take effect once you restart the terminal: ' + diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 3a05223574d..693fc142eee 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -798,7 +798,11 @@ def instance_type_exists(self, instance_type): @classmethod def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str: - return 'standard' if disk_tier == resources_utils.DiskTier.LOW else 'gp3' + if disk_tier == resources_utils.DiskTier.LOW: + return 'standard' + if disk_tier == resources_utils.DiskTier.ULTRA: + return 'io2' + return 'gp3' @classmethod def _get_disk_specs( @@ -806,15 +810,19 @@ def _get_disk_specs( disk_tier: 
Optional[resources_utils.DiskTier]) -> Dict[str, Any]: tier = cls._translate_disk_tier(disk_tier) tier2iops = { + resources_utils.DiskTier.ULTRA: 20000, resources_utils.DiskTier.HIGH: 7000, resources_utils.DiskTier.MEDIUM: 3500, - resources_utils.DiskTier.LOW: 0, # only gp3 is required to set iops + resources_utils.DiskTier.LOW: 0, # iops is not required on standard disk } return { 'disk_tier': cls._get_disk_type(tier), - 'disk_iops': tier2iops[tier], - 'disk_throughput': tier2iops[tier] // 16, - 'custom_disk_perf': tier != resources_utils.DiskTier.LOW, + 'disk_iops': tier2iops[tier] + if cls._get_disk_type(tier) != 'standard' else None, + # Custom disk throughput is only available for gp3 + # see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-launchtemplate-ebs.html + 'disk_throughput': tier2iops[tier] // 16 + if cls._get_disk_type(tier) == 'gp3' else None, } @classmethod diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 928ceb5cc52..1768cd6091e 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -60,9 +60,10 @@ class Azure(clouds.Cloud): _MAX_CLUSTER_NAME_LEN_LIMIT = 42 _BEST_DISK_TIER = resources_utils.DiskTier.MEDIUM _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM - # Azure does not support high disk tier. - _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) - - {resources_utils.DiskTier.HIGH}) + # Azure does not support high disk and ultra disk tier. + _SUPPORTED_DISK_TIERS = ( + set(resources_utils.DiskTier) - + {resources_utils.DiskTier.HIGH, resources_utils.DiskTier.ULTRA}) _INDENT_PREFIX = ' ' * 4 @@ -599,9 +600,10 @@ def check_disk_tier( disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]: if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST: return True, '' - if disk_tier == resources_utils.DiskTier.HIGH: - return False, ('Azure disk_tier=high is not supported now. 
' - 'Please use disk_tier={low, medium} instead.') + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + return False, ( + 'Azure disk_tier={high, ultra} is not supported now. ' + 'Please use disk_tier={low, medium, best} instead.') # Only S-series supported premium ssd # see https://stackoverflow.com/questions/48590520/azure-requested-operation-cannot-be-performed-because-storage-account-type-pre # pylint: disable=line-too-long if cls._get_disk_type( @@ -628,6 +630,7 @@ def _get_disk_type(cls, # TODO(tian): Maybe use PremiumV2_LRS/UltraSSD_LRS? Notice these two # cannot be used as OS disks so we might need data disk support tier2name = { + resources_utils.DiskTier.ULTRA: 'Disabled', resources_utils.DiskTier.HIGH: 'Disabled', resources_utils.DiskTier.MEDIUM: 'Premium_LRS', resources_utils.DiskTier.LOW: 'Standard_LRS', diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 9775109ac80..7d3eb157c61 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -117,7 +117,7 @@ class Cloud: _REPR = '' _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM - _BEST_DISK_TIER = resources_utils.DiskTier.HIGH + _BEST_DISK_TIER = resources_utils.DiskTier.ULTRA _SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST} _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 643d55d7037..79a1453c581 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -7,7 +7,7 @@ import subprocess import time import typing -from typing import Dict, Iterator, List, Optional, Set, Tuple +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple import colorama @@ -437,6 +437,7 @@ def make_deploy_resources_variables( 'custom_resources': None, 'use_spot': r.use_spot, 'gcp_project_id': self.get_project_id(dryrun), + **GCP._get_disk_specs(r.disk_tier), } accelerators = r.accelerators if accelerators is not None: @@ -495,8 +496,6 @@ def make_deploy_resources_variables( 
resources_vars['machine_image'] = image_id resources_vars['image_id'] = None - resources_vars['disk_tier'] = GCP._get_disk_type(r.disk_tier) - firewall_rule = None if resources.ports is not None: firewall_rule = (USER_PORTS_FIREWALL_RULE_NAME.format( @@ -917,12 +916,24 @@ def _get_disk_type(cls, disk_tier: Optional[resources_utils.DiskTier]) -> str: tier = cls._translate_disk_tier(disk_tier) tier2name = { + resources_utils.DiskTier.ULTRA: 'pd-extreme', resources_utils.DiskTier.HIGH: 'pd-ssd', resources_utils.DiskTier.MEDIUM: 'pd-balanced', resources_utils.DiskTier.LOW: 'pd-standard', } return tier2name[tier] + @classmethod + def _get_disk_specs( + cls, + disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]: + specs: Dict[str, Any] = {'disk_tier': cls._get_disk_type(disk_tier)} + if disk_tier == resources_utils.DiskTier.ULTRA: + # Only pd-extreme supports custom iops. + # see https://cloud.google.com/compute/docs/disks#disk-types + specs['disk_iops'] = 20000 + return specs + @classmethod def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str: return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items()) diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 7875e26d9cc..57f3a9ffe16 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -42,7 +42,9 @@ class OCI(clouds.Cloud): _INDENT_PREFIX = ' ' - _SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier) + _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) - + {resources_utils.DiskTier.ULTRA}) + _BEST_DISK_TIER = resources_utils.DiskTier.HIGH @classmethod def _unsupported_features_for_resources( @@ -414,6 +416,19 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: f'{cls._INDENT_PREFIX}Error details: ' f'{common_utils.format_exception(e, use_bracket=True)}') + @classmethod + def check_disk_tier( + cls, instance_type: Optional[str], + disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]: + del instance_type # Unused. 
+ if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST: + return True, '' + if disk_tier == resources_utils.DiskTier.ULTRA: + return False, ('OCI disk_tier=ultra is not supported now. ' + 'Please use disk_tier={low, medium, high, best} ' + 'instead.') + return True, '' + def get_credential_file_mounts(self) -> Dict[str, str]: """Returns a dict of credential file paths to mount paths.""" oci_cfg_file = oci_adaptor.get_config_file() diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 141b356712e..2d323cbac5f 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -110,7 +110,8 @@ def get_default_instance_type( _DEFAULT_INSTANCE_FAMILY)] def _filter_disk_type(instance_type: str) -> bool: - return Azure.check_disk_tier(instance_type, disk_tier)[0] + valid, _ = Azure.check_disk_tier(instance_type, disk_tier) + return valid df = df.loc[df['InstanceType'].apply(_filter_disk_type)] return common.get_instance_type_for_cpus_mem_impl(df, cpus, diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index fbbe0fdcef1..1b5fec9e8e8 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -58,7 +58,9 @@ class InstanceTypeInfo(NamedTuple): def get_catalog_path(filename: str) -> str: - return os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, filename) + catalog_path = os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, filename) + os.makedirs(os.path.dirname(catalog_path), exist_ok=True) + return catalog_path def is_catalog_modified(filename: str) -> bool: @@ -225,7 +227,7 @@ def _update_catalog(): with open(meta_path + '.md5', 'w', encoding='utf-8') as f: f.write(hashlib.md5(r.text.encode()).hexdigest()) - logger.info(f'Updated {cloud} catalog.') + logger.debug(f'Updated {cloud} catalog {filename}.') return LazyDataFrame(catalog_path, update_func=_update_catalog) diff --git 
a/sky/clouds/service_catalog/oci_catalog.py b/sky/clouds/service_catalog/oci_catalog.py index 2561b913dcf..a18dee79be5 100644 --- a/sky/clouds/service_catalog/oci_catalog.py +++ b/sky/clouds/service_catalog/oci_catalog.py @@ -15,6 +15,7 @@ from typing import Dict, List, Optional, Tuple from sky.adaptors import oci as oci_adaptor +from sky.clouds import OCI from sky.clouds.service_catalog import common from sky.clouds.utils import oci_utils from sky.utils import resources_utils @@ -102,7 +103,6 @@ def get_default_instance_type( cpus: Optional[str] = None, memory: Optional[str] = None, disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]: - del disk_tier # unused if cpus is None: cpus = f'{oci_utils.oci_config.DEFAULT_NUM_VCPUS}+' @@ -111,12 +111,17 @@ def get_default_instance_type( else: memory_gb_or_ratio = memory + def _filter_disk_type(instance_type: str) -> bool: + valid, _ = OCI.check_disk_tier(instance_type, disk_tier) + return valid + instance_type_prefix = tuple( f'{family}' for family in oci_utils.oci_config.DEFAULT_INSTANCE_FAMILY) df = _get_df() df = df[df['InstanceType'].notna()] df = df[df['InstanceType'].str.startswith(instance_type_prefix)] + df = df.loc[df['InstanceType'].apply(_filter_disk_type)] logger.debug(f'# get_default_instance_type: {df}') return common.get_instance_type_for_cpus_mem_impl(df, cpus, diff --git a/sky/exceptions.py b/sky/exceptions.py index 99784a8c96d..15f3ea3f34e 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -100,9 +100,13 @@ def __init__(self, returncode: int, command: str, error_msg: str, self.command = command self.error_msg = error_msg self.detailed_reason = detailed_reason + if not command: message = error_msg else: + if len(command) > 100: + # Chunk the command to avoid overflow. + command = command[:100] + '...'
message = (f'Command {command} failed with return code ' f'{returncode}.\n{error_msg}') super().__init__(message) diff --git a/sky/optimizer.py b/sky/optimizer.py index 10aa697258b..4326329579d 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -19,6 +19,7 @@ from sky.adaptors import common as adaptors_common from sky.utils import env_options from sky.utils import log_utils +from sky.utils import resources_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import ux_utils @@ -935,6 +936,15 @@ def sort_key(row, accelerator_spot_list=accelerator_spot_list): table.add_rows(rows) logger.info(f'{table}\n') + # Warning message for using disk_tier=ultra + # TODO(yi): Consider price of disks in optimizer and + # move this warning there. + if chosen_resources.disk_tier == resources_utils.DiskTier.ULTRA: + logger.warning( + 'Using disk_tier=ultra will utilize more advanced disks ' + '(io2 Block Express on AWS and extreme persistent disk on ' + 'GCP), which can lead to significant higher costs (~$2/h).') + @staticmethod def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates): for node, candidate_set in node_to_candidate_map.items(): diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index e989fbc085a..7bfa1724b83 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -110,8 +110,8 @@ def docker_start_cmds( '--cap-add=SYS_ADMIN', '--device=/dev/fuse', '--security-opt=apparmor:unconfined', + '--entrypoint=/bin/bash', image, - 'bash', ] return ' '.join(docker_run) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 7ad3d72e46b..8ac3ab1d4ca 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -110,8 +110,9 @@ def get_gke_accelerator_name(accelerator: str) -> str: if accelerator == 'H100': # H100 is named as H100-80GB in GKE. 
accelerator = 'H100-80GB' - if accelerator in ('A100-80GB', 'L4', 'H100-80GB'): - # A100-80GB, L4 and H100-80GB have a different name pattern. + if accelerator in ('A100-80GB', 'L4', 'H100-80GB', 'H100-MEGA-80GB'): + # A100-80GB, L4, H100-80GB and H100-MEGA-80GB + # have a different name pattern. return 'nvidia-{}'.format(accelerator.lower()) else: return 'nvidia-tesla-{}'.format(accelerator.lower()) @@ -194,13 +195,10 @@ def get_accelerator_from_label_value(cls, value: str) -> str: return value.replace('nvidia-tesla-', '').upper() elif value.startswith('nvidia-'): acc = value.replace('nvidia-', '').upper() - if acc in ['H100-80GB', 'H100-MEGA-80GB']: - # H100 is named H100-80GB or H100-MEGA-80GB in GKE, - # where the latter has improved bandwidth. - # See a3-mega instances on GCP. - # TODO: we do not distinguish the two GPUs for simplicity, - # but we can evaluate whether we should distinguish - # them based on users' requests. + if acc == 'H100-80GB': + # H100 can be either H100-80GB or H100-MEGA-80GB in GKE + # we map H100 ---> H100-80GB and keep H100-MEGA-80GB + # to distinguish between a3-high and a3-mega instances return 'H100' return acc else: diff --git a/sky/provision/runpod/utils.py b/sky/provision/runpod/utils.py index 24af263f13c..f1587463e84 100644 --- a/sky/provision/runpod/utils.py +++ b/sky/provision/runpod/utils.py @@ -77,7 +77,11 @@ def list_instances() -> Dict[str, Dict[str, Any]]: info['name'] = instance['name'] info['port2endpoint'] = {} - if instance['desiredStatus'] == 'RUNNING' and instance.get('runtime'): + # Sometimes when the cluster is in the process of being created, + # the `port` field in the runtime is None and we need to check for it. 
+ if (instance['desiredStatus'] == 'RUNNING' and + instance.get('runtime') and + instance.get('runtime').get('ports')): for port in instance['runtime']['ports']: if port['isIpPublic']: if port['privatePort'] == 22: diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 30820a3a91e..f23dc8100b5 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -135,8 +135,9 @@ # true. '{ bash Miniconda3-Linux-x86_64.sh -b; ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' - 'conda config --set auto_activate_base true && ' - f'conda activate base; }}; }}; ' + # Caller should replace {conda_auto_activate} with either true or false. + 'conda config --set auto_activate_base {conda_auto_activate} && ' + 'conda activate base; }; }; ' 'grep "# >>> conda initialize >>>" ~/.bashrc || ' '{ conda init && source ~/.bashrc; };' # If Python version is larger then equal to 3.12, create a new conda env @@ -145,7 +146,7 @@ # costly to create a new conda env, and venv should be a lightweight and # faster alternative when the python version satisfies the requirement. '[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && ' - f'echo "Creating conda env with Python 3.10" && ' + 'echo "Creating conda env with Python 3.10" && ' f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && ' f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};' # Create a separate conda environment for SkyPilot dependencies. 
diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py index 06c5d6d48af..4f66ef54383 100644 --- a/sky/skylet/providers/command_runner.py +++ b/sky/skylet/providers/command_runner.py @@ -65,8 +65,8 @@ def docker_start_cmds( '--cap-add=SYS_ADMIN', '--device=/dev/fuse', '--security-opt=apparmor:unconfined', + '--entrypoint=/bin/bash', image, - 'bash', ] return ' '.join(docker_run) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 7e9dfccdaf1..6afdf381cc0 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -73,8 +73,10 @@ available_node_types: VolumeSize: {{disk_size}} VolumeType: {{disk_tier}} Encrypted: {{disk_encrypted}} - {% if custom_disk_perf %} + {% if disk_iops %} Iops: {{disk_iops}} + {% endif %} + {% if disk_throughput %} Throughput: {{disk_throughput}} {% endif %} {% if use_spot %} diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index bcc16bac531..5f06eef05c7 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -124,6 +124,9 @@ available_node_types: sourceImage: {{image_id}} {%- endif %} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + {%- if disk_iops %} + provisionedIops: {{disk_iops}} + {%- endif %} {%- if gpu is not none %} guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 95c784143cc..6f5c07f7d25 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -24,6 +24,7 @@ class DiskTier(enum.Enum): LOW = 'low' MEDIUM = 'medium' HIGH = 'high' + ULTRA = 'ultra' BEST = 'best' @classmethod diff --git a/tests/test_optimizer_dryruns.py b/tests/test_optimizer_dryruns.py index becf3ba461a..dfda65e23da 100644 --- a/tests/test_optimizer_dryruns.py +++ b/tests/test_optimizer_dryruns.py @@ -771,3 +771,10 @@ def _get_all_candidate_cloud(r: sky.Resources) -> 
Set[clouds.Cloud]: assert high_tier_candidates == set( map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp', 'oci'])), high_tier_candidates + + # Only AWS, GCP supports ULTRA disk tier. + ultra_tier_resources = sky.Resources( + disk_tier=resources_utils.DiskTier.ULTRA) + ultra_tier_candidates = _get_all_candidate_cloud(ultra_tier_resources) + assert ultra_tier_candidates == set( + map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp'])), ultra_tier_candidates diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f338de2dda7..63ccd19857d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -34,6 +34,7 @@ import subprocess import sys import tempfile +import textwrap import time from typing import Dict, List, NamedTuple, Optional, Tuple import urllib.parse @@ -3304,11 +3305,11 @@ def _get_aws_query_command(region, instance_id, field, expected): f'Reservations[].Instances[].InstanceId --output text`; ' + _get_aws_query_command(region, '$id', 'VolumeType', specs['disk_tier']) + - ('' if disk_tier == resources_utils.DiskTier.LOW else - (_get_aws_query_command(region, '$id', 'Iops', - specs['disk_iops']) + - _get_aws_query_command(region, '$id', 'Throughput', - specs['disk_throughput']))), + ('' if specs['disk_tier'] + == 'standard' else _get_aws_query_command( + region, '$id', 'Iops', specs['disk_iops'])) + + ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( + region, '$id', 'Throughput', specs['disk_throughput'])), ], f'sky down -y {name}', timeout=10 * 60, # 10 mins (it takes around ~6 mins) @@ -3344,8 +3345,8 @@ def test_gcp_disk_tier(): @pytest.mark.azure def test_azure_disk_tier(): for disk_tier in list(resources_utils.DiskTier): - if disk_tier == resources_utils.DiskTier.HIGH: - # Azure does not support high disk tier. + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + # Azure does not support high and ultra disk tier. 
continue type = Azure._get_disk_type(disk_tier) name = _get_cluster_name() + '-' + disk_tier.value @@ -3436,6 +3437,43 @@ def test_gcp_zero_quota_failover(): run_one_test(test) +def test_long_setup_run_script(generic_cloud: str): + name = _get_cluster_name() + with tempfile.NamedTemporaryFile('w', prefix='sky_app_', + suffix='.yaml') as f: + f.write( + textwrap.dedent(""" \ + setup: | + echo "start long setup" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end long setup"\n') + f.write( + textwrap.dedent(""" \ + run: | + echo "run" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end run"\n') + f.flush() + + test = Test( + 'long-setup-run-script', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup --detach-run --cpus 2+ {f.name}', + f'sky exec --detach-run {name} "echo hello"', + f'sky exec --detach-run {name} {f.name}', + f'sky logs {name} --status 1', + f'sky logs {name} --status 2', + f'sky logs {name} --status 3', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + # ---------- Testing skyserve ---------- diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index 6fb9f1bcd14..70da0532e9b 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -125,7 +125,6 @@ def test_aws_make_deploy_variables(*mocks) -> None: 'disk_tier': 'gp3', 'disk_throughput': 218, 'disk_iops': 3500, - 'custom_disk_perf': True, 'docker_image': None, 'docker_container_name': 'sky_container', 'docker_login_config': None,