diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css index 4bb1b67f9e3..d5bbdd6cb51 100644 --- a/docs/source/_static/custom.css +++ b/docs/source/_static/custom.css @@ -115,7 +115,7 @@ html[data-theme="dark"] { padding: 2px 5px; /* Reduced padding for a more compact label */ margin-left: 6px; /* Space between the text and the label */ - vertical-align: middle; + vertical-align: text-bottom; line-height: 1; /* Adjust line height to ensure vertical alignment */ } diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index e0de1b50d51..31b3692d8b8 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -27,8 +27,9 @@ document.addEventListener('DOMContentLoaded', () => { const newItems = [ { selector: '.caption-text', text: 'SkyServe: Model Serving' }, { selector: '.toctree-l1 > a', text: 'Managed Jobs' }, - { selector: '.toctree-l1 > a', text: 'Running on Kubernetes' }, { selector: '.toctree-l1 > a', text: 'Llama-3.1 (Meta)' }, + { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, + { selector: '.toctree-l1 > a', text: 'Reserved, Capacity Blocks, DWS' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst new file mode 100644 index 00000000000..a6b76e8a53c --- /dev/null +++ b/docs/source/developers/index.rst @@ -0,0 +1,8 @@ +Developer Guides +================= + +.. toctree:: + :maxdepth: 1 + + ../developers/CONTRIBUTING + Guide: Adding a New Cloud diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index f8d329a1431..dbb9a32780d 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -129,8 +129,8 @@ Read the research: ../getting-started/installation ../getting-started/quickstart - ../getting-started/tutorial ../examples/interactive-development + ../getting-started/tutorial .. 
toctree:: @@ -141,8 +141,16 @@ Read the research: ../examples/managed-jobs ../reference/job-queue ../examples/auto-failover - ../reference/kubernetes/index ../running-jobs/distributed-jobs + ../running-jobs/many-jobs + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Reserved & Existing Clusters + + ../reservations/reservations + ../reference/kubernetes/index .. toctree:: :hidden: @@ -184,14 +192,6 @@ Read the research: SkyPilot vs. Other Systems <../reference/comparison> -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Developer Guides - - ../developers/CONTRIBUTING - Guide: Adding a New Cloud - .. toctree:: :hidden: :maxdepth: 1 @@ -210,4 +210,5 @@ Read the research: ../reference/cli ../reference/api ../reference/config + ../developers/index diff --git a/docs/source/examples/docker-containers.rst b/docs/source/examples/docker-containers.rst index 582db94ee79..408a53a6185 100644 --- a/docs/source/examples/docker-containers.rst +++ b/docs/source/examples/docker-containers.rst @@ -161,6 +161,15 @@ Any GPUs assigned to the task will be automatically mapped to your Docker contai 2. The container image must grant sudo permissions without requiring password authentication for the user. Having a root user is also acceptable. +.. note:: + + Using a container with a customized entrypoint as a runtime environment is + supported, with the container's entrypoint being overridden by :code:`/bin/bash`. + Specific commands can be executed in the :code:`setup` and :code:`run` sections + of the task YAML file. However, this approach is not compatible with RunPod due + to limitations in the RunPod API, so ensure that you choose a container with a + default entrypoint (i.e. :code:`/bin/bash`). 
+ Private Registries ^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index bfc6fd17e05..cdef2335dd7 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -219,7 +219,7 @@ Congratulations! In this quickstart, you have launched a cluster, run a task, a Next steps: -- Adapt :ref:`Tutorial: DNN Training ` to start running your own project on SkyPilot! +- Adapt :ref:`Tutorial: AI Training ` to start running your own project on SkyPilot! - See the :ref:`Task YAML reference `, :ref:`CLI reference `, and `more examples `_ - To learn more, try out `SkyPilot Tutorials `_ in Jupyter notebooks diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index 92ef6f68b2f..175f1391a6d 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -1,6 +1,6 @@ -.. _dnn-training: +.. _ai-training: -Tutorial: DNN Training +Tutorial: AI Training ====================== This example uses SkyPilot to train a Transformer-based language model from HuggingFace. diff --git a/docs/source/reference/faq.rst b/docs/source/reference/faq.rst index 5569c6ec145..5a966a0014f 100644 --- a/docs/source/reference/faq.rst +++ b/docs/source/reference/faq.rst @@ -213,20 +213,3 @@ To launch a VS Code tunnel using a SkyPilot task definition, you can use the fol Note that you'll be prompted to authenticate with your GitHub account to launch a VS Code tunnel. -PyTorch 2.2.0 failed on SkyPilot clusters. What should I do? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The latest PyTorch release (2.2.0) has a version conflict with the default cuDNN version on SkyPilot clusters, which may raise a segmentation fault when you run the job. - -To fix this, you can choose one of the following solutions: - -1. Use older version of PyTorch (like 2.1.0) instead of 2.2.0, i.e. 
:code:`pip install "torch<2.2"`; -2. Remove the cuDNN from the cluster's :code:`LD_LIBRARY_PATH` by adding the following line to your task: - -.. code-block:: yaml - - run: | - export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's|:/usr/local/cuda/lib64||g; s|/usr/local/cuda/lib64:||g; s|/usr/local/cuda/lib64||g') - # Other commands using PyTorch 2.2.0 - ... - diff --git a/docs/source/reference/job-queue.rst b/docs/source/reference/job-queue.rst index 6397c7bbbb6..c0016c4d6da 100644 --- a/docs/source/reference/job-queue.rst +++ b/docs/source/reference/job-queue.rst @@ -160,7 +160,7 @@ SkyPilot's scheduler serves two goals: 2. **Minimizing resource idleness**: If a resource is idle, SkyPilot will schedule a queued job that can utilize that resource. -We illustrate the scheduling behavior by revisiting :ref:`Tutorial: DNN Training `. +We illustrate the scheduling behavior by revisiting :ref:`Tutorial: AI Training `. In that tutorial, we have a task YAML that specifies these resource requirements: .. code-block:: yaml diff --git a/docs/source/reference/kubernetes/index.rst b/docs/source/reference/kubernetes/index.rst index bde97615e80..86e153bd8fc 100644 --- a/docs/source/reference/kubernetes/index.rst +++ b/docs/source/reference/kubernetes/index.rst @@ -1,7 +1,7 @@ .. _kubernetes-overview: -Running on Kubernetes -============================= +Using Kubernetes +================ SkyPilot tasks can be run on your private on-prem or cloud Kubernetes clusters. The Kubernetes cluster gets added to the list of "clouds" in SkyPilot and SkyPilot @@ -116,4 +116,4 @@ Kubernetes support is under active development. 
Some features are in progress an * Multi-node tasks - ✅ Available * Custom images - ✅ Available * Opening ports and exposing services - ✅ Available -* Multiple Kubernetes Clusters - 🚧 In progress \ No newline at end of file +* Multiple Kubernetes Clusters - 🚧 In progress diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 0354d3d0395..228cbd7c88f 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -113,12 +113,14 @@ Available fields: disk_size: 256 # Disk tier to use for OS (optional). - # Could be one of 'low', 'medium', 'high' or 'best' (default: 'medium'). + # Could be one of 'low', 'medium', 'high', 'ultra' or 'best' (default: 'medium'). # if 'best' is specified, use the best disk tier enabled. # Rough performance estimate: - # low: 500 IOPS; read 20MB/s; write 40 MB/s - # medium: 3000 IOPS; read 220 MB/s; write 200 MB/s - # high: 6000 IOPS; 340 MB/s; write 250 MB/s + # low: 1000 IOPS; read 90 MB/s; write 90 MB/s + # medium: 3000 IOPS; read 220 MB/s; write 220 MB/s + # high: 6000 IOPS; read 400 MB/s; write 400 MB/s + # ultra: 60000 IOPS; read 4000 MB/s; write 3000 MB/s + # Measured by examples/perf/storage_rawperf.yaml disk_tier: medium # Ports to expose (optional). @@ -335,8 +337,8 @@ Available fields: .. _task-yaml-experimental: -Experimental ------------- +Experimental Configurations +--------------------------- .. note:: diff --git a/docs/source/reservations/reservations.rst b/docs/source/reservations/reservations.rst new file mode 100644 index 00000000000..8d0625846f7 --- /dev/null +++ b/docs/source/reservations/reservations.rst @@ -0,0 +1,208 @@ + +.. _reservation: + +Reserved, Capacity Blocks, DWS +=================================== + + +With the recent GPU shortage, reservations from cloud providers have become a common way to ensure GPU availability for a specific duration. 
These reservations can be short-term (e.g., 1-30 days) capacity guarantees, or long-term (e.g., 1-3 years) contracts. + +This guide shows how to use SkyPilot to request resources from reservations and even combine them with on-demand/spot resources to fully +utilize the capacity in your cloud accounts. + +.. image:: https://i.imgur.com/FA0BT0E.png + :width: 95% + :align: center + + +AWS Capacity Reservations & Capacity Blocks +-------------------------------------------- + +AWS **capacity reservations** and **capacity blocks** are ways to reserve a certain amount of compute capacity for a period of time. The latter is for high-end GPUs, such as A100s (P4d instances) and H100s (P5d instances), while the former is for all other instance types. +Instead of committing to a 1-3 year long contract, you can get a capacity reservation or capacity block for as short as 1 second or 1 day, respectively. + + +To request capacity reservations/blocks, see the official docs: + +* `AWS Capacity Reservations `_ +* `AWS Capacity Blocks `_ + +Once you have successfully created a reservation/block, you will get an ID of the reservation/block, such as ``cr-012345678``. + +To use the reservation/block, you can specify two fields in ``~/.sky/config.yaml``: + +* ``aws.prioritize_reservations``: whether to prioritize launching clusters from capacity reservations in any region/zone over on-demand/spot clusters. This is useful to fully utilize your reserved capacity created with ``Instance eligibility: open``. +* ``aws.specific_reservations``: a list of reservation IDs that can be used by SkyPilot. This is useful if you have multiple capacity reservations or blocks with ``Instance eligibility: targeted`` for different instance types in multiple regions/zones. + + +Example: + +.. 
code-block:: yaml + + aws: + prioritize_reservations: true + specific_reservations: + # 1x H100 capacity block in us-east-1 + - "cr-0123456789" + # 2x A100 reservation in us-east-2 + - "cr-123456789a" + # 2x A100 reservation in us-west-2 + - "cr-23456789ab" + # 2x M5a.16xlarge reservation in us-east-1 + - "cr-3456789abc" + +For more details of the fields, see :ref:`config-yaml`. + +.. note:: + + If any of the fields are specified, SkyPilot optimizer may take around 30 seconds to retrieve the latest reservation/block status on all regions and zones from your AWS account. + + +.. _utilizing-reservations: + +Utilizing Reservations +~~~~~~~~~~~~~~~~~~~~~~ + +By specifying configurations above, SkyPilot will prioritize using any available capacity in reservation/block (i.e., consider them as zero cost) whenever you launch a cluster/job. + +Specifically, SkyPilot's behavior is as follows: + +1. Query reservations/blocks across AWS regions and zones to find all available capacity. (If the task specifies specific regions or zones to use, only those are queried.) +2. For each zone, calculate its cost: any available reserved capacity is considered as zero cost, and if any on-demand/spot resource is needed to supplement the available reserved capacity to fully satisfy the request, their on-demand/spot price is included. +3. :ref:`Automatically failover ` through these zones in increasing per-zone cost order until the requested resources are provisioned. + + +For example, if you are launching a cluster with the following SkyPilot YAML: + +.. code-block:: yaml + + resources: + cloud: aws + accelerators: A100:8 + + num_nodes: 2 + + +SkyPilot will utilize the capacity reservation/block as follows: + +1. Query reservations/blocks in ``us-east-2`` and ``us-west-2`` in reservation ``cr-123456789a`` and ``cr-23456789ab``, respectively. Assume the results are: + + - 1 A100 instance capacity is available in ``us-east-2``, + - No available capacity in ``us-west-2``. +2. 
SkyPilot calculates the pricing for all zones as described above. The result is ``us-east-2`` zones are cheaper than all other zones, because the former's costs are 1 on-demand node's cost for 2 nodes (by satisfying 1 node using the reserved capacity). +3. SkyPilot will thus try to provision an on-demand A100 instance in ``us-east-2``. On unavailability, SkyPilot will continue to :ref:`automatically failover ` to other clouds/regions/zones for normal on-demand/spot instances. + + +.. hint:: + + If you have a capacity block with a starting time in the future, you can run ``sky jobs launch --region us-east-1 --gpus H100:8 task.yaml`` to let SkyPilot automatically wait until the starting time is reached. Namely, you don't have to wake up at 4:30am PDT to launch your job on a newly available capacity block. + + +GCP reservations +----------------- + +GCP reservations are similar to AWS capacity reservations, where you can reserve a certain amount of compute capacity for any period of time. + +To get a reservation, see the `GCP official docs `__. + +Like AWS, you can specify two fields in ``~/.sky/config.yaml``: + +* ``gcp.prioritize_reservations``: whether to prioritize launching clusters from reservations in any region/zone over on-demand/spot clusters. This is useful to fully utilize your `automatically consumed reservations `__. +* ``gcp.specific_reservations``: a list of reservation IDs that can be used by SkyPilot. This is useful if you have multiple `specific reservations `__ for different instance types in multiple regions/zones. + +Example: + +.. code-block:: yaml + + gcp: + prioritize_reservations: true + specific_reservations: + - projects/my-project/reservations/my-reservation1 + - projects/my-project/reservations/my-reservation2 + + +SkyPilot will utilize the reservations similar to AWS reservations as described in :ref:`utilizing-reservations`. 
+ + +GCP Dynamic Workload Scheduler (DWS) +------------------------------------- + +GCP `Dynamic Workload Scheduler (DWS) `__ is a resource management service that (1) receives a GPU capacity request, (2) automatically provisions the requested resources when they become available, and (3) keeps the resources running for a specified duration. + +.. tip:: + + It has been observed that using DWS can significantly increase the chance of getting a high-end GPU resource, such as A100s and H100s, compared to using on-demand or spot instances. + + +Using DWS for VMs +~~~~~~~~~~~~~~~~~ + +SkyPilot allows you to launch resources via DWS by specifying the ``gcp.managed_instance_group`` field in ``~/.sky/config.yaml``: + +.. code-block:: yaml + + gcp: + managed_instance_group: + run_duration: 3600 + provision_timeout: 900 + + +1. ``run_duration``: duration for a created instance to be kept alive (in seconds, required). +2. ``provision_timeout``: timeout for provisioning an instance with DWS (in seconds, optional). If the timeout is reached without requested resources being provisioned, SkyPilot will automatically :ref:`failover ` to other clouds/regions/zones to get the resources. + +See :ref:`config-yaml` for more details. + +In case you want to specify the DWS configuration for each job/cluster, you can also specify the configuration in the SkyPilot task YAML (see :ref:`here `): + +.. code-block:: yaml + + experimental: + config_overrides: + gcp: + managed_instance_group: + run_duration: 3600 + provision_timeout: 900 + + resources: + cloud: gcp + accelerators: A100:8 + + num_nodes: 4 + +Using DWS on GKE with Kueue +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DWS is also supported on Google Kubernetes Engine (GKE) with Kueue. To enable DWS on GKE, you need to set up your GKE cluster with Kueue and DWS; see the `GCP official docs `__. + +To launch a SkyPilot cluster or job on GKE with DWS, you can specify the DWS configuration in the SkyPilot task YAML: + +.. 
code-block:: yaml + + experimental: + config_overrides: + kubernetes: + pod_config: + metadata: + annotations: + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "3600" + provision_timeout: 900 + + resources: + cloud: kubernetes + accelerators: A100:8 + labels: + kueue.x-k8s.io/queue-name: dws-local-queue + +1. ``kueue.x-k8s.io/queue-name``: name of the Kueue queue to submit your resource request to. +2. ``provreq.kueue.x-k8s.io/maxRunDurationSeconds``: maximum duration for a created instance to be kept alive (in seconds, required). +3. ``provision_timeout``: timeout for provisioning an instance with DWS (in seconds, optional). If the timeout is reached without getting the requested resources, SkyPilot will automatically :ref:`failover ` to other clouds/regions/zones to get the resources. + +Long-term reservations +---------------------- + +Unlike short-term reservations above, long-term reservations are typically more than one month long and can be viewed as a type of *on-prem cluster*. + +SkyPilot supports long-term reservations and on-premise clusters through Kubernetes, i.e., you can set up a Kubernetes cluster on top of your reserved resources and interact with them through SkyPilot. + +See the simple steps to set up a Kubernetes cluster on existing machines in :ref:`kubernetes-overview`. + diff --git a/docs/source/running-jobs/distributed-jobs.rst b/docs/source/running-jobs/distributed-jobs.rst index 9eb590c10bc..da3ddd8e94f 100644 --- a/docs/source/running-jobs/distributed-jobs.rst +++ b/docs/source/running-jobs/distributed-jobs.rst @@ -1,6 +1,6 @@ .. _dist-jobs: -Distributed Jobs on Many Nodes +Distributed Multi-Node Jobs ================================================ SkyPilot supports multi-node cluster diff --git a/docs/source/running-jobs/many-jobs.rst b/docs/source/running-jobs/many-jobs.rst new file mode 100644 index 00000000000..3d26d74e794 --- /dev/null +++ b/docs/source/running-jobs/many-jobs.rst @@ -0,0 +1,346 @@ + +..
_many-jobs: + +Many Parallel Jobs +====================== + +SkyPilot allows you to easily **run many jobs in parallel** and manage them in a single system. This is useful for hyperparameter tuning sweeps, data processing, and other batch jobs. + +This guide shows a typical workflow for running many jobs with SkyPilot. + + +.. image:: https://i.imgur.com/tvxeNyR.png + :width: 90% + :align: center +.. TODO: Show the components in a GIF. + + +Why Use SkyPilot to Run Many Jobs +------------------------------------- + +- **Unified**: Use any or multiple of your own infrastructure (Kubernetes, cloud VMs, reservations, etc.). +- **Elastic**: Scale up and down based on demands. +- **Cost-effective**: Only pay for the cheapest resources. +- **Robust**: Automatically recover jobs from failures. +- **Observable**: Monitor and manage all jobs in a single pane of glass. + +Write a YAML for One Job +----------------------------------- + +Before scaling up to many jobs, write a SkyPilot YAML for a single job first and ensure it runs correctly. This can save time by avoiding debugging many jobs at once. + +Here is the same example YAML as in :ref:`Tutorial: AI Training `: + +.. raw:: html + +
+ Click to expand: train.yaml + +.. code-block:: yaml + + # train.yaml + name: huggingface + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --max_steps 50 \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 + + +.. raw:: html + +
+ + +First, launch the job to check it successfully launches and runs correctly: + +.. code-block:: bash + + sky launch -c train train.yaml + + +If there is any error, you can fix the code and/or the YAML, and launch the job again on the same cluster: + +.. code-block:: bash + + # Cancel the latest job. + sky cancel train -y + # Run the job again on the same cluster. + sky launch -c train train.yaml + + +Sometimes, it may be more efficient to log into the cluster and interactively debug the job. You can do so by directly :ref:`ssh'ing into the cluster or using VSCode's remote ssh `. + +.. code-block:: bash + + # Log into the cluster. + ssh train + + + +Next, after confirming the job is working correctly, **add (hyper)parameters** to the job YAML so that all job variants can be specified. + +1. Add Hyperparameters +~~~~~~~~~~~~~~~~~~~~~~ + +To launch jobs with different hyperparameters, add them as :ref:`environment variables ` to the SkyPilot YAML, and make your main program read these environment variables: + +.. raw:: html + +
+ Updated SkyPilot YAML: train-template.yaml + +.. code-block:: yaml + :emphasize-lines: 4-6,28-29 + + # train-template.yaml + name: huggingface + + envs: + LR: 2e-5 + MAX_STEPS: 50 + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate ${LR} \ + --max_steps ${MAX_STEPS} \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 + +.. raw:: html + +
 + +You can now use ``--env`` to launch a job with different hyperparameters: + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env LR=1e-5 \ + --env MAX_STEPS=100 + +Alternatively, store the environment variable values in a dotenv file and use ``--env-file`` to launch: + +.. code-block:: bash + + # configs/job1 + LR=1e-5 + MAX_STEPS=100 + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env-file configs/job1 + + + +2. Logging Job Outputs +~~~~~~~~~~~~~~~~~~~~~~~ + +When running many jobs, it is useful to log the outputs of all jobs. You can use tools like `W&B `__ for this purpose: + +.. raw:: html + +
+ SkyPilot YAML with W&B: train-template.yaml + +.. code-block:: yaml + :emphasize-lines: 7-7,19-19,34-34 + + # train-template.yaml + name: huggingface + + envs: + LR: 2e-5 + MAX_STEPS: 50 + WANDB_API_KEY: # Empty field means this field is required when launching the job. + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + pip install wandb + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate ${LR} \ + --max_steps ${MAX_STEPS} \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 \ + --report_to wandb + +.. raw:: html + +
 + +You can now launch the job with the following command (``WANDB_API_KEY`` should exist in your local environment variables). + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env-file configs/job1 \ + --env WANDB_API_KEY + + + +Scale Out to Many Jobs +----------------------- + +With the above setup, you can now scale out to run many jobs in parallel. You +can either use SkyPilot CLI with many config files or use SkyPilot Python API. + +With CLI and Config Files +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can run many jobs in parallel by (1) creating multiple config files and (2) +submitting them as :ref:`SkyPilot managed jobs `. + +First, create a config file for each job (for example, in a ``configs`` directory): + +.. code-block:: bash + + # configs/job-1 + LR=1e-5 + MAX_STEPS=100 + + # configs/job-2 + LR=2e-5 + MAX_STEPS=200 + + ... + +.. raw:: html + +
+ An example Python script to generate config files + +.. code-block:: python + + import os + + CONFIG_PATH = 'configs' + LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0] + MAX_STEPS_CANDIDATES = [100, 300, 1000] + + os.makedirs(CONFIG_PATH, exist_ok=True) + + job_idx = 1 + for lr in LR_CANDIDATES: + for max_steps in MAX_STEPS_CANDIDATES: + config_file = f'{CONFIG_PATH}/job-{job_idx}' + with open(config_file, 'w') as f: + print(f'LR={lr}', file=f) + print(f'MAX_STEPS={max_steps}', file=f) + job_idx += 1 + +.. raw:: html + +
+ +Then, submit all jobs by iterating over the config files and calling ``sky jobs launch`` on each: + +.. code-block:: bash + + for config_file in configs/*; do + job_name=$(basename $config_file) + # -y: yes to all prompts. + # -d: detach from the job's logging, so the next job can be submitted + # without waiting for the previous job to finish. + sky jobs launch -n train-$job_name -y -d train-template.yaml \ + --env-file $config_file \ + --env WANDB_API_KEY + done + + +Job statuses can be checked via ``sky jobs queue``: + +.. code-block:: console + + $ sky jobs queue + + Fetching managed job statuses... + Managed jobs + In progress tasks: 10 RUNNING + ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS + 10 - train-job10 1x[V100:4] 5 mins ago 5m 5s 1m 12s 0 RUNNING + 9 - train-job9 1x[V100:4] 6 mins ago 6m 11s 2m 23s 0 RUNNING + 8 - train-job8 1x[V100:4] 7 mins ago 7m 15s 3m 31s 0 RUNNING + ... + + +With Python API +~~~~~~~~~~~~~~~ + +To have more customized control over generation of job variants, you can also use SkyPilot Python API to launch the jobs. + +.. 
code-block:: python + + import os + import sky + + LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0] + MAX_STEPS_CANDIDATES = [100, 300, 1000] + task = sky.Task.from_yaml('train-template.yaml') + + job_idx = 1 + for lr in LR_CANDIDATES: + for max_steps in MAX_STEPS_CANDIDATES: + task.update_envs({'LR': lr, 'MAX_STEPS': max_steps}) + sky.jobs.launch( + task, + name=f'train-job{job_idx}', + detach_run=True, + retry_until_up=True, + ) + job_idx += 1 diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 9986f93275a..e316a1380bb 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -882,6 +882,11 @@ def write_cluster_config( f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\'' ) + # We disable conda auto-activation if the user has specified a docker image + # to use, which is likely to already have a conda environment activated. + conda_auto_activate = ('true' if to_provision.extract_docker_image() is None + else 'false') + # Use a tmp file path to avoid incomplete YAML file being re-used in the # future. tmp_yaml_path = yaml_path + '.tmp' @@ -916,10 +921,11 @@ def write_cluster_config( 'specific_reservations': specific_reservations, # Conda setup - 'conda_installation_commands': - constants.CONDA_INSTALLATION_COMMANDS, # We should not use `.format`, as it contains '{}' as the bash # syntax. 
+ 'conda_installation_commands': + constants.CONDA_INSTALLATION_COMMANDS.replace( + '{conda_auto_activate}', conda_auto_activate), 'ray_skypilot_installation_commands': (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace( '{sky_wheel_hash}', diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 9545436f05c..ca18f44f6da 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -82,7 +82,7 @@ clouds.AWS: 90, clouds.Azure: 90, clouds.GCP: 240, - clouds.Lambda: 150, + clouds.Lambda: 300, clouds.IBM: 160, clouds.OCI: 300, clouds.Paperspace: 600, @@ -1933,7 +1933,7 @@ def provision_with_retries( while True: if (isinstance(to_provision.cloud, clouds.Azure) and to_provision.accelerators is not None and - 'A10' in to_provision.accelerators): + 'A10' in to_provision.accelerators and prev_handle is None): logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch ' 'an A10 cluster on Azure. This may take ~20 ' 'minutes due to driver installation.' 
@@ -3112,7 +3112,8 @@ def _setup_node(node_id: int) -> None: setup_script = log_lib.make_task_bash_script(setup, env_vars=setup_envs) encoded_script = shlex.quote(setup_script) - if detach_setup or _is_command_length_over_limit(encoded_script): + + def _dump_setup_script(setup_script: str) -> None: with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f: f.write(setup_script) f.flush() @@ -3121,6 +3122,9 @@ def _setup_node(node_id: int) -> None: target=remote_setup_file_name, up=True, stream_logs=False) + + if detach_setup or _is_command_length_over_limit(encoded_script): + _dump_setup_script(setup_script) create_script_code = 'true' else: create_script_code = (f'{{ echo {encoded_script} > ' @@ -3128,20 +3132,42 @@ def _setup_node(node_id: int) -> None: if detach_setup: return + setup_log_path = os.path.join(self.log_dir, f'setup-{runner.node_id}.log') - returncode = runner.run( - f'{create_script_code} && {setup_cmd}', - log_path=setup_log_path, - process_stream=False, - # We do not source bashrc for setup, since bashrc is sourced - # in the script already. - # Skip an empty line and two lines due to the /bin/bash -i and - # source ~/.bashrc in the setup_cmd. - # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long - # bash: no job control in this shell - skip_lines=3, - ) + + def _run_setup(setup_cmd: str) -> int: + returncode = runner.run( + setup_cmd, + log_path=setup_log_path, + process_stream=False, + # We do not source bashrc for setup, since bashrc is sourced + # in the script already. + # Skip an empty line and two lines due to the /bin/bash -i + # and source ~/.bashrc in the setup_cmd. 
+ # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long + # bash: no job control in this shell + skip_lines=3) + return returncode + + returncode = _run_setup(f'{create_script_code} && {setup_cmd}',) + if returncode == 255: + is_message_too_long = False + with open(setup_log_path, 'r', encoding='utf-8') as f: + if 'too long' in f.read(): + is_message_too_long = True + + if is_message_too_long: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. We use a + # general length limit check before but it could be + # inaccurate on some systems. + logger.debug( + 'Failed to run setup command inline due to ' + 'command length limit. Dumping setup script to ' + 'file and running it with SSH.') + _dump_setup_script(setup_script) + returncode = _run_setup(setup_cmd) def error_message() -> str: # Use the function to avoid tailing the file in success case @@ -3223,7 +3249,8 @@ def _exec_code_on_head( code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd) job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code]) - if _is_command_length_over_limit(job_submit_cmd): + + def _dump_code_to_file(codegen: str) -> None: runners = handle.get_command_runners() head_runner = runners[0] with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp: @@ -3238,6 +3265,9 @@ def _exec_code_on_head( target=script_path, up=True, stream_logs=False) + + if _is_command_length_over_limit(job_submit_cmd): + _dump_code_to_file(codegen) job_submit_cmd = f'{mkdir_code} && {code}' if managed_job_dag is not None: @@ -3263,6 +3293,16 @@ def _exec_code_on_head( job_submit_cmd, stream_logs=False, require_outputs=True) + if returncode == 255 and 'too long' in stdout + stderr: + # If the setup script is too long, we retry it with dumping + # the script to a file and running it with SSH. 
We use a general + # length limit check before but it could be inaccurate on some + # systems. + _dump_code_to_file(codegen) + returncode, stdout, stderr = self.run_on_head(handle, + job_submit_cmd, + stream_logs=False, + require_outputs=True) # Happens when someone calls `sky exec` but remote is outdated # necessitating calling `sky launch`. diff --git a/sky/cli.py b/sky/cli.py index e50aca011a6..eb0267f7ced 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -29,6 +29,7 @@ import multiprocessing import os import shlex +import shutil import signal import subprocess import sys @@ -368,7 +369,9 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter, echo "{bashrc_diff}" >> ~/.bashrc' cmd = (f'(grep -q "SkyPilot" ~/.bashrc) || ' - f'[[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd})') + f'([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || ' + f'(echo "Bash must be version 4 or above." && exit 1))') + reload_cmd = _RELOAD_BASH_CMD elif value == 'fish': @@ -390,7 +393,10 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter, ctx.exit() try: - subprocess.run(cmd, shell=True, check=True, executable='/bin/bash') + subprocess.run(cmd, + shell=True, + check=True, + executable=shutil.which('bash')) click.secho(f'Shell completion installed for {value}', fg='green') click.echo( 'Completion will take effect once you restart the terminal: ' + diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 3a05223574d..693fc142eee 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -798,7 +798,11 @@ def instance_type_exists(self, instance_type): @classmethod def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str: - return 'standard' if disk_tier == resources_utils.DiskTier.LOW else 'gp3' + if disk_tier == resources_utils.DiskTier.LOW: + return 'standard' + if disk_tier == resources_utils.DiskTier.ULTRA: + return 'io2' + return 'gp3' @classmethod def _get_disk_specs( @@ -806,15 +810,19 @@ def _get_disk_specs( disk_tier: 
Optional[resources_utils.DiskTier]) -> Dict[str, Any]: tier = cls._translate_disk_tier(disk_tier) tier2iops = { + resources_utils.DiskTier.ULTRA: 20000, resources_utils.DiskTier.HIGH: 7000, resources_utils.DiskTier.MEDIUM: 3500, - resources_utils.DiskTier.LOW: 0, # only gp3 is required to set iops + resources_utils.DiskTier.LOW: 0, # iops is not required on standard disk } return { 'disk_tier': cls._get_disk_type(tier), - 'disk_iops': tier2iops[tier], - 'disk_throughput': tier2iops[tier] // 16, - 'custom_disk_perf': tier != resources_utils.DiskTier.LOW, + 'disk_iops': tier2iops[tier] + if cls._get_disk_type(tier) != 'standard' else None, + # Custom disk throughput is only available for gp3 + # see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-launchtemplate-ebs.html + 'disk_throughput': tier2iops[tier] // 16 + if cls._get_disk_type(tier) == 'gp3' else None, } @classmethod diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 928ceb5cc52..1768cd6091e 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -60,9 +60,10 @@ class Azure(clouds.Cloud): _MAX_CLUSTER_NAME_LEN_LIMIT = 42 _BEST_DISK_TIER = resources_utils.DiskTier.MEDIUM _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM - # Azure does not support high disk tier. - _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) - - {resources_utils.DiskTier.HIGH}) + # Azure does not support high disk and ultra disk tier. + _SUPPORTED_DISK_TIERS = ( + set(resources_utils.DiskTier) - + {resources_utils.DiskTier.HIGH, resources_utils.DiskTier.ULTRA}) _INDENT_PREFIX = ' ' * 4 @@ -599,9 +600,10 @@ def check_disk_tier( disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]: if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST: return True, '' - if disk_tier == resources_utils.DiskTier.HIGH: - return False, ('Azure disk_tier=high is not supported now. 
' - 'Please use disk_tier={low, medium} instead.') + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + return False, ( + 'Azure disk_tier={high, ultra} is not supported now. ' + 'Please use disk_tier={low, medium, best} instead.') # Only S-series supported premium ssd # see https://stackoverflow.com/questions/48590520/azure-requested-operation-cannot-be-performed-because-storage-account-type-pre # pylint: disable=line-too-long if cls._get_disk_type( @@ -628,6 +630,7 @@ def _get_disk_type(cls, # TODO(tian): Maybe use PremiumV2_LRS/UltraSSD_LRS? Notice these two # cannot be used as OS disks so we might need data disk support tier2name = { + resources_utils.DiskTier.ULTRA: 'Disabled', resources_utils.DiskTier.HIGH: 'Disabled', resources_utils.DiskTier.MEDIUM: 'Premium_LRS', resources_utils.DiskTier.LOW: 'Standard_LRS', diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 9775109ac80..7d3eb157c61 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -117,7 +117,7 @@ class Cloud: _REPR = '' _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM - _BEST_DISK_TIER = resources_utils.DiskTier.HIGH + _BEST_DISK_TIER = resources_utils.DiskTier.ULTRA _SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST} _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 643d55d7037..79a1453c581 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -7,7 +7,7 @@ import subprocess import time import typing -from typing import Dict, Iterator, List, Optional, Set, Tuple +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple import colorama @@ -437,6 +437,7 @@ def make_deploy_resources_variables( 'custom_resources': None, 'use_spot': r.use_spot, 'gcp_project_id': self.get_project_id(dryrun), + **GCP._get_disk_specs(r.disk_tier), } accelerators = r.accelerators if accelerators is not None: @@ -495,8 +496,6 @@ def make_deploy_resources_variables( 
resources_vars['machine_image'] = image_id resources_vars['image_id'] = None - resources_vars['disk_tier'] = GCP._get_disk_type(r.disk_tier) - firewall_rule = None if resources.ports is not None: firewall_rule = (USER_PORTS_FIREWALL_RULE_NAME.format( @@ -917,12 +916,24 @@ def _get_disk_type(cls, disk_tier: Optional[resources_utils.DiskTier]) -> str: tier = cls._translate_disk_tier(disk_tier) tier2name = { + resources_utils.DiskTier.ULTRA: 'pd-extreme', resources_utils.DiskTier.HIGH: 'pd-ssd', resources_utils.DiskTier.MEDIUM: 'pd-balanced', resources_utils.DiskTier.LOW: 'pd-standard', } return tier2name[tier] + @classmethod + def _get_disk_specs( + cls, + disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]: + specs: Dict[str, Any] = {'disk_tier': cls._get_disk_type(disk_tier)} + if disk_tier == resources_utils.DiskTier.ULTRA: + # Only pd-extreme supports custom iops. + # see https://cloud.google.com/compute/docs/disks#disk-types + specs['disk_iops'] = 20000 + return specs + @classmethod def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str: return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items()) diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 7875e26d9cc..57f3a9ffe16 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -42,7 +42,9 @@ class OCI(clouds.Cloud): _INDENT_PREFIX = ' ' - _SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier) + _SUPPORTED_DISK_TIERS = (set(resources_utils.DiskTier) - + {resources_utils.DiskTier.ULTRA}) + _BEST_DISK_TIER = resources_utils.DiskTier.HIGH @classmethod def _unsupported_features_for_resources( @@ -414,6 +416,19 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: f'{cls._INDENT_PREFIX}Error details: ' f'{common_utils.format_exception(e, use_bracket=True)}') + @classmethod + def check_disk_tier( + cls, instance_type: Optional[str], + disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]: + del instance_type # Unused. 
+ if disk_tier is None or disk_tier == resources_utils.DiskTier.BEST: + return True, '' + if disk_tier == resources_utils.DiskTier.ULTRA: + return False, ('OCI disk_tier=ultra is not supported now. ' + 'Please use disk_tier={low, medium, high, best} ' + 'instead.') + return True, '' + def get_credential_file_mounts(self) -> Dict[str, str]: """Returns a dict of credential file paths to mount paths.""" oci_cfg_file = oci_adaptor.get_config_file() diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 141b356712e..2d323cbac5f 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -110,7 +110,8 @@ def get_default_instance_type( _DEFAULT_INSTANCE_FAMILY)] def _filter_disk_type(instance_type: str) -> bool: - return Azure.check_disk_tier(instance_type, disk_tier)[0] + valid, _ = Azure.check_disk_tier(instance_type, disk_tier) + return valid df = df.loc[df['InstanceType'].apply(_filter_disk_type)] return common.get_instance_type_for_cpus_mem_impl(df, cpus, diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index fbbe0fdcef1..1b5fec9e8e8 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -58,7 +58,9 @@ class InstanceTypeInfo(NamedTuple): def get_catalog_path(filename: str) -> str: - return os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, filename) + catalog_path = os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, filename) + os.makedirs(os.path.dirname(catalog_path), exist_ok=True) + return catalog_path def is_catalog_modified(filename: str) -> bool: @@ -225,7 +227,7 @@ def _update_catalog(): with open(meta_path + '.md5', 'w', encoding='utf-8') as f: f.write(hashlib.md5(r.text.encode()).hexdigest()) - logger.info(f'Updated {cloud} catalog.') + logger.debug(f'Updated {cloud} catalog {filename}.') return LazyDataFrame(catalog_path, update_func=_update_catalog) diff --git 
a/sky/clouds/service_catalog/oci_catalog.py b/sky/clouds/service_catalog/oci_catalog.py index 2561b913dcf..a18dee79be5 100644 --- a/sky/clouds/service_catalog/oci_catalog.py +++ b/sky/clouds/service_catalog/oci_catalog.py @@ -15,6 +15,7 @@ from typing import Dict, List, Optional, Tuple from sky.adaptors import oci as oci_adaptor +from sky.clouds import OCI from sky.clouds.service_catalog import common from sky.clouds.utils import oci_utils from sky.utils import resources_utils @@ -102,7 +103,6 @@ def get_default_instance_type( cpus: Optional[str] = None, memory: Optional[str] = None, disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]: - del disk_tier # unused if cpus is None: cpus = f'{oci_utils.oci_config.DEFAULT_NUM_VCPUS}+' @@ -111,12 +111,17 @@ def get_default_instance_type( else: memory_gb_or_ratio = memory + def _filter_disk_type(instance_type: str) -> bool: + valid, _ = OCI.check_disk_tier(instance_type, disk_tier) + return valid + instance_type_prefix = tuple( f'{family}' for family in oci_utils.oci_config.DEFAULT_INSTANCE_FAMILY) df = _get_df() df = df[df['InstanceType'].notna()] df = df[df['InstanceType'].str.startswith(instance_type_prefix)] + df = df.loc[df['InstanceType'].apply(_filter_disk_type)] logger.debug(f'# get_default_instance_type: {df}') return common.get_instance_type_for_cpus_mem_impl(df, cpus, diff --git a/sky/exceptions.py b/sky/exceptions.py index 99784a8c96d..15f3ea3f34e 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -100,9 +100,13 @@ def __init__(self, returncode: int, command: str, error_msg: str, self.command = command self.error_msg = error_msg self.detailed_reason = detailed_reason + if not command: message = error_msg else: + if len(command) > 100: + # Chunk the command to avoid overflow. + command = command[:100] + '...'
message = (f'Command {command} failed with return code ' f'{returncode}.\n{error_msg}') super().__init__(message) diff --git a/sky/optimizer.py b/sky/optimizer.py index 10aa697258b..4326329579d 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -19,6 +19,7 @@ from sky.adaptors import common as adaptors_common from sky.utils import env_options from sky.utils import log_utils +from sky.utils import resources_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import ux_utils @@ -935,6 +936,15 @@ def sort_key(row, accelerator_spot_list=accelerator_spot_list): table.add_rows(rows) logger.info(f'{table}\n') + # Warning message for using disk_tier=ultra + # TODO(yi): Consider price of disks in optimizer and + # move this warning there. + if chosen_resources.disk_tier == resources_utils.DiskTier.ULTRA: + logger.warning( + 'Using disk_tier=ultra will utilize more advanced disks ' + '(io2 Block Express on AWS and extreme persistent disk on ' + 'GCP), which can lead to significant higher costs (~$2/h).') + @staticmethod def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates): for node, candidate_set in node_to_candidate_map.items(): diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index e989fbc085a..7bfa1724b83 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -110,8 +110,8 @@ def docker_start_cmds( '--cap-add=SYS_ADMIN', '--device=/dev/fuse', '--security-opt=apparmor:unconfined', + '--entrypoint=/bin/bash', image, - 'bash', ] return ' '.join(docker_run) diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 7ad3d72e46b..8ac3ab1d4ca 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -110,8 +110,9 @@ def get_gke_accelerator_name(accelerator: str) -> str: if accelerator == 'H100': # H100 is named as H100-80GB in GKE. 
accelerator = 'H100-80GB' - if accelerator in ('A100-80GB', 'L4', 'H100-80GB'): - # A100-80GB, L4 and H100-80GB have a different name pattern. + if accelerator in ('A100-80GB', 'L4', 'H100-80GB', 'H100-MEGA-80GB'): + # A100-80GB, L4, H100-80GB and H100-MEGA-80GB + # have a different name pattern. return 'nvidia-{}'.format(accelerator.lower()) else: return 'nvidia-tesla-{}'.format(accelerator.lower()) @@ -194,13 +195,10 @@ def get_accelerator_from_label_value(cls, value: str) -> str: return value.replace('nvidia-tesla-', '').upper() elif value.startswith('nvidia-'): acc = value.replace('nvidia-', '').upper() - if acc in ['H100-80GB', 'H100-MEGA-80GB']: - # H100 is named H100-80GB or H100-MEGA-80GB in GKE, - # where the latter has improved bandwidth. - # See a3-mega instances on GCP. - # TODO: we do not distinguish the two GPUs for simplicity, - # but we can evaluate whether we should distinguish - # them based on users' requests. + if acc == 'H100-80GB': + # H100 can be either H100-80GB or H100-MEGA-80GB in GKE + # we map H100 ---> H100-80GB and keep H100-MEGA-80GB + # to distinguish between a3-high and a3-mega instances return 'H100' return acc else: diff --git a/sky/provision/runpod/utils.py b/sky/provision/runpod/utils.py index 24af263f13c..f1587463e84 100644 --- a/sky/provision/runpod/utils.py +++ b/sky/provision/runpod/utils.py @@ -77,7 +77,11 @@ def list_instances() -> Dict[str, Dict[str, Any]]: info['name'] = instance['name'] info['port2endpoint'] = {} - if instance['desiredStatus'] == 'RUNNING' and instance.get('runtime'): + # Sometimes when the cluster is in the process of being created, + # the `port` field in the runtime is None and we need to check for it. 
+ if (instance['desiredStatus'] == 'RUNNING' and + instance.get('runtime') and + instance.get('runtime').get('ports')): for port in instance['runtime']['ports']: if port['isIpPublic']: if port['privatePort'] == 22: diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 30820a3a91e..f23dc8100b5 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -135,8 +135,9 @@ # true. '{ bash Miniconda3-Linux-x86_64.sh -b; ' 'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && ' - 'conda config --set auto_activate_base true && ' - f'conda activate base; }}; }}; ' + # Caller should replace {conda_auto_activate} with either true or false. + 'conda config --set auto_activate_base {conda_auto_activate} && ' + 'conda activate base; }; }; ' 'grep "# >>> conda initialize >>>" ~/.bashrc || ' '{ conda init && source ~/.bashrc; };' # If Python version is larger then equal to 3.12, create a new conda env @@ -145,7 +146,7 @@ # costly to create a new conda env, and venv should be a lightweight and # faster alternative when the python version satisfies the requirement. '[[ $(python3 --version | cut -d " " -f 2 | cut -d "." -f 2) -ge 12 ]] && ' - f'echo "Creating conda env with Python 3.10" && ' + 'echo "Creating conda env with Python 3.10" && ' f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && ' f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};' # Create a separate conda environment for SkyPilot dependencies. 
diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py index 06c5d6d48af..4f66ef54383 100644 --- a/sky/skylet/providers/command_runner.py +++ b/sky/skylet/providers/command_runner.py @@ -65,8 +65,8 @@ def docker_start_cmds( '--cap-add=SYS_ADMIN', '--device=/dev/fuse', '--security-opt=apparmor:unconfined', + '--entrypoint=/bin/bash', image, - 'bash', ] return ' '.join(docker_run) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 7e9dfccdaf1..6afdf381cc0 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -73,8 +73,10 @@ available_node_types: VolumeSize: {{disk_size}} VolumeType: {{disk_tier}} Encrypted: {{disk_encrypted}} - {% if custom_disk_perf %} + {% if disk_iops %} Iops: {{disk_iops}} + {% endif %} + {% if disk_throughput %} Throughput: {{disk_throughput}} {% endif %} {% if use_spot %} diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index bcc16bac531..5f06eef05c7 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -124,6 +124,9 @@ available_node_types: sourceImage: {{image_id}} {%- endif %} diskType: zones/{{zones}}/diskTypes/{{disk_tier}} + {%- if disk_iops %} + provisionedIops: {{disk_iops}} + {%- endif %} {%- if gpu is not none %} guestAccelerators: - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}} diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index 95c784143cc..6f5c07f7d25 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -24,6 +24,7 @@ class DiskTier(enum.Enum): LOW = 'low' MEDIUM = 'medium' HIGH = 'high' + ULTRA = 'ultra' BEST = 'best' @classmethod diff --git a/tests/test_optimizer_dryruns.py b/tests/test_optimizer_dryruns.py index becf3ba461a..dfda65e23da 100644 --- a/tests/test_optimizer_dryruns.py +++ b/tests/test_optimizer_dryruns.py @@ -771,3 +771,10 @@ def _get_all_candidate_cloud(r: sky.Resources) -> 
Set[clouds.Cloud]: assert high_tier_candidates == set( map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp', 'oci'])), high_tier_candidates + + # Only AWS, GCP supports ULTRA disk tier. + ultra_tier_resources = sky.Resources( + disk_tier=resources_utils.DiskTier.ULTRA) + ultra_tier_candidates = _get_all_candidate_cloud(ultra_tier_resources) + assert ultra_tier_candidates == set( + map(clouds.CLOUD_REGISTRY.get, ['aws', 'gcp'])), ultra_tier_candidates diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f338de2dda7..63ccd19857d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -34,6 +34,7 @@ import subprocess import sys import tempfile +import textwrap import time from typing import Dict, List, NamedTuple, Optional, Tuple import urllib.parse @@ -3304,11 +3305,11 @@ def _get_aws_query_command(region, instance_id, field, expected): f'Reservations[].Instances[].InstanceId --output text`; ' + _get_aws_query_command(region, '$id', 'VolumeType', specs['disk_tier']) + - ('' if disk_tier == resources_utils.DiskTier.LOW else - (_get_aws_query_command(region, '$id', 'Iops', - specs['disk_iops']) + - _get_aws_query_command(region, '$id', 'Throughput', - specs['disk_throughput']))), + ('' if specs['disk_tier'] + == 'standard' else _get_aws_query_command( + region, '$id', 'Iops', specs['disk_iops'])) + + ('' if specs['disk_tier'] != 'gp3' else _get_aws_query_command( + region, '$id', 'Throughput', specs['disk_throughput'])), ], f'sky down -y {name}', timeout=10 * 60, # 10 mins (it takes around ~6 mins) @@ -3344,8 +3345,8 @@ def test_gcp_disk_tier(): @pytest.mark.azure def test_azure_disk_tier(): for disk_tier in list(resources_utils.DiskTier): - if disk_tier == resources_utils.DiskTier.HIGH: - # Azure does not support high disk tier. + if disk_tier == resources_utils.DiskTier.HIGH or disk_tier == resources_utils.DiskTier.ULTRA: + # Azure does not support high and ultra disk tier. 
continue type = Azure._get_disk_type(disk_tier) name = _get_cluster_name() + '-' + disk_tier.value @@ -3436,6 +3437,43 @@ def test_gcp_zero_quota_failover(): run_one_test(test) +def test_long_setup_run_script(generic_cloud: str): + name = _get_cluster_name() + with tempfile.NamedTemporaryFile('w', prefix='sky_app_', + suffix='.yaml') as f: + f.write( + textwrap.dedent(""" \ + setup: | + echo "start long setup" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end long setup"\n') + f.write( + textwrap.dedent(""" \ + run: | + echo "run" + """)) + for i in range(1024 * 120): + f.write(f' echo {i}\n') + f.write(' echo "end run"\n') + f.flush() + + test = Test( + 'long-setup-run-script', + [ + f'sky launch -y -c {name} --cloud {generic_cloud} --detach-setup --detach-run --cpus 2+ {f.name}', + f'sky exec --detach-run {name} "echo hello"', + f'sky exec --detach-run {name} {f.name}', + f'sky logs {name} --status 1', + f'sky logs {name} --status 2', + f'sky logs {name} --status 3', + ], + f'sky down -y {name}', + ) + run_one_test(test) + + # ---------- Testing skyserve ---------- diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index 6fb9f1bcd14..70da0532e9b 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -125,7 +125,6 @@ def test_aws_make_deploy_variables(*mocks) -> None: 'disk_tier': 'gp3', 'disk_throughput': 218, 'disk_iops': 3500, - 'custom_disk_perf': True, 'docker_image': None, 'docker_container_name': 'sky_container', 'docker_login_config': None,