From 95b52c0c5a655e6b1ba2ef65426f4e2fc9fd46c2 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu <zhanghao.wu@outlook.com>
Date: Thu, 29 Aug 2024 11:11:21 -0700
Subject: [PATCH] [Docs] Add docs for run many jobs (#3847)

* Add docs for running N jobs

* Fix language

* Fix job queue

* Add link to managed jobs

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Add script for generating config files

* Fix comments

* fix title

* fix title

* fix

* reduce image size

* restructure

* rename

* adopt comments

* Add benefits

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* update

* rename

* fix

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Update docs/source/running-jobs/many-jobs.rst

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>

* Minor fix for comments

---------

Co-authored-by: Zongheng Yang <zongheng.y@gmail.com>
---
 docs/source/_static/custom.js                 |   2 +-
 docs/source/developers/index.rst              |   8 +
 docs/source/docs/index.rst                    |  12 +-
 docs/source/getting-started/quickstart.rst    |   2 +-
 docs/source/getting-started/tutorial.rst      |   4 +-
 docs/source/reference/job-queue.rst           |   2 +-
 docs/source/running-jobs/distributed-jobs.rst |   2 +-
 docs/source/running-jobs/many-jobs.rst        | 346 ++++++++++++++++++
 8 files changed, 363 insertions(+), 15 deletions(-)
 create mode 100644 docs/source/developers/index.rst
 create mode 100644 docs/source/running-jobs/many-jobs.rst

diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js
index e0de1b50d51..06b0cb2886a 100644
--- a/docs/source/_static/custom.js
+++ b/docs/source/_static/custom.js
@@ -27,8 +27,8 @@ document.addEventListener('DOMContentLoaded', () => {
     const newItems = [
         { selector: '.caption-text', text: 'SkyServe: Model Serving' },
         { selector: '.toctree-l1 > a', text: 'Managed Jobs' },
-        { selector: '.toctree-l1 > a', text: 'Running on Kubernetes' },
         { selector: '.toctree-l1 > a', text: 'Llama-3.1 (Meta)' },
+        { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' },
     ];
     newItems.forEach(({ selector, text }) => {
         document.querySelectorAll(selector).forEach((el) => {
diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst
new file mode 100644
index 00000000000..a6b76e8a53c
--- /dev/null
+++ b/docs/source/developers/index.rst
@@ -0,0 +1,8 @@
+Developer Guides
+=================
+
+.. toctree::
+   :maxdepth: 1
+
+   ../developers/CONTRIBUTING
+   Guide: Adding a New Cloud <https://docs.google.com/document/d/1oWox3qb3Kz3wXXSGg9ZJWwijoa99a3PIQUHBR8UgEGs/edit?usp=sharing>
diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst
index f8d329a1431..31eaf0c9106 100644
--- a/docs/source/docs/index.rst
+++ b/docs/source/docs/index.rst
@@ -129,8 +129,8 @@ Read the research:
 
    ../getting-started/installation
    ../getting-started/quickstart
-   ../getting-started/tutorial
    ../examples/interactive-development
+   ../getting-started/tutorial
 
 
 .. toctree::
@@ -143,6 +143,7 @@ Read the research:
    ../examples/auto-failover
    ../reference/kubernetes/index
    ../running-jobs/distributed-jobs
+   ../running-jobs/many-jobs
 
 .. toctree::
    :hidden:
@@ -184,14 +185,6 @@ Read the research:
    SkyPilot vs. Other Systems <../reference/comparison>
 
 
-.. toctree::
-   :hidden:
-   :maxdepth: 1
-   :caption: Developer Guides
-
-   ../developers/CONTRIBUTING
-   Guide: Adding a New Cloud <https://docs.google.com/document/d/1oWox3qb3Kz3wXXSGg9ZJWwijoa99a3PIQUHBR8UgEGs/edit?usp=sharing>
-
 .. toctree::
    :hidden:
    :maxdepth: 1
@@ -210,4 +203,5 @@ Read the research:
    ../reference/cli
    ../reference/api
    ../reference/config
+   ../developers/index
 
diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst
index bfc6fd17e05..cdef2335dd7 100644
--- a/docs/source/getting-started/quickstart.rst
+++ b/docs/source/getting-started/quickstart.rst
@@ -219,7 +219,7 @@ Congratulations!  In this quickstart, you have launched a cluster, run a task, a
 
 Next steps:
 
-- Adapt :ref:`Tutorial: DNN Training <dnn-training>` to start running your own project on SkyPilot!
+- Adapt :ref:`Tutorial: AI Training <ai-training>` to start running your own project on SkyPilot!
 - See the :ref:`Task YAML reference <yaml-spec>`, :ref:`CLI reference <cli>`, and `more examples <https://github.com/skypilot-org/skypilot/tree/master/examples>`_
 - To learn more, try out `SkyPilot Tutorials <https://github.com/skypilot-org/skypilot-tutorial>`_ in Jupyter notebooks
 
diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst
index 92ef6f68b2f..175f1391a6d 100644
--- a/docs/source/getting-started/tutorial.rst
+++ b/docs/source/getting-started/tutorial.rst
@@ -1,6 +1,6 @@
-.. _dnn-training:
+.. _ai-training:
 
-Tutorial: DNN Training
+Tutorial: AI Training
 ======================
 This example uses SkyPilot to train a Transformer-based language model from HuggingFace.
 
diff --git a/docs/source/reference/job-queue.rst b/docs/source/reference/job-queue.rst
index 6397c7bbbb6..c0016c4d6da 100644
--- a/docs/source/reference/job-queue.rst
+++ b/docs/source/reference/job-queue.rst
@@ -160,7 +160,7 @@ SkyPilot's scheduler serves two goals:
 2. **Minimizing resource idleness**: If a resource is idle, SkyPilot will schedule a
    queued job that can utilize that resource.
 
-We illustrate the scheduling behavior by revisiting :ref:`Tutorial: DNN Training <dnn-training>`.
+We illustrate the scheduling behavior by revisiting :ref:`Tutorial: AI Training <ai-training>`.
 In that tutorial, we have a task YAML that specifies these resource requirements:
 
 .. code-block:: yaml
diff --git a/docs/source/running-jobs/distributed-jobs.rst b/docs/source/running-jobs/distributed-jobs.rst
index 9eb590c10bc..da3ddd8e94f 100644
--- a/docs/source/running-jobs/distributed-jobs.rst
+++ b/docs/source/running-jobs/distributed-jobs.rst
@@ -1,6 +1,6 @@
 .. _dist-jobs:
 
-Distributed Jobs on Many Nodes
+Distributed Multi-Node Jobs
 ================================================
 
 SkyPilot supports multi-node cluster
diff --git a/docs/source/running-jobs/many-jobs.rst b/docs/source/running-jobs/many-jobs.rst
new file mode 100644
index 00000000000..3d26d74e794
--- /dev/null
+++ b/docs/source/running-jobs/many-jobs.rst
@@ -0,0 +1,346 @@
+
+.. _many-jobs:
+
+Many Parallel Jobs
+======================
+
+SkyPilot allows you to easily **run many jobs in parallel** and manage them in a single system. This is useful for hyperparameter tuning sweeps, data processing, and other batch jobs.
+
+This guide shows a typical workflow for running many jobs with SkyPilot.
+
+
+.. image:: https://i.imgur.com/tvxeNyR.png
+  :width: 90%
+  :align: center
+.. TODO: Show the components in a GIF.
+
+
+Why Use SkyPilot to Run Many Jobs
+-------------------------------------
+
+- **Unified**: Use one or more of your own infrastructure options (Kubernetes, cloud VMs, reservations, etc.).
+- **Elastic**: Scale up and down based on demands.
+- **Cost-effective**: Only pay for the cheapest resources.
+- **Robust**: Automatically recover jobs from failures.
+- **Observable**: Monitor and manage all jobs in a single pane of glass.
+
+Write a YAML for One Job
+-----------------------------------
+
+Before scaling up to many jobs, write a SkyPilot YAML for a single job first and ensure it runs correctly. This can save time by avoiding debugging many jobs at once.
+
+Here is the same example YAML as in :ref:`Tutorial: AI Training <ai-training>`:
+
+.. raw:: html
+
+    <details>
+    <summary>Click to expand: <code>train.yaml</code></summary>
+
+.. code-block:: yaml
+
+  # train.yaml
+  name: huggingface
+
+  resources:
+    accelerators: V100:4
+
+  setup: |
+    set -e  # Exit if any command failed.
+    git clone https://github.com/huggingface/transformers/ || true
+    cd transformers
+    pip install .
+    cd examples/pytorch/text-classification
+    pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+
+  run: |
+    set -e  # Exit if any command failed.
+    cd transformers/examples/pytorch/text-classification
+    python run_glue.py \
+      --model_name_or_path bert-base-cased \
+      --dataset_name imdb  \
+      --do_train \
+      --max_seq_length 128 \
+      --per_device_train_batch_size 32 \
+      --learning_rate 2e-5 \
+      --max_steps 50 \
+      --output_dir /tmp/imdb/ --overwrite_output_dir \
+      --fp16
+
+
+.. raw:: html
+
+    </details>
+
+
+First, launch the job to check that it launches and runs correctly:
+
+.. code-block:: bash
+
+  sky launch -c train train.yaml
+
+
+If there is any error, you can fix the code and/or the YAML, and launch the job again on the same cluster:
+
+.. code-block:: bash
+
+  # Cancel the latest job.
+  sky cancel train -y
+  # Run the job again on the same cluster.
+  sky launch -c train train.yaml
+
+
+Sometimes, it may be more efficient to log into the cluster and interactively debug the job. You can do so by directly :ref:`ssh'ing into the cluster or using VSCode's remote ssh <dev-connect>`.
+
+.. code-block:: bash
+
+  # Log into the cluster.
+  ssh train
+
+
+
+Next, after confirming the job is working correctly, **add (hyper)parameters** to the job YAML so that all job variants can be specified.
+
+1. Add Hyperparameters
+~~~~~~~~~~~~~~~~~~~~~~
+
+To launch jobs with different hyperparameters, add them as :ref:`environment variables <env-vars>` to the SkyPilot YAML, and make your main program read these environment variables:
+
+.. raw:: html
+
+    <details>
+    <summary>Updated SkyPilot YAML: <code>train-template.yaml</code></summary>
+
+.. code-block:: yaml
+  :emphasize-lines: 4-6,28-29
+
+  # train-template.yaml
+  name: huggingface
+
+  envs:
+    LR: 2e-5
+    MAX_STEPS: 50
+    
+  resources:
+    accelerators: V100:4
+
+  setup: |
+    set -e  # Exit if any command failed.
+    git clone https://github.com/huggingface/transformers/ || true
+    cd transformers
+    pip install .
+    cd examples/pytorch/text-classification
+    pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+
+  run: |
+    set -e  # Exit if any command failed.
+    cd transformers/examples/pytorch/text-classification
+    python run_glue.py \
+      --model_name_or_path bert-base-cased \
+      --dataset_name imdb  \
+      --do_train \
+      --max_seq_length 128 \
+      --per_device_train_batch_size 32 \
+      --learning_rate ${LR} \
+      --max_steps ${MAX_STEPS} \
+      --output_dir /tmp/imdb/ --overwrite_output_dir \
+      --fp16
+
+.. raw:: html
+    
+    </details>
+
+You can now use ``--env`` to launch a job with different hyperparameters:
+
+.. code-block:: bash
+
+  sky launch -c train train-template.yaml \
+    --env LR=1e-5 \
+    --env MAX_STEPS=100
+
+Alternatively, store the environment variable values in a dotenv file and use ``--env-file`` to launch:
+
+.. code-block:: bash
+
+  # configs/job1
+  LR=1e-5
+  MAX_STEPS=100
+
+.. code-block:: bash
+
+  sky launch -c train train-template.yaml \
+    --env-file configs/job1
+
+
+
+2. Logging Job Outputs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+When running many jobs, it is useful to log the outputs of all jobs. You can use tools like `W&B <https://wandb.ai>`__ for this purpose:
+
+.. raw:: html
+
+    <details>
+    <summary>SkyPilot YAML with W&B: <code>train-template.yaml</code></summary>
+
+.. code-block:: yaml
+  :emphasize-lines: 7-7,19-19,34-34
+
+  # train-template.yaml
+  name: huggingface
+
+  envs:
+    LR: 2e-5
+    MAX_STEPS: 50
+    WANDB_API_KEY: # Empty field means this field is required when launching the job.
+      
+  resources:
+    accelerators: V100:4
+
+  setup: |
+    set -e  # Exit if any command failed.
+    git clone https://github.com/huggingface/transformers/ || true
+    cd transformers
+    pip install .
+    cd examples/pytorch/text-classification
+    pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
+    pip install wandb
+
+  run: |
+    set -e  # Exit if any command failed.
+    cd transformers/examples/pytorch/text-classification
+    python run_glue.py \
+      --model_name_or_path bert-base-cased \
+      --dataset_name imdb  \
+      --do_train \
+      --max_seq_length 128 \
+      --per_device_train_batch_size 32 \
+      --learning_rate ${LR} \
+      --max_steps ${MAX_STEPS} \
+      --output_dir /tmp/imdb/ --overwrite_output_dir \
+      --fp16 \
+      --report_to wandb
+
+.. raw:: html
+
+    </details>
+
+You can now launch the job with the following command (``WANDB_API_KEY`` should exist in your local environment variables):
+
+.. code-block:: bash
+
+  sky launch -c train train-template.yaml \
+    --env-file configs/job1 \
+    --env WANDB_API_KEY
+
+
+
+Scale Out to Many Jobs
+-----------------------
+
+With the above setup, you can now scale out to run many jobs in parallel. You
+can either use the SkyPilot CLI with many config files or use the SkyPilot Python API.
+
+With CLI and Config Files
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can run many jobs in parallel by (1) creating multiple config files and (2)
+submitting them as :ref:`SkyPilot managed jobs <managed-jobs>`.
+
+First, create a config file for each job (for example, in a ``configs`` directory):
+
+.. code-block:: bash
+
+  # configs/job-1
+  LR=1e-5
+  MAX_STEPS=100
+
+  # configs/job-2
+  LR=2e-5
+  MAX_STEPS=200
+
+  ...
+
+.. raw:: html
+
+  <details>
+  <summary>An example Python script to generate config files</summary>
+
+.. code-block:: python
+
+  import os
+
+  CONFIG_PATH = 'configs'
+  LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0]
+  MAX_STEPS_CANDIDATES = [100, 300, 1000]
+
+  os.makedirs(CONFIG_PATH, exist_ok=True)
+
+  job_idx = 1
+  for lr in LR_CANDIDATES:
+    for max_steps in MAX_STEPS_CANDIDATES:
+      config_file = f'{CONFIG_PATH}/job-{job_idx}'
+      with open(config_file, 'w') as f:
+        print(f'LR={lr}', file=f)
+        print(f'MAX_STEPS={max_steps}', file=f)
+      job_idx += 1
+
+.. raw:: html
+
+  </details>
+
+Then, submit all jobs by iterating over the config files and calling ``sky jobs launch`` on each:
+
+.. code-block:: bash
+
+  for config_file in configs/*; do
+    job_name=$(basename $config_file)
+    # -y: yes to all prompts.
+    # -d: detach from the job's logging, so the next job can be submitted
+    #      without waiting for the previous job to finish.
+    sky jobs launch -n train-$job_name -y -d train-template.yaml \
+      --env-file $config_file \
+      --env WANDB_API_KEY
+  done
+
+
+Job statuses can be checked via ``sky jobs queue``:
+
+.. code-block:: console
+
+  $ sky jobs queue
+
+  Fetching managed job statuses...
+  Managed jobs
+  In progress tasks: 10 RUNNING
+  ID  TASK  NAME        RESOURCES  SUBMITTED    TOT. DURATION  JOB DURATION  #RECOVERIES  STATUS   
+  10  -     train-job10 1x[V100:4] 5 mins ago   5m 5s          1m 12s        0            RUNNING
+  9   -     train-job9  1x[V100:4] 6 mins ago   6m 11s         2m 23s        0            RUNNING
+  8   -     train-job8  1x[V100:4] 7 mins ago   7m 15s         3m 31s        0            RUNNING
+  ...
+
+
+With Python API
+~~~~~~~~~~~~~~~
+
+To have more customized control over the generation of job variants, you can also use the SkyPilot Python API to launch the jobs.
+
+.. code-block:: python
+
+  import os
+  import sky
+
+  LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0]
+  MAX_STEPS_CANDIDATES = [100, 300, 1000]
+  task = sky.Task.from_yaml('train-template.yaml')
+
+  job_idx = 1
+  for lr in LR_CANDIDATES:
+    for max_steps in MAX_STEPS_CANDIDATES:
+      task.update_envs({'LR': lr, 'MAX_STEPS': max_steps})
+      sky.jobs.launch(
+        task,
+        name=f'train-job{job_idx}',
+        detach_run=True,
+        retry_until_up=True,
+      )
+      job_idx += 1