diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index e0de1b50d51..06b0cb2886a 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -27,8 +27,8 @@ document.addEventListener('DOMContentLoaded', () => { const newItems = [ { selector: '.caption-text', text: 'SkyServe: Model Serving' }, { selector: '.toctree-l1 > a', text: 'Managed Jobs' }, - { selector: '.toctree-l1 > a', text: 'Running on Kubernetes' }, { selector: '.toctree-l1 > a', text: 'Llama-3.1 (Meta)' }, + { selector: '.toctree-l1 > a', text: 'Many Parallel Jobs' }, ]; newItems.forEach(({ selector, text }) => { document.querySelectorAll(selector).forEach((el) => { diff --git a/docs/source/developers/index.rst b/docs/source/developers/index.rst new file mode 100644 index 00000000000..a6b76e8a53c --- /dev/null +++ b/docs/source/developers/index.rst @@ -0,0 +1,8 @@ +Developer Guides +================= + +.. toctree:: + :maxdepth: 1 + + ../developers/CONTRIBUTING + Guide: Adding a New Cloud diff --git a/docs/source/docs/index.rst b/docs/source/docs/index.rst index f8d329a1431..31eaf0c9106 100644 --- a/docs/source/docs/index.rst +++ b/docs/source/docs/index.rst @@ -129,8 +129,8 @@ Read the research: ../getting-started/installation ../getting-started/quickstart - ../getting-started/tutorial ../examples/interactive-development + ../getting-started/tutorial .. toctree:: @@ -143,6 +143,7 @@ Read the research: ../examples/auto-failover ../reference/kubernetes/index ../running-jobs/distributed-jobs + ../running-jobs/many-jobs .. toctree:: :hidden: @@ -184,14 +185,6 @@ Read the research: SkyPilot vs. Other Systems <../reference/comparison> -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Developer Guides - - ../developers/CONTRIBUTING - Guide: Adding a New Cloud - .. 
toctree:: :hidden: :maxdepth: 1 @@ -210,4 +203,5 @@ Read the research: ../reference/cli ../reference/api ../reference/config + ../developers/index diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index bfc6fd17e05..cdef2335dd7 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -219,7 +219,7 @@ Congratulations! In this quickstart, you have launched a cluster, run a task, a Next steps: -- Adapt :ref:`Tutorial: DNN Training <dnn-training>` to start running your own project on SkyPilot! +- Adapt :ref:`Tutorial: AI Training <ai-training>` to start running your own project on SkyPilot! - See the :ref:`Task YAML reference `, :ref:`CLI reference `, and `more examples `_ - To learn more, try out `SkyPilot Tutorials `_ in Jupyter notebooks diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index 92ef6f68b2f..175f1391a6d 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -1,6 +1,6 @@ -.. _dnn-training: +.. _ai-training: -Tutorial: DNN Training +Tutorial: AI Training ====================== This example uses SkyPilot to train a Transformer-based language model from HuggingFace. diff --git a/docs/source/reference/job-queue.rst b/docs/source/reference/job-queue.rst index 6397c7bbbb6..c0016c4d6da 100644 --- a/docs/source/reference/job-queue.rst +++ b/docs/source/reference/job-queue.rst @@ -160,7 +160,7 @@ SkyPilot's scheduler serves two goals: 2. **Minimizing resource idleness**: If a resource is idle, SkyPilot will schedule a queued job that can utilize that resource. -We illustrate the scheduling behavior by revisiting :ref:`Tutorial: DNN Training <dnn-training>`. +We illustrate the scheduling behavior by revisiting :ref:`Tutorial: AI Training <ai-training>`. In that tutorial, we have a task YAML that specifies these resource requirements: .. 
code-block:: yaml diff --git a/docs/source/running-jobs/distributed-jobs.rst b/docs/source/running-jobs/distributed-jobs.rst index 9eb590c10bc..da3ddd8e94f 100644 --- a/docs/source/running-jobs/distributed-jobs.rst +++ b/docs/source/running-jobs/distributed-jobs.rst @@ -1,6 +1,6 @@ .. _dist-jobs: -Distributed Jobs on Many Nodes +Distributed Multi-Node Jobs ================================================ SkyPilot supports multi-node cluster diff --git a/docs/source/running-jobs/many-jobs.rst b/docs/source/running-jobs/many-jobs.rst new file mode 100644 index 00000000000..3d26d74e794 --- /dev/null +++ b/docs/source/running-jobs/many-jobs.rst @@ -0,0 +1,346 @@ + +.. _many-jobs: + +Many Parallel Jobs +====================== + +SkyPilot allows you to easily **run many jobs in parallel** and manage them in a single system. This is useful for hyperparameter tuning sweeps, data processing, and other batch jobs. + +This guide shows a typical workflow for running many jobs with SkyPilot. + + +.. image:: https://i.imgur.com/tvxeNyR.png + :width: 90% + :align: center +.. TODO: Show the components in a GIF. + + +Why Use SkyPilot to Run Many Jobs +------------------------------------- + +- **Unified**: Use one or more of your own infrastructure options (Kubernetes, cloud VMs, reservations, etc.). +- **Elastic**: Scale up and down based on demand. +- **Cost-effective**: Only pay for the cheapest resources. +- **Robust**: Automatically recover jobs from failures. +- **Observable**: Monitor and manage all jobs in a single pane of glass. + +Write a YAML for One Job +----------------------------------- + +Before scaling up to many jobs, write a SkyPilot YAML for a single job first and ensure it runs correctly. This can save time by avoiding debugging many jobs at once. + +Here is the same example YAML as in :ref:`Tutorial: AI Training <ai-training>`: + +.. raw:: html + +
+ Click to expand: train.yaml + +.. code-block:: yaml + + # train.yaml + name: huggingface + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --max_steps 50 \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 + + +.. raw:: html + +
+ + +First, launch the job to check it successfully launches and runs correctly: + +.. code-block:: bash + + sky launch -c train train.yaml + + +If there is any error, you can fix the code and/or the YAML, and launch the job again on the same cluster: + +.. code-block:: bash + + # Cancel the latest job. + sky cancel train -y + # Run the job again on the same cluster. + sky launch -c train train.yaml + + +Sometimes, it may be more efficient to log into the cluster and interactively debug the job. You can do so by directly :ref:`ssh'ing into the cluster or using VSCode's remote ssh `. + +.. code-block:: bash + + # Log into the cluster. + ssh train + + + +Next, after confirming the job is working correctly, **add (hyper)parameters** to the job YAML so that all job variants can be specified. + +1. Add Hyperparameters +~~~~~~~~~~~~~~~~~~~~~~ + +To launch jobs with different hyperparameters, add them as :ref:`environment variables ` to the SkyPilot YAML, and make your main program read these environment variables: + +.. raw:: html + +
+ Updated SkyPilot YAML: train-template.yaml + +.. code-block:: yaml + :emphasize-lines: 4-6,28-29 + + # train-template.yaml + name: huggingface + + envs: + LR: 2e-5 + MAX_STEPS: 50 + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate ${LR} \ + --max_steps ${MAX_STEPS} \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 + +.. raw:: html + +
+ +You can now use ``--env`` to launch a job with different hyperparameters: + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env LR=1e-5 \ + --env MAX_STEPS=100 + +Alternatively, store the environment variable values in a dotenv file and use ``--env-file`` to launch: + +.. code-block:: bash + + # configs/job1 + LR=1e-5 + MAX_STEPS=100 + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env-file configs/job1 + + + +2. Logging Job Outputs +~~~~~~~~~~~~~~~~~~~~~~~ + +When running many jobs, it is useful to log the outputs of all jobs. You can use tools like `W&B `__ for this purpose: + +.. raw:: html + +
+ SkyPilot YAML with W&B: train-template.yaml + +.. code-block:: yaml + :emphasize-lines: 7-7,19-19,34-34 + + # train-template.yaml + name: huggingface + + envs: + LR: 2e-5 + MAX_STEPS: 50 + WANDB_API_KEY: # Empty field means this field is required when launching the job. + + resources: + accelerators: V100:4 + + setup: | + set -e # Exit if any command failed. + git clone https://github.com/huggingface/transformers/ || true + cd transformers + pip install . + cd examples/pytorch/text-classification + pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 + pip install wandb + + run: | + set -e # Exit if any command failed. + cd transformers/examples/pytorch/text-classification + python run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate ${LR} \ + --max_steps ${MAX_STEPS} \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 \ + --report_to wandb + +.. raw:: html + +
+ +You can now launch the job with the following command (``WANDB_API_KEY`` should exist in your local environment variables). + +.. code-block:: bash + + sky launch -c train train-template.yaml \ + --env-file configs/job1 \ + --env WANDB_API_KEY + + + +Scale Out to Many Jobs +----------------------- + +With the above setup, you can now scale out to run many jobs in parallel. You +can either use the SkyPilot CLI with many config files or use the SkyPilot Python API. + +With CLI and Config Files +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can run many jobs in parallel by (1) creating multiple config files and (2) +submitting them as :ref:`SkyPilot managed jobs `. + +First, create a config file for each job (for example, in a ``configs`` directory): + +.. code-block:: bash + + # configs/job-1 + LR=1e-5 + MAX_STEPS=100 + + # configs/job-2 + LR=2e-5 + MAX_STEPS=200 + + ... + +.. raw:: html + +
+ An example Python script to generate config files + +.. code-block:: python + + import os + + CONFIG_PATH = 'configs' + LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0] + MAX_STEPS_CANDIDATES = [100, 300, 1000] + + os.makedirs(CONFIG_PATH, exist_ok=True) + + job_idx = 1 + for lr in LR_CANDIDATES: + for max_steps in MAX_STEPS_CANDIDATES: + config_file = f'{CONFIG_PATH}/job-{job_idx}' + with open(config_file, 'w') as f: + print(f'LR={lr}', file=f) + print(f'MAX_STEPS={max_steps}', file=f) + job_idx += 1 + +.. raw:: html + +
+ +Then, submit all jobs by iterating over the config files and calling ``sky jobs launch`` on each: + +.. code-block:: bash + + for config_file in configs/*; do + job_name=$(basename $config_file) + # -y: yes to all prompts. + # -d: detach from the job's logging, so the next job can be submitted + # without waiting for the previous job to finish. + sky jobs launch -n train-$job_name -y -d train-template.yaml \ + --env-file $config_file \ + --env WANDB_API_KEY + done + + +Job statuses can be checked via ``sky jobs queue``: + +.. code-block:: console + + $ sky jobs queue + + Fetching managed job statuses... + Managed jobs + In progress tasks: 10 RUNNING + ID TASK NAME RESOURCES SUBMITTED TOT. DURATION JOB DURATION #RECOVERIES STATUS + 10 - train-job10 1x[V100:4] 5 mins ago 5m 5s 1m 12s 0 RUNNING + 9 - train-job9 1x[V100:4] 6 mins ago 6m 11s 2m 23s 0 RUNNING + 8 - train-job8 1x[V100:4] 7 mins ago 7m 15s 3m 31s 0 RUNNING + ... + + +With Python API +~~~~~~~~~~~~~~~ + +To have more customized control over generation of job variants, you can also use SkyPilot Python API to launch the jobs. + +.. code-block:: python + + import os + import sky + + LR_CANDIDATES = [0.01, 0.03, 0.1, 0.3, 1.0] + MAX_STEPS_CANDIDATES = [100, 300, 1000] + task = sky.Task.from_yaml('train-template.yaml') + + job_idx = 1 + for lr in LR_CANDIDATES: + for max_steps in MAX_STEPS_CANDIDATES: + task.update_envs({'LR': lr, 'MAX_STEPS': max_steps}) + sky.jobs.launch( + task, + name=f'train-job{job_idx}', + detach_run=True, + retry_until_up=True, + ) + job_idx += 1