diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 2b0f1cec788..99f29ee258a 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -5,7 +5,7 @@ tests/smoke_tests ├── test_*.py -> release pipeline -├── test_pre_merge.py -> pre-merge pipeline +├── test_quick_tests_core.py -> run quick tests on PR before merging run `PYTHONPATH=$(pwd)/tests:$PYTHONPATH python .buildkite/generate_pipeline.py` to generate the pipeline for testing. The CI will run this script as a pre-step, @@ -208,8 +208,8 @@ def _convert_release(test_files: List[str]): extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP}) -def _convert_pre_merge(test_files: List[str]): - yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' +def _convert_quick_tests_core(test_files: List[str]): + yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml' output_file_pipelines = [] for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') @@ -234,18 +234,18 @@ def _convert_pre_merge(test_files: List[str]): def main(): test_files = os.listdir('tests/smoke_tests') release_files = [] - pre_merge_files = [] + quick_tests_core_files = [] for test_file in test_files: if not test_file.startswith('test_'): continue test_file_path = os.path.join('tests/smoke_tests', test_file) - if "test_pre_merge" in test_file: - pre_merge_files.append(test_file_path) + if "test_quick_tests_core" in test_file: + quick_tests_core_files.append(test_file_path) else: release_files.append(test_file_path) _convert_release(release_files) - _convert_pre_merge(pre_merge_files) + _convert_quick_tests_core(quick_tests_core_files) if __name__ == '__main__': diff --git a/README.md b/README.md index f29b57be9ca..1ed99325df5 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@

- + Documentation @@ -43,7 +43,7 @@

Archived - [Jul 2024] [**Finetune**](./llm/llama-3_1-finetuning/) and [**serve**](./llm/llama-3_1/) **Llama 3.1** on your infra -- [Apr 2024] Serve and finetune [**Llama 3**](https://skypilot.readthedocs.io/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/) +- [Apr 2024] Serve and finetune [**Llama 3**](https://docs.skypilot.co/en/latest/gallery/llms/llama-3.html) on any cloud or Kubernetes: [**example**](./llm/llama-3/) - [Mar 2024] Serve and deploy [**Databricks DBRX**](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) on your infra: [**example**](./llm/dbrx/) - [Feb 2024] Speed up your LLM deployments with [**SGLang**](https://github.com/sgl-project/sglang) for 5x throughput on SkyServe: [**example**](./llm/sglang/) - [Dec 2023] Using [**LoRAX**](https://github.com/predibase/lorax) to serve 1000s of finetuned LLMs on a single instance in the cloud: [**example**](./llm/lorax/) @@ -60,17 +60,17 @@ SkyPilot is a framework for running AI and batch workloads on any infra, offering unified execution, high cost savings, and high GPU availability. 
SkyPilot **abstracts away infra burdens**: -- Launch [dev clusters](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html), [jobs](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html), and [serving](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) on any infra +- Launch [dev clusters](https://docs.skypilot.co/en/latest/examples/interactive-development.html), [jobs](https://docs.skypilot.co/en/latest/examples/managed-jobs.html), and [serving](https://docs.skypilot.co/en/latest/serving/sky-serve.html) on any infra - Easy job management: queue, run, and auto-recover many jobs SkyPilot **supports multiple clusters, clouds, and hardware** ([the Sky](https://arxiv.org/abs/2205.07147)): - Bring your reserved GPUs, Kubernetes clusters, or 12+ clouds -- [Flexible provisioning](https://skypilot.readthedocs.io/en/latest/examples/auto-failover.html) of GPUs, TPUs, CPUs, with auto-retry +- [Flexible provisioning](https://docs.skypilot.co/en/latest/examples/auto-failover.html) of GPUs, TPUs, CPUs, with auto-retry SkyPilot **cuts your cloud costs & maximizes GPU availability**: -* [Autostop](https://skypilot.readthedocs.io/en/latest/reference/auto-stop.html): automatic cleanup of idle resources -* [Managed Spot](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html): 3-6x cost savings using spot instances, with preemption auto-recovery -* [Optimizer](https://skypilot.readthedocs.io/en/latest/examples/auto-failover.html): 2x cost savings by auto-picking the cheapest & most available infra +* [Autostop](https://docs.skypilot.co/en/latest/reference/auto-stop.html): automatic cleanup of idle resources +* [Managed Spot](https://docs.skypilot.co/en/latest/examples/managed-jobs.html): 3-6x cost savings using spot instances, with preemption auto-recovery +* [Optimizer](https://docs.skypilot.co/en/latest/examples/auto-failover.html): 2x cost savings by auto-picking the cheapest & most available infra SkyPilot 
supports your existing GPU, TPU, and CPU workloads, with no code changes. @@ -79,13 +79,13 @@ Install with pip: # Choose your clouds: pip install -U "skypilot[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]" ``` -To get the latest features and fixes, use the nightly build or [install from source](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html): +To get the latest features and fixes, use the nightly build or [install from source](https://docs.skypilot.co/en/latest/getting-started/installation.html): ```bash # Choose your clouds: pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidstack,paperspace,cudo,ibm,scp]" ``` -[Current supported infra](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) (Kubernetes; AWS, GCP, Azure, OCI, Lambda Cloud, Fluidstack, RunPod, Cudo, Paperspace, Cloudflare, Samsung, IBM, VMware vSphere): +[Current supported infra](https://docs.skypilot.co/en/latest/getting-started/installation.html) (Kubernetes; AWS, GCP, Azure, OCI, Lambda Cloud, Fluidstack, RunPod, Cudo, Paperspace, Cloudflare, Samsung, IBM, VMware vSphere):

@@ -95,16 +95,16 @@ pip install "skypilot-nightly[kubernetes,aws,gcp,azure,oci,lambda,runpod,fluidst ## Getting Started -You can find our documentation [here](https://skypilot.readthedocs.io/en/latest/). -- [Installation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) -- [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html) -- [CLI reference](https://skypilot.readthedocs.io/en/latest/reference/cli.html) +You can find our documentation [here](https://docs.skypilot.co/). +- [Installation](https://docs.skypilot.co/en/latest/getting-started/installation.html) +- [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html) +- [CLI reference](https://docs.skypilot.co/en/latest/reference/cli.html) ## SkyPilot in 1 Minute A SkyPilot task specifies: resource requirements, data to be synced, setup commands, and the task commands. -Once written in this [**unified interface**](https://skypilot.readthedocs.io/en/latest/reference/yaml-spec.html) (YAML or Python API), the task can be launched on any available cloud. This avoids vendor lock-in, and allows easily moving jobs to a different provider. +Once written in this [**unified interface**](https://docs.skypilot.co/en/latest/reference/yaml-spec.html) (YAML or Python API), the task can be launched on any available cloud. This avoids vendor lock-in, and allows easily moving jobs to a different provider. 
Paste the following into a file `my_task.yaml`: @@ -135,7 +135,7 @@ Prepare the workdir by cloning: git clone https://github.com/pytorch/examples.git ~/torch_examples ``` -Launch with `sky launch` (note: [access to GPU instances](https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html) is needed for this example): +Launch with `sky launch` (note: [access to GPU instances](https://docs.skypilot.co/en/latest/cloud-setup/quota.html) is needed for this example): ```bash sky launch my_task.yaml ``` @@ -152,10 +152,10 @@ SkyPilot then performs the heavy-lifting for you, including:

-Refer to [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html) to get started with SkyPilot. +Refer to [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html) to get started with SkyPilot. ## More Information -To learn more, see [Concept: Sky Computing](https://docs.skypilot.co/en/latest/sky-computing.html), [SkyPilot docs](https://skypilot.readthedocs.io/en/latest/), and [SkyPilot blog](https://blog.skypilot.co/). +To learn more, see [Concept: Sky Computing](https://docs.skypilot.co/en/latest/sky-computing.html), [SkyPilot docs](https://docs.skypilot.co/en/latest/), and [SkyPilot blog](https://blog.skypilot.co/). Runnable examples: diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 7627218e451..9bc7052771f 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -10,6 +10,7 @@ myst-parser==2.0.0 sphinx-autodoc-typehints==1.25.2 sphinx-book-theme==1.1.0 sphinx-togglebutton==0.3.2 +sphinx-notfound-page==1.0.4 sphinxcontrib-applehelp==1.0.7 sphinxcontrib-devhelp==1.0.5 sphinxcontrib-googleanalytics==0.4 diff --git a/docs/source/conf.py b/docs/source/conf.py index a8ce3270e88..3c0b62c9947 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -41,6 +41,7 @@ 'sphinxemoji.sphinxemoji', 'sphinx_design', 'myst_parser', + 'notfound.extension', ] intersphinx_mapping = { diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst index 61c33b5c43e..99fa461249d 100644 --- a/docs/source/examples/managed-jobs.rst +++ b/docs/source/examples/managed-jobs.rst @@ -499,7 +499,7 @@ To achieve the above, you can specify custom configs in :code:`~/.sky/config.yam # Specify the disk_size in GB of the jobs controller. disk_size: 100 -The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. +The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. .. 
note:: These settings will not take effect if you have an existing controller (either diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 286788625bd..d5ee4d2134a 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -22,7 +22,7 @@ Available fields and semantics: # # These take effects only when a managed jobs controller does not already exist. # - # Ref: https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#customizing-job-controller-resources + # Ref: https://docs.skypilot.co/en/latest/examples/managed-jobs.html#customizing-job-controller-resources jobs: controller: resources: # same spec as 'resources' in a task YAML @@ -478,13 +478,13 @@ Available fields and semantics: # This must be either: 'loadbalancer', 'ingress' or 'podip'. # # loadbalancer: Creates services of type `LoadBalancer` to expose ports. - # See https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#loadbalancer-service. + # See https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html#loadbalancer-service. # This mode is supported out of the box on most cloud managed Kubernetes # environments (e.g., GKE, EKS). # # ingress: Creates an ingress and a ClusterIP service for each port opened. # Requires an Nginx ingress controller to be configured on the Kubernetes cluster. - # Refer to https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#nginx-ingress + # Refer to https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html#nginx-ingress # for details on deploying the NGINX ingress controller. # # podip: Directly returns the IP address of the pod. This mode does not @@ -513,7 +513,7 @@ Available fields and semantics: # # : The name of a service account to use for all Kubernetes pods. # This service account must exist in the user's namespace and have all - # necessary permissions. 
Refer to https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/kubernetes.html + # necessary permissions. Refer to https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/kubernetes.html # for details on the roles required by the service account. # # Using SERVICE_ACCOUNT or a custom service account only affects Kubernetes @@ -581,7 +581,7 @@ Available fields and semantics: # gke: uses cloud.google.com/gke-accelerator label to identify GPUs on nodes # karpenter: uses karpenter.k8s.aws/instance-gpu-name label to identify GPUs on nodes # generic: uses skypilot.co/accelerator labels to identify GPUs on nodes - # Refer to https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#setting-up-gpu-support + # Refer to https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html#setting-up-gpu-support # for more details on setting up labels for GPU support. # # Default: null (no autoscaler, autodetect label format for GPU nodes) diff --git a/docs/source/reference/kubernetes/index.rst b/docs/source/reference/kubernetes/index.rst index 89a57862c88..639b5b633ed 100644 --- a/docs/source/reference/kubernetes/index.rst +++ b/docs/source/reference/kubernetes/index.rst @@ -39,7 +39,7 @@ Why use SkyPilot on Kubernetes? .. grid-item-card:: 🖼 Run popular models on Kubernetes :text-align: center - Train and serve `Llama-3 `_, `Mixtral `_, and more on your Kubernetes with ready-to-use recipes from the :ref:`AI gallery `. + Train and serve `Llama-3 `_, `Mixtral `_, and more on your Kubernetes with ready-to-use recipes from the :ref:`AI gallery `. .. tab-item:: For Infrastructure Admins diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 455ee5909c9..0be708305c8 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -23,7 +23,7 @@ Available fields: # which `sky` is called. 
# # To exclude files from syncing, see - # https://skypilot.readthedocs.io/en/latest/examples/syncing-code-artifacts.html#exclude-uploading-files + # https://docs.skypilot.co/en/latest/examples/syncing-code-artifacts.html#exclude-uploading-files workdir: ~/my-task-code # Number of nodes (optional; defaults to 1) to launch including the head node. @@ -357,7 +357,7 @@ In additional to the above fields, SkyPilot also supports the following experime # # The following fields can be overridden. Please refer to docs of Advanced # Configuration for more details of those fields: - # https://skypilot.readthedocs.io/en/latest/reference/config.html + # https://docs.skypilot.co/en/latest/reference/config.html config_overrides: docker: run_options: ... diff --git a/docs/source/reservations/existing-machines.rst b/docs/source/reservations/existing-machines.rst index 10962ecd639..717043bfd25 100644 --- a/docs/source/reservations/existing-machines.rst +++ b/docs/source/reservations/existing-machines.rst @@ -42,7 +42,7 @@ Prerequisites **Local machine (typically your laptop):** * `kubectl `_ -* `SkyPilot `_ +* `SkyPilot `_ **Remote machines (your cluster, optionally with GPUs):** diff --git a/docs/source/serving/sky-serve.rst b/docs/source/serving/sky-serve.rst index c00fa427bd6..5a1a913b7ea 100644 --- a/docs/source/serving/sky-serve.rst +++ b/docs/source/serving/sky-serve.rst @@ -515,7 +515,7 @@ To achieve the above, you can specify custom configs in :code:`~/.sky/config.yam # Specify the disk_size in GB of the SkyServe controller. disk_size: 1024 -The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. +The :code:`resources` field has the same spec as a normal SkyPilot job; see `here `__. .. 
note:: These settings will not take effect if you have an existing controller (either diff --git a/examples/airflow/shared_state/README.md b/examples/airflow/shared_state/README.md index 5f39471351a..917a45862a7 100644 --- a/examples/airflow/shared_state/README.md +++ b/examples/airflow/shared_state/README.md @@ -12,7 +12,7 @@ In this guide, we demonstrate how some simple SkyPilot operations, such as launc * Airflow installed on a [Kubernetes cluster](https://airflow.apache.org/docs/helm-chart/stable/index.html) or [locally](https://airflow.apache.org/docs/apache-airflow/stable/start.html) (`SequentialExecutor`) * A Kubernetes cluster to run tasks on. We'll use GKE in this example. - * You can use our guide on [setting up a Kubernetes cluster](https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html). + * You can use our guide on [setting up a Kubernetes cluster](https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html). * A persistent volume storage class should be available that supports at least `ReadWriteOnce` access mode. GKE has this supported by default. ## Preparing the Kubernetes Cluster @@ -39,7 +39,7 @@ In this guide, we demonstrate how some simple SkyPilot operations, such as launc name: sky-airflow-sa namespace: default roleRef: - # For minimal permissions, refer to https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/kubernetes.html + # For minimal permissions, refer to https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/kubernetes.html kind: ClusterRole name: cluster-admin apiGroup: rbac.authorization.k8s.io @@ -163,7 +163,7 @@ with DAG(dag_id='sky_k8s_example', ## Tips 1. **Persistent Volume**: If you have many concurrent tasks, you may want to use a storage class that supports [`ReadWriteMany`](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) access mode. -2. 
**Cloud credentials**: If you wish to run tasks on different clouds, you can configure cloud credentials in Kubernetes secrets and mount them in the Sky pod defined in the DAG. See [SkyPilot docs on setting up cloud credentials](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup) for more on how to configure credentials in the pod. +2. **Cloud credentials**: If you wish to run tasks on different clouds, you can configure cloud credentials in Kubernetes secrets and mount them in the Sky pod defined in the DAG. See [SkyPilot docs on setting up cloud credentials](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloud-account-setup) for more on how to configure credentials in the pod. 3. **Logging**: All SkyPilot logs are written to container stdout, which is captured as task logs in Airflow and displayed in the UI. You can also write logs to a file and read them in subsequent tasks. 4. **XComs for shared state**: Airflow also provides [XComs](https://airflow.apache.org/docs/apache-airflow/stable/concepts/xcoms.html) for cross-task communication. [`sky_k8s_example_xcoms.py`](sky_k8s_example_xcoms.py) demonstrates how to use XComs to share state between tasks. diff --git a/examples/airflow/training_workflow/README.md b/examples/airflow/training_workflow/README.md index dad08d8d3b0..71cb10bef50 100644 --- a/examples/airflow/training_workflow/README.md +++ b/examples/airflow/training_workflow/README.md @@ -7,7 +7,7 @@ In this guide, we show how a training workflow involving data preprocessing, tra

-**💡 Tip:** SkyPilot also supports defining and running pipelines without Airflow. Check out [Jobs Pipelines](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html#job-pipelines) for more information. +**💡 Tip:** SkyPilot also supports defining and running pipelines without Airflow. Check out [Jobs Pipelines](https://docs.skypilot.co/en/latest/examples/managed-jobs.html#job-pipelines) for more information. ## Why use SkyPilot with Airflow? In AI workflows, **the transition from development to production is hard**. @@ -24,7 +24,7 @@ production Airflow cluster. Behind the scenes, SkyPilot handles environment setu Here's how you can use SkyPilot to take your dev workflows to production in Airflow: 1. **Define and test your workflow as SkyPilot tasks**. - - Use `sky launch` and [Sky VSCode integration](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html#dev-vscode) to run, debug and iterate on your code. + - Use `sky launch` and [Sky VSCode integration](https://docs.skypilot.co/en/latest/examples/interactive-development.html#dev-vscode) to run, debug and iterate on your code. 2. **Orchestrate SkyPilot tasks in Airflow** by invoking `sky launch` on their YAMLs as a task in the Airflow DAG. - Airflow does the scheduling, logging, and monitoring, while SkyPilot handles the infra setup and task execution. @@ -34,7 +34,7 @@ Here's how you can use SkyPilot to take your dev workflows to production in Airf * Airflow installed on a [Kubernetes cluster](https://airflow.apache.org/docs/helm-chart/stable/index.html) or [locally](https://airflow.apache.org/docs/apache-airflow/stable/start.html) (`SequentialExecutor`) * A Kubernetes cluster to run tasks on. We'll use GKE in this example. * A Google cloud account with GCS access to store the data for task. - * Follow [SkyPilot instructions](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp) to set up Google Cloud credentials. 
+ * Follow [SkyPilot instructions](https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp) to set up Google Cloud credentials. ## Preparing the Kubernetes Cluster @@ -60,7 +60,7 @@ Here's how you can use SkyPilot to take your dev workflows to production in Airf name: sky-airflow-sa namespace: default roleRef: - # For minimal permissions, refer to https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/kubernetes.html + # For minimal permissions, refer to https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/kubernetes.html kind: ClusterRole name: cluster-admin apiGroup: rbac.authorization.k8s.io @@ -103,7 +103,7 @@ The train and eval step can be run in a similar way: sky launch -c train --env DATA_BUCKET_URL=gs:// train.yaml ``` -Hint: You can use `ssh` and VSCode to [interactively develop](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html) and debug the tasks. +Hint: You can use `ssh` and VSCode to [interactively develop](https://docs.skypilot.co/en/latest/examples/interactive-development.html) and debug the tasks. Note: `eval` can be optionally run on the same cluster as `train` with `sky exec`. Refer to the `shared_state` airflow example on how to do this. diff --git a/examples/cog/README.md b/examples/cog/README.md index b2193e2e18f..97d886e2d2c 100644 --- a/examples/cog/README.md +++ b/examples/cog/README.md @@ -17,7 +17,7 @@ curl http://$IP:5000/predictions -X POST \ ``` ## Scale up the deployment using SkyServe -We can use SkyServe (`sky serve`) to scale up the deployment to multiple instances, while enjoying load balancing, autoscaling, and other [SkyServe features](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +We can use SkyServe (`sky serve`) to scale up the deployment to multiple instances, while enjoying load balancing, autoscaling, and other [SkyServe features](https://docs.skypilot.co/en/latest/serving/sky-serve.html). 
```console sky serve up -n cog ./sky.yaml ``` diff --git a/examples/distributed-pytorch/README.md b/examples/distributed-pytorch/README.md new file mode 100644 index 00000000000..6c2f7092269 --- /dev/null +++ b/examples/distributed-pytorch/README.md @@ -0,0 +1,81 @@ +# Distributed Training with PyTorch + +This example demonstrates how to run distributed training with PyTorch using SkyPilot. + +**The example is based on [PyTorch's official minGPT example](https://github.com/pytorch/examples/tree/main/distributed/minGPT-ddp)** + + +## Overview + +There are two ways to run distributed training with PyTorch: + +1. Using normal `torchrun` +2. Using `rdzv` backend + +The main difference between the two for fixed-size distributed training is that `rdzv` backend automatically handles the rank for each node, while `torchrun` requires the rank to be set manually. + +SkyPilot offers convenient built-in environment variables to help you start distributed training easily. + +### Using normal `torchrun` + + +The following command will spawn 2 nodes with 1 L4 GPU each: +``` +sky launch -c train train.yaml +``` + +In [train.yaml](./train.yaml), we use `torchrun` to launch the training and set the arguments for distributed training using [environment variables](https://docs.skypilot.co/en/latest/running-jobs/environment-variables.html#skypilot-environment-variables) provided by SkyPilot. 
+ +```yaml +run: | + cd examples/mingpt + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --master_addr=$MASTER_ADDR \ + --master_port=8008 \ + --node_rank=${SKYPILOT_NODE_RANK} \ + main.py +``` + + + +### Using `rdzv` backend + +`rdzv` is an alternative backend for distributed training: + +``` +sky launch -c train-rdzv train-rdzv.yaml +``` + +In [train-rdzv.yaml](./train-rdzv.yaml), we use `torchrun` to launch the training and set the arguments for distributed training using [environment variables](https://docs.skypilot.co/en/latest/running-jobs/environment-variables.html#skypilot-environment-variables) provided by SkyPilot. + +```yaml +run: | + cd examples/mingpt + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + echo "Starting distributed training, head node: $MASTER_ADDR" + + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$MASTER_ADDR:29500 \ + --rdzv_id $SKYPILOT_TASK_ID \ + main.py +``` + + +## Scale up + +If you would like to scale up the training, you can simply change the resource requirements, and SkyPilot's built-in environment variables will be set automatically. + +For example, the following command will spawn 4 nodes with 4 L4 GPUs each. + +``` +sky launch -c train train.yaml --num-nodes 4 --gpus L4:4 --cpus 8+ +``` + +We increase the `--cpus` to 8+ as well to prevent the CPU from becoming a performance bottleneck. 
+ diff --git a/examples/distributed-pytorch/train-rdzv.yaml b/examples/distributed-pytorch/train-rdzv.yaml new file mode 100644 index 00000000000..3bcd63dde4c --- /dev/null +++ b/examples/distributed-pytorch/train-rdzv.yaml @@ -0,0 +1,29 @@ +name: minGPT-ddp-rdzv + +resources: + cpus: 4+ + accelerators: L4 + +num_nodes: 2 + +setup: | + git clone --depth 1 https://github.com/pytorch/examples || true + cd examples + git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp + # SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5). + uv pip install -r requirements.txt "numpy<2" "torch==1.12.1+cu113" --extra-index-url https://download.pytorch.org/whl/cu113 + +run: | + cd examples/mingpt + export LOGLEVEL=INFO + + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + echo "Starting distributed training, head node: $MASTER_ADDR" + + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --rdzv_backend=c10d \ + --rdzv_endpoint=$MASTER_ADDR:29500 \ + --rdzv_id $SKYPILOT_TASK_ID \ + main.py diff --git a/examples/distributed-pytorch/train.yaml b/examples/distributed-pytorch/train.yaml new file mode 100644 index 00000000000..b45941e1485 --- /dev/null +++ b/examples/distributed-pytorch/train.yaml @@ -0,0 +1,29 @@ +name: minGPT-ddp + +resources: + cpus: 4+ + accelerators: L4 + +num_nodes: 2 + +setup: | + git clone --depth 1 https://github.com/pytorch/examples || true + cd examples + git filter-branch --prune-empty --subdirectory-filter distributed/minGPT-ddp + # SkyPilot's default image on AWS/GCP has CUDA 11.6 (Azure 11.5). 
+ uv pip install -r requirements.txt "numpy<2" "torch==1.12.1+cu113" --extra-index-url https://download.pytorch.org/whl/cu113 + +run: | + cd examples/mingpt + export LOGLEVEL=INFO + + MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1) + echo "Starting distributed training, head node: $MASTER_ADDR" + + torchrun \ + --nnodes=$SKYPILOT_NUM_NODES \ + --nproc_per_node=$SKYPILOT_NUM_GPUS_PER_NODE \ + --master_addr=$MASTER_ADDR \ + --master_port=8008 \ + --node_rank=${SKYPILOT_NODE_RANK} \ + main.py diff --git a/examples/k8s_cloud_deploy/README.md b/examples/k8s_cloud_deploy/README.md index 5ba42cbe836..9b0d46249d4 100644 --- a/examples/k8s_cloud_deploy/README.md +++ b/examples/k8s_cloud_deploy/README.md @@ -56,11 +56,11 @@ NODE_NAME GPU_NAME TOTAL_GPUS FREE_GPUS ## Run AI workloads on your Kubernetes cluster with SkyPilot ### Development clusters -To launch a [GPU enabled development cluster](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html), run `sky launch -c mycluster --cloud kubernetes --gpus A10:1`. +To launch a [GPU enabled development cluster](https://docs.skypilot.co/en/latest/examples/interactive-development.html), run `sky launch -c mycluster --cloud kubernetes --gpus A10:1`. SkyPilot will setup SSH config for you. 
-* [SSH access](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html#ssh): `ssh mycluster` -* [VSCode remote development](https://skypilot.readthedocs.io/en/latest/examples/interactive-development.html#vscode): `code --remote ssh-remote+mycluster "/"` +* [SSH access](https://docs.skypilot.co/en/latest/examples/interactive-development.html#ssh): `ssh mycluster` +* [VSCode remote development](https://docs.skypilot.co/en/latest/examples/interactive-development.html#vscode): `code --remote ssh-remote+mycluster "/"` ### Jobs @@ -87,7 +87,7 @@ sky-cmd-1-2ea4-head 1/1 Running 0 8m36s sky-jobs-controller-2ea485ea-2ea4-head 1/1 Running 0 10m ``` -Refer to [SkyPilot docs](https://skypilot.readthedocs.io/) for more. +Refer to [SkyPilot docs](https://docs.skypilot.co/) for more. ## Teardown To teardown the Kubernetes cluster, run: diff --git a/examples/stable_diffusion/README.md b/examples/stable_diffusion/README.md index 2a4383f1347..56af44df91e 100644 --- a/examples/stable_diffusion/README.md +++ b/examples/stable_diffusion/README.md @@ -1,6 +1,6 @@ ## Setup -1. Install skypilot package by following these [instructions](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +1. Install skypilot package by following these [instructions](https://docs.skypilot.co/en/latest/getting-started/installation.html). 2. Run `git clone https://github.com/skypilot-org/skypilot.git && cd examples/stable_diffusion` diff --git a/examples/stable_diffusion/pushing_docker_image.md b/examples/stable_diffusion/pushing_docker_image.md index 80b285fa832..0585d566543 100644 --- a/examples/stable_diffusion/pushing_docker_image.md +++ b/examples/stable_diffusion/pushing_docker_image.md @@ -1,6 +1,6 @@ ## GCR -1. Install skypilot package by following these [instructions](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +1. 
Install skypilot package by following these [instructions](https://docs.skypilot.co/en/latest/getting-started/installation.html). 2. Run `git clone https://github.com/skypilot-org/skypilot.git `. diff --git a/llm/codellama/README.md b/llm/codellama/README.md index f145fd062ff..54019bd6d2a 100644 --- a/llm/codellama/README.md +++ b/llm/codellama/README.md @@ -38,7 +38,7 @@ The followings are the demos of Code Llama 70B hosted by SkyPilot Serve (aka Sky ## Running your own Code Llama with SkyPilot -After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Code Llama on vLLM with SkyPilot in 1-click: +After [installing SkyPilot](https://docs.skypilot.co/en/latest/getting-started/installation.html), run your own Code Llama on vLLM with SkyPilot in 1-click: 1. Start serving Code Llama 70B on a single instance with any available GPU in the list specified in [endpoint.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/codellama/endpoint.yaml) with a vLLM powered OpenAI-compatible endpoint: ```console @@ -100,7 +100,7 @@ This returns the following completion: ## Scale up the service with SkyServe -1. With [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Code Llama service is as simple as running: +1. 
With [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Code Llama service is as simple as running: ```bash sky serve up -n code-llama ./endpoint.yaml ``` diff --git a/llm/dbrx/README.md b/llm/dbrx/README.md index 3011af9d4e6..2845634b287 100644 --- a/llm/dbrx/README.md +++ b/llm/dbrx/README.md @@ -11,7 +11,7 @@ In this recipe, you will serve `databricks/dbrx-instruct` on your own infra -- ## Prerequisites - Go to the [HuggingFace model page](https://huggingface.co/databricks/dbrx-instruct) and request access to the model `databricks/dbrx-instruct`. -- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. ## SkyPilot YAML @@ -278,6 +278,6 @@ To shut down all resources: sky serve down dbrx ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). diff --git a/llm/gemma/README.md b/llm/gemma/README.md index ef5027b2807..7296f7c7e31 100644 --- a/llm/gemma/README.md +++ b/llm/gemma/README.md @@ -24,7 +24,7 @@ Generate a read-only access token on huggingface [here](https://huggingface.co/s ```bash pip install "skypilot-nightly[all]" ``` -For detailed installation instructions, please refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +For detailed installation instructions, please refer to the [installation guide](https://docs.skypilot.co/en/latest/getting-started/installation.html). 
### Host on a Single Instance diff --git a/llm/gpt-2/README.md b/llm/gpt-2/README.md index 10fa2cf6998..b8e656e2353 100644 --- a/llm/gpt-2/README.md +++ b/llm/gpt-2/README.md @@ -13,7 +13,7 @@ pip install "skypilot-nightly[aws,gcp,azure,kubernetes,lambda,fluidstack]" # Cho ```bash sky check ``` -Please check the instructions for enabling clouds at [SkyPilot doc](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +Please check the instructions for enabling clouds at [SkyPilot doc](https://docs.skypilot.co/en/latest/getting-started/installation.html). 3. Download the YAML for starting the training: ```bash diff --git a/llm/llama-3/README.md b/llm/llama-3/README.md index 8ffcb3087a9..c4cf9066f63 100644 --- a/llm/llama-3/README.md +++ b/llm/llama-3/README.md @@ -29,7 +29,7 @@ ## Prerequisites - Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) and request access to the model `meta-llama/Meta-Llama-3-70B-Instruct`. -- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. ## SkyPilot YAML @@ -326,7 +326,7 @@ To shut down all resources: sky serve down llama3 ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). ### **Optional**: Connect a GUI to your Llama-3 endpoint @@ -349,4 +349,4 @@ sky launch -c llama3-gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint ## Finetuning Llama-3 -You can finetune Llama-3 on your own data. We have an tutorial for finetunning Llama-2 for Vicuna on SkyPilot, which can be adapted for Llama-3. 
You can find the tutorial [here](https://skypilot.readthedocs.io/en/latest/gallery/tutorials/finetuning.html) and a detailed blog post [here](https://blog.skypilot.co/finetuning-llama2-operational-guide/). +You can finetune Llama-3 on your own data. We have a tutorial for finetuning Llama-2 for Vicuna on SkyPilot, which can be adapted for Llama-3. You can find the tutorial [here](https://docs.skypilot.co/en/latest/gallery/tutorials/finetuning.html) and a detailed blog post [here](https://blog.skypilot.co/finetuning-llama2-operational-guide/). diff --git a/llm/llama-3_1-finetuning/readme.md b/llm/llama-3_1-finetuning/readme.md index 935dccde84e..ddc2b9e2463 100644 --- a/llm/llama-3_1-finetuning/readme.md +++ b/llm/llama-3_1-finetuning/readme.md @@ -7,10 +7,10 @@ On July 23, 2024, Meta released the [Llama 3.1 model family](https://ai.meta.com/blog/meta-llama-3-1/), including a 405B parameter model in both base model and instruction-tuned forms. Llama 3.1 405B became _the first open LLM that closely rivals top proprietary models_ like GPT-4o and Claude 3.5 Sonnet. -This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot) and [torchtune](https://pytorch.org/torchtune/stable/index.html) to **finetune Llama 3.1 on your own data and infra**. 
Everything is packaged in a simple [SkyPilot YAML](https://docs.skypilot.co/en/latest/getting-started/quickstart.html), that can be launched with one command on your infra: - Local GPU workstation - Kubernetes cluster -- Cloud accounts ([12 clouds supported](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)) +- Cloud accounts ([12 clouds supported](https://docs.skypilot.co/en/latest/getting-started/installation.html))
@@ -233,7 +233,7 @@ export HF_TOKEN="xxxx" ```bash pip install skypilot-nightly[aws,gcp,kubernetes] # or other clouds (12 clouds + kubernetes supported) you have setup -# See: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html +# See: https://docs.skypilot.co/en/latest/getting-started/installation.html ``` 5. Check your infra setup: @@ -262,6 +262,6 @@ sky check ## What's next * [AI on Kubernetes Without the Pain](https://blog.skypilot.co/ai-on-kubernetes/) -* [SkyPilot AI Gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html) -* [SkyPilot Docs](https://skypilot.readthedocs.io/en/latest/docs/index.html) +* [SkyPilot AI Gallery](https://docs.skypilot.co/en/latest/gallery/index.html) +* [SkyPilot Docs](https://docs.skypilot.co) * [SkyPilot GitHub](https://github.com/skypilot-org/skypilot) diff --git a/llm/llama-3_1/README.md b/llm/llama-3_1/README.md index 6cfeb8dc5f9..2634811d8a1 100644 --- a/llm/llama-3_1/README.md +++ b/llm/llama-3_1/README.md @@ -13,7 +13,7 @@ This guide walks through how to serve Llama 3.1 models **completely on your infr - Local GPU workstation - Kubernetes cluster -- Cloud accounts ([12 clouds supported](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)) +- Cloud accounts ([12 clouds supported](https://docs.skypilot.co/en/latest/getting-started/installation.html)) SkyPilot will be used as the unified framework to launch serving on any (or multiple) infra that you bring. @@ -64,7 +64,7 @@ sky check kubernetes sky check ``` -See [docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for details. +See [docs](https://docs.skypilot.co/en/latest/getting-started/installation.html) for details. 
### Step 1: Get a GPU dev node (pod or VM) @@ -155,7 +155,7 @@ Now that we verified the model is working, let's package it for hands-free deplo Whichever infra you use for GPUs, SkyPilot abstracts away the mundane infra tasks (e.g., setting up services on K8s, opening up ports for cloud VMs), making AI models super easy to deploy via one command. -[Deploying via SkyPilot](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) has several key benefits: +[Deploying via SkyPilot](https://docs.skypilot.co/en/latest/serving/sky-serve.html) has several key benefits: - Control node & replicas completely stay in your infra - Automatic load-balancing across multiple replicas - Automatic recovery of replicas @@ -296,7 +296,7 @@ curl -L http://$ENDPOINT/v1/chat/completions \ 🎉 **Congratulations!** You are now serving a Llama 3.1 8B model across two replicas. To recap, all model replicas **stay in your own private infrastructure** and SkyPilot ensures they are **healthy and available**. -Details on autoscaling, rolling updates, and more in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +Details on autoscaling, rolling updates, and more in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). When you are done, shut down all resources: diff --git a/llm/llama-3_2/README.md b/llm/llama-3_2/README.md index 987dc0d90c5..f6c2a54ce6a 100644 --- a/llm/llama-3_2/README.md +++ b/llm/llama-3_2/README.md @@ -26,7 +26,7 @@ ## Prerequisites - Go to the [HuggingFace model page](https://huggingface.co/meta-llama/) and request access to the model [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) and [meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision). -- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). 
+- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)). - Check that `sky check` shows clouds or Kubernetes are enabled. ## SkyPilot YAML @@ -346,7 +346,7 @@ To shut down all resources: sky serve down llama3 ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). ## Developing and Finetuning Llama 3 series diff --git a/llm/llama-chatbots/README.md b/llm/llama-chatbots/README.md index 418d3d39d15..272cc24d288 100644 --- a/llm/llama-chatbots/README.md +++ b/llm/llama-chatbots/README.md @@ -17,12 +17,12 @@ It will automatically perform the following: [**LLaMA**](https://github.com/facebookresearch/llama) is a set of Large Language Models (LLMs) recently released by Meta. Trained on more than 1 trillion tokens from public datasets, LLaMA achieves high quality and is space-efficient. You can [fill out a form to request access from Meta](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform) to download the open model weights. In the steps below we assume either (1) you have an unexpired download URL, or (2) the weights have been downloaded and stored on the local machine. -[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. 
Currently, [Lambda Labs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#lambda-cloud) (low-cost GPU cloud), [AWS](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#aws), [GCP](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#gcp), and [Azure](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#azure) are supported. See [docs](https://skypilot.readthedocs.io/en/latest/) to learn more. +[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. Currently, [Lambda Labs](https://docs.skypilot.co/en/latest/getting-started/installation.html#lambda-cloud) (low-cost GPU cloud), [AWS](https://docs.skypilot.co/en/latest/getting-started/installation.html#aws), [GCP](https://docs.skypilot.co/en/latest/getting-started/installation.html#gcp), and [Azure](https://docs.skypilot.co/en/latest/getting-started/installation.html#azure) are supported. See [docs](https://docs.skypilot.co/en/latest/) to learn more. ## Steps All YAML files used below live in [the SkyPilot repo](https://github.com/skypilot-org/skypilot/tree/master/llm/llama-chatbots), and the chatbot code is [here](https://github.com/skypilot-org/sky-llama). -0. Install SkyPilot and [check that cloud credentials exist](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup): +0. 
Install SkyPilot and [check that cloud credentials exist](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloud-account-setup): ```bash pip install "skypilot[aws,gcp,azure,lambda]" # pick your clouds sky check @@ -120,7 +120,7 @@ sky launch llama-30b.yaml -c llama-30b -s --env LLAMA_URL=$LLAMA_URL sky launch llama-65b.yaml -c llama-65b -s --env LLAMA_URL=$LLAMA_URL ``` -To see details about these flags, see [CLI docs](https://skypilot.readthedocs.io/en/latest/reference/cli.html) or run `sky launch -h`. +To see details about these flags, see [CLI docs](https://docs.skypilot.co/en/latest/reference/cli.html) or run `sky launch -h`. ## Cleaning up When you are done, you can stop or tear down the cluster: @@ -140,7 +140,7 @@ When you are done, you can stop or tear down the cluster: ``` **To see your clusters**, run `sky status`, which is a single pane of glass for all your clusters across regions/clouds. -To learn more about various SkyPilot commands, see [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html). +To learn more about various SkyPilot commands, see [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html). ## Why SkyPilot? @@ -166,12 +166,12 @@ SkyPilot's `sky launch` command makes this entirely automatic. It performs *auto - low-cost GPU cloud (Lambda; >3x cheaper than AWS/Azure/GCP) - spot instances (>3x cheaper than on-demand) - automatically choosing the cheapest cloud/region/zone -- auto-stopping & auto-termination of instances ([docs](https://skypilot.readthedocs.io/en/latest/reference/auto-stop.html)) +- auto-stopping & auto-termination of instances ([docs](https://docs.skypilot.co/en/latest/reference/auto-stop.html)) ## Recap Congratulations! You have used SkyPilot to launch a LLaMA-based chatbot on the cloud with just one command. The system automatically handles setting up instances and it offers cloud portability, higher GPU availability, and cost reduction. 
-LLaMA chatbots are just one example app. To leverage these benefits for your own ML projects on the cloud, we recommend the [Quickstart guide](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html). +LLaMA chatbots are just one example app. To leverage these benefits for your own ML projects on the cloud, we recommend the [Quickstart guide](https://docs.skypilot.co/en/latest/getting-started/quickstart.html). *Feedback or questions? Want to run other LLM models?* Feel free to drop a note to the SkyPilot team on [GitHub](https://github.com/skypilot-org/skypilot/) or [Slack](http://slack.skypilot.co/) and we're happy to chat! diff --git a/llm/localgpt/README.md b/llm/localgpt/README.md index 17b3332ee30..c52f1b08851 100644 --- a/llm/localgpt/README.md +++ b/llm/localgpt/README.md @@ -13,7 +13,7 @@ Install SkyPilot and check your setup of cloud credentials: pip install git+https://github.com/skypilot-org/skypilot.git sky check ``` -See [docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html) for more. +See [docs](https://docs.skypilot.co/en/latest/getting-started/installation.html) for more. Once you are done, we will use [SkyPilot YAML for localGPT](https://github.com/skypilot-org/skypilot/tree/master/llm/localgpt/localgpt.yaml) to define our task and run it. diff --git a/llm/lorax/README.md b/llm/lorax/README.md index edd153d45f1..b1d5def6e78 100644 --- a/llm/lorax/README.md +++ b/llm/lorax/README.md @@ -40,7 +40,7 @@ sky launch -c lorax-cluster lorax.yaml By default, this config will deploy `Mistral-7B-Instruct`, but this can be overridden by running `sky launch` with the argument `--env MODEL_ID=`. -**NOTE:** This config will launch the instance on a public IP. It's highly recommended to secure the instance within a private subnet. 
See the [Advanced Configurations](https://skypilot.readthedocs.io/en/latest/reference/config.html#config-yaml) section of the SkyPilot docs for options to run within VPC and setup private IPs. +**NOTE:** This config will launch the instance on a public IP. It's highly recommended to secure the instance within a private subnet. See the [Advanced Configurations](https://docs.skypilot.co/en/latest/reference/config.html#config-yaml) section of the SkyPilot docs for options to run within VPC and setup private IPs. ## Prompt LoRAX w/ base model diff --git a/llm/mixtral/README.md b/llm/mixtral/README.md index 0bddb77c665..8456dbb5fcf 100644 --- a/llm/mixtral/README.md +++ b/llm/mixtral/README.md @@ -15,7 +15,7 @@ SkyPilot can help you serve Mixtral by automatically finding available resources sky launch -c mixtral ./serve.yaml ``` -Note that we specify the following resources, so that SkyPilot will automatically find any of the available GPUs specified by automatically [failover](https://skypilot.readthedocs.io/en/latest/examples/auto-failover.html) through all the candidates (in the order of the prices): +Note that we specify the following resources, so that SkyPilot will automatically find any of the available GPUs specified by automatically [failover](https://docs.skypilot.co/en/latest/examples/auto-failover.html) through all the candidates (in the order of the prices): ```yaml resources: @@ -82,7 +82,7 @@ curl http://$IP:8000/v1/chat/completions \ ## 2. Serve with multiple instances -When scaling up is required, [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) is the library built on top of SkyPilot, which can help you scale up the serving with multiple instances, while still providing a single endpoint. 
To serve Mixtral with multiple instances, run the following command: +When scaling up is required, [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) is the library built on top of SkyPilot, which can help you scale up the serving with multiple instances, while still providing a single endpoint. To serve Mixtral with multiple instances, run the following command: ```bash sky serve up -n mixtral ./serve.yaml diff --git a/llm/ollama/README.md b/llm/ollama/README.md index 16a8a9ea8e4..2d15b598381 100644 --- a/llm/ollama/README.md +++ b/llm/ollama/README.md @@ -17,7 +17,7 @@ To get started, install the latest version of SkyPilot: pip install "skypilot-nightly[all]" ``` -For detailed installation instructions, please refer to the [installation guide](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +For detailed installation instructions, please refer to the [installation guide](https://docs.skypilot.co/en/latest/getting-started/installation.html). Once installed, run `sky check` to verify you have cloud access. @@ -296,4 +296,4 @@ To shut down all resources: sky serve down ollama ``` -See more details in [SkyServe docs](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html). +See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html). diff --git a/llm/pixtral/README.md b/llm/pixtral/README.md index fccde1de7ad..987769c892a 100644 --- a/llm/pixtral/README.md +++ b/llm/pixtral/README.md @@ -57,7 +57,7 @@ This guide shows how to use run and deploy this multimodal model on your own clo pip install 'skypilot[all]' sky check ``` -Detailed instructions for installation and cloud setup [here](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html). +Detailed instructions for installation and cloud setup [here](https://docs.skypilot.co/en/latest/getting-started/installation.html). 2. 
Launch the model on any cloud or Kubernetes: ```bash @@ -150,7 +150,7 @@ These descriptions should give you a clear picture of the scenes depicted in the ## Scale Up Pixtral Endpoint as a Service -1. Start a service with [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html): +1. Start a service with [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html): ```bash sky serve up -n pixtral pixtral.yaml ``` diff --git a/llm/qwen/README.md b/llm/qwen/README.md index 6846fc71f2f..d4c73edb842 100644 --- a/llm/qwen/README.md +++ b/llm/qwen/README.md @@ -27,7 +27,7 @@ As of Jun 2024, Qwen1.5-110B-Chat is ranked higher than GPT-4-0613 on the [LMSYS ## Running your own Qwen with SkyPilot -After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Qwen model on vLLM with SkyPilot in 1-click: +After [installing SkyPilot](https://docs.skypilot.co/en/latest/getting-started/installation.html), run your own Qwen model on vLLM with SkyPilot in 1-click: 1. Start serving Qwen 110B on a single instance with any available GPU in the list specified in [qwen15-110b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/qwen15-110b.yaml) with a vLLM powered OpenAI-compatible endpoint (You can also switch to [qwen25-72b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/qwen25-72b.yaml) or [qwen25-7b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/qwen/qwen25-7b.yaml) for a smaller model): @@ -98,7 +98,7 @@ curl http://$ENDPOINT/v1/chat/completions \ ## Scale up the service with SkyServe -1. With [SkyPilot Serving](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Qwen service is as simple as running: +1. 
With [SkyPilot Serving](https://docs.skypilot.co/en/latest/serving/sky-serve.html), a serving library built on top of SkyPilot, scaling up the Qwen service is as simple as running: ```bash sky serve up -n qwen ./qwen25-72b.yaml ``` diff --git a/llm/sglang/README.md b/llm/sglang/README.md index 7d41b8fc168..f6bac3c71ad 100644 --- a/llm/sglang/README.md +++ b/llm/sglang/README.md @@ -21,7 +21,7 @@ sky check ``` ## Serving vision-language model LLaVA with SGLang for more traffic using SkyServe -1. Create a [`SkyServe Service YAML`](https://skypilot.readthedocs.io/en/latest/serving/service-yaml-spec.html) with a `service` section: +1. Create a [`SkyServe Service YAML`](https://docs.skypilot.co/en/latest/serving/service-yaml-spec.html) with a `service` section: ```yaml service: @@ -33,7 +33,7 @@ service: The entire Service YAML can be found here: [llava.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang/llava.yaml). -2. Start serving by using [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) CLI: +2. Start serving by using [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) CLI: ```bash sky serve up -n sglang-llava llava.yaml ``` @@ -117,7 +117,7 @@ You should get a similar response as the following: ## Serving Llama-2 with SGLang for more traffic using SkyServe 1. The process is the same as serving LLaVA, but with the model path changed to Llama-2. Below are example commands for reference. -2. Start serving by using [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) CLI: +2. 
Start serving by using [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) CLI: ```bash sky serve up -n sglang-llama2 llama2.yaml --env HF_TOKEN= ``` diff --git a/llm/tabby/README.md b/llm/tabby/README.md index 569b64538c1..9aa4ca4c803 100644 --- a/llm/tabby/README.md +++ b/llm/tabby/README.md @@ -17,13 +17,13 @@ This post shows how to use SkyPilot to host an ai coding assistant with just one - OpenAPI interface, easy to integrate with existing infrastructure (e.g Cloud IDE). - Supports consumer-grade GPUs. -[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. Currently, [AWS](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#aws), [GCP](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#gcp), [Azure](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#azure), [Lambda Cloud](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#lambda-cloud), [IBM](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#ibm), [Oracle Cloud Infrastructure (OCI)](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci), [Cloudflare R2](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloudflare-r2) and [Samsung Cloud Platform (SCP)](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#samsung-cloud-platform-scp) are supported. See [docs](https://skypilot.readthedocs.io/en/latest/) to learn more. +[**SkyPilot**](https://github.com/skypilot-org/skypilot) is an open-source framework from UC Berkeley for seamlessly running machine learning on any cloud. 
With a simple CLI, users can easily launch many clusters and jobs, while substantially lowering their cloud bills. Currently, [AWS](https://docs.skypilot.co/en/latest/getting-started/installation.html#aws), [GCP](https://docs.skypilot.co/en/latest/getting-started/installation.html#gcp), [Azure](https://docs.skypilot.co/en/latest/getting-started/installation.html#azure), [Lambda Cloud](https://docs.skypilot.co/en/latest/getting-started/installation.html#lambda-cloud), [IBM](https://docs.skypilot.co/en/latest/getting-started/installation.html#ibm), [Oracle Cloud Infrastructure (OCI)](https://docs.skypilot.co/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci), [Cloudflare R2](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloudflare-r2) and [Samsung Cloud Platform (SCP)](https://docs.skypilot.co/en/latest/getting-started/installation.html#samsung-cloud-platform-scp) are supported. See [docs](https://docs.skypilot.co/en/latest/) to learn more. ## Steps All YAML files used below live in [the SkyPilot repo](https://github.com/skypilot-org/skypilot/tree/master/llm/tabby). -1. Install SkyPilot and [check that cloud credentials exist](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloud-account-setup): +1. Install SkyPilot and [check that cloud credentials exist](https://docs.skypilot.co/en/latest/getting-started/installation.html#cloud-account-setup): ```bash # pip install skypilot @@ -94,4 +94,4 @@ When you are done, you can stop or tear down the cluster: ``` **To see your clusters**, run `sky status`, which is a single pane of glass for all your clusters across regions/clouds. -To learn more about various SkyPilot commands, see [Quickstart](https://skypilot.readthedocs.io/en/latest/getting-started/quickstart.html). +To learn more about various SkyPilot commands, see [Quickstart](https://docs.skypilot.co/en/latest/getting-started/quickstart.html). 
diff --git a/llm/vicuna-llama-2/README.md b/llm/vicuna-llama-2/README.md index e392b231e64..31d78a243cb 100644 --- a/llm/vicuna-llama-2/README.md +++ b/llm/vicuna-llama-2/README.md @@ -120,7 +120,7 @@ sky launch --no-use-spot ... ### Reducing costs by 3x with spot instances -[SkyPilot Managed Jobs](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html) is a library built on top of SkyPilot that helps users run jobs on spot instances without worrying about interruptions. That is the tool used by the LMSYS organization to train the first version of Vicuna (more details can be found in their [launch blog post](https://lmsys.org/blog/2023-03-30-vicuna/) and [example](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna)). With this, the training cost can be reduced from $1000 to **\$300**. +[SkyPilot Managed Jobs](https://docs.skypilot.co/en/latest/examples/managed-jobs.html) is a library built on top of SkyPilot that helps users run jobs on spot instances without worrying about interruptions. That is the tool used by the LMSYS organization to train the first version of Vicuna (more details can be found in their [launch blog post](https://lmsys.org/blog/2023-03-30-vicuna/) and [example](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna)). With this, the training cost can be reduced from $1000 to **\$300**. To use SkyPilot Managed Spot Jobs, you can simply replace `sky launch` with `sky jobs launch` in the above command: diff --git a/llm/vicuna/README.md b/llm/vicuna/README.md index 6d9f46127d4..b8c6ab100d8 100644 --- a/llm/vicuna/README.md +++ b/llm/vicuna/README.md @@ -4,7 +4,7 @@ Vicuna LLM

-This README contains instructions to run and train Vicuna, an open-source LLM chatbot with quality comparable to ChatGPT. The Vicuna release was trained using SkyPilot on [cloud spot instances](https://skypilot.readthedocs.io/en/latest/examples/spot-jobs.html), with a cost of ~$300. +This README contains instructions to run and train Vicuna, an open-source LLM chatbot with quality comparable to ChatGPT. The Vicuna release was trained using SkyPilot on [cloud spot instances](https://docs.skypilot.co/en/latest/examples/spot-jobs.html), with a cost of ~$300. * [Blog post](https://lmsys.org/blog/2023-03-30-vicuna/) * [Demo](https://chat.lmsys.org/) diff --git a/llm/vllm/README.md b/llm/vllm/README.md index 78617f3746d..c150ae46e2d 100644 --- a/llm/vllm/README.md +++ b/llm/vllm/README.md @@ -112,7 +112,7 @@ curl http://$IP:8000/v1/chat/completions \ ## Serving Llama-2 with vLLM for more traffic using SkyServe To scale up the model serving for more traffic, we introduced SkyServe to enable a user to easily deploy multiple replica of the model: -1. Adding an `service` section in the above `serve-openai-api.yaml` file to make it an [`SkyServe Service YAML`](https://skypilot.readthedocs.io/en/latest/serving/service-yaml-spec.html): +1. Adding an `service` section in the above `serve-openai-api.yaml` file to make it an [`SkyServe Service YAML`](https://docs.skypilot.co/en/latest/serving/service-yaml-spec.html): ```yaml # The newly-added `service` section to the `serve-openai-api.yaml` file. @@ -125,7 +125,7 @@ service: The entire Service YAML can be found here: [service.yaml](https://github.com/skypilot-org/skypilot/tree/master/llm/vllm/service.yaml). -2. Start serving by using [SkyServe](https://skypilot.readthedocs.io/en/latest/serving/sky-serve.html) CLI: +2. 
Start serving by using [SkyServe](https://docs.skypilot.co/en/latest/serving/sky-serve.html) CLI: ```bash sky serve up -n vllm-llama2 service.yaml ``` diff --git a/llm/yi/README.md b/llm/yi/README.md index 1353320aa9f..b9d5c4a761d 100644 --- a/llm/yi/README.md +++ b/llm/yi/README.md @@ -19,7 +19,7 @@ ## Running Yi model with SkyPilot -After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), run your own Yi model on vLLM with SkyPilot in 1-click: +After [installing SkyPilot](https://docs.skypilot.co/en/latest/getting-started/installation.html), run your own Yi model on vLLM with SkyPilot in 1-click: 1. Start serving Yi-1.5 34B on a single instance with any available GPU in the list specified in [yi15-34b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/yi/yi15-34b.yaml) with a vLLM powered OpenAI-compatible endpoint (You can also switch to [yicoder-9b.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/yi/yicoder-9b.yaml) or [other model](https://github.com/skypilot-org/skypilot/tree/master/llm/yi) for a smaller model): diff --git a/sky/adaptors/cloudflare.py b/sky/adaptors/cloudflare.py index 864248614f3..e9c5613c97e 100644 --- a/sky/adaptors/cloudflare.py +++ b/sky/adaptors/cloudflare.py @@ -177,7 +177,7 @@ def check_credentials() -> Tuple[bool, Optional[str]]: hints += f'\n{_INDENT_PREFIX} $ mkdir -p ~/.cloudflare' hints += f'\n{_INDENT_PREFIX} $ echo > ~/.cloudflare/accountid' # pylint: disable=line-too-long hints += f'\n{_INDENT_PREFIX}For more info: ' - hints += 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#cloudflare-r2' # pylint: disable=line-too-long + hints += 'https://docs.skypilot.co/en/latest/getting-started/installation.html#cloudflare-r2' # pylint: disable=line-too-long return (False, hints) if hints else (True, hints) diff --git a/sky/adaptors/oci.py b/sky/adaptors/oci.py index 7a5fafa854a..8fe09479a38 100644 --- a/sky/adaptors/oci.py +++ 
b/sky/adaptors/oci.py @@ -1,9 +1,16 @@ """Oracle OCI cloud adaptor""" +import logging import os from sky.adaptors import common +# Suppress OCI circuit breaker logging before lazy import, because +# oci modules prints additional message during imports, i.e., the +# set_logger in the LazyImport called after imports will not take +# effect. +logging.getLogger('oci.circuit_breaker').setLevel(logging.WARNING) + CONFIG_PATH = '~/.oci/config' ENV_VAR_OCI_CONFIG = 'OCI_CONFIG' diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index a3651bdba9a..0f55b8a7f17 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -173,6 +173,16 @@ ('available_node_types', 'ray.head.default', 'node_config', 'azure_arm_parameters', 'cloudInitSetupCommands'), ] +# These keys are expected to change when provisioning on an existing cluster, +# but they don't actually represent a change that requires re-provisioning the +# cluster. If the cluster yaml is the same except for these keys, we can safely +# skip reprovisioning. See _deterministic_cluster_yaml_hash. +_RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [ + # On first launch, availability_zones will include all possible zones. Once + # the cluster exists, it will only include the zone that the cluster is + # actually in. + ('provider', 'availability_zone'), +] def is_ip(s: str) -> bool: @@ -1087,7 +1097,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str: yaml file and all the files in the file mounts, then hash the byte sequence. The format of the byte sequence is: - 32 bytes - sha256 hash of the yaml file + 32 bytes - sha256 hash of the yaml for each file mount: file mount remote destination (UTF-8), \0 if the file mount source is a file: @@ -1111,14 +1121,29 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str: we construct it incrementally by using hash.update() to add new bytes. """ + # Load the yaml contents so that we can directly remove keys. 
+ yaml_config = common_utils.read_yaml(yaml_path) + for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH: + dict_to_remove_from = yaml_config + found_key = True + for key in key_list[:-1]: + if (not isinstance(dict_to_remove_from, dict) or + key not in dict_to_remove_from): + found_key = False + break + dict_to_remove_from = dict_to_remove_from[key] + if found_key and key_list[-1] in dict_to_remove_from: + dict_to_remove_from.pop(key_list[-1]) + def _hash_file(path: str) -> bytes: return common_utils.hash_file(path, 'sha256').digest() config_hash = hashlib.sha256() - config_hash.update(_hash_file(yaml_path)) + yaml_hash = hashlib.sha256( + common_utils.dump_yaml_str(yaml_config).encode('utf-8')) + config_hash.update(yaml_hash.digest()) - yaml_config = common_utils.read_yaml(yaml_path) file_mounts = yaml_config.get('file_mounts', {}) # Remove the file mounts added by the newline. if '' in file_mounts: @@ -1126,6 +1151,11 @@ def _hash_file(path: str) -> bytes: file_mounts.pop('') for dst, src in sorted(file_mounts.items()): + if src == yaml_path: + # Skip the yaml file itself. We have already hashed a modified + # version of it. The file may include fields we don't want to hash. + continue + expanded_src = os.path.expanduser(src) config_hash.update(dst.encode('utf-8') + b'\0') diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 0c67ec6b328..8974a0129bd 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1092,7 +1092,7 @@ def _gcp_handler(blocked_resources: Set['resources_lib.Resources'], 'having the required permissions and the user ' 'account does not have enough permission to ' 'update it. 
Please contact your administrator and ' - 'check out: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html\n' # pylint: disable=line-too-long + 'check out: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html\n' # pylint: disable=line-too-long f'Details: {message}') _add_to_blocked_resources( blocked_resources, @@ -1389,8 +1389,7 @@ def _retry_zones( f'in {to_provision.cloud}. ' f'{colorama.Style.RESET_ALL}' f'To request quotas, check the instruction: ' - f'https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html.' # pylint: disable=line-too-long - ) + f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.') for zones in self._yield_zones(to_provision, num_nodes, cluster_name, prev_cluster_status, diff --git a/sky/check.py b/sky/check.py index dcaa349d234..ee5ea77234b 100644 --- a/sky/check.py +++ b/sky/check.py @@ -146,7 +146,7 @@ def get_all_clouds(): dim=True) + click.style(f'sky check{clouds_arg}', bold=True) + '\n' + click.style( 'If any problems remain, refer to detailed docs at: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html', # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html', # pylint: disable=line-too-long dim=True)) if disallowed_clouds_hint: diff --git a/sky/cli.py b/sky/cli.py index edc60d38f01..dc6097fc4d7 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3589,15 +3589,12 @@ def jobs(): default=False, required=False, help='Skip confirmation prompt.') -# TODO(cooperc): remove this flag once --fast can robustly detect cluster -# yaml config changes +# TODO(cooperc): remove this flag before releasing 0.8.0 @click.option('--fast', default=False, is_flag=True, - help='[Experimental] Launch the job faster by skipping ' - 'controller initialization steps. 
If you update SkyPilot or ' - 'your local cloud credentials, they will not be reflected until ' - 'you run `sky jobs launch` at least once without this flag.') + help=('[Deprecated] Does nothing. Previous flag behavior is now ' + 'enabled by default.')) @timeline.event @usage_lib.entrypoint def jobs_launch( @@ -3666,6 +3663,16 @@ def jobs_launch( job_recovery=job_recovery, ) + # Deprecation. The default behavior is fast, and the flag will be removed. + # The flag was not present in 0.7.x (only nightly), so we will remove before + # 0.8.0 so that it never enters a stable release. + if fast: + click.secho( + 'Flag --fast is deprecated, as the behavior is now default. The ' + 'flag will be removed soon. Please do not use it, so that you ' + 'avoid "No such option" errors.', + fg='yellow') + if not isinstance(task_or_dag, sky.Dag): assert isinstance(task_or_dag, sky.Task), task_or_dag with sky.Dag() as dag: @@ -3704,7 +3711,7 @@ def jobs_launch( common_utils.check_cluster_name_is_valid(name) - managed_jobs.launch(dag, name, detach_run=detach_run, fast=fast) + managed_jobs.launch(dag, name, detach_run=detach_run) @jobs.command('queue', cls=_DocumentedCodeCommand) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index c42d67f8ba4..cafc789c5be 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -617,7 +617,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: 'Failed to fetch the availability zones for the account ' f'{identity_str}. 
It is likely due to permission issues, please' ' check the minimal permission required for AWS: ' - 'https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable= + 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable= f'\n{cls._INDENT_PREFIX}Details: ' f'{common_utils.format_exception(e, use_bracket=True)}') return True, hints diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 145a5d1c26e..25d285da185 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -42,8 +42,7 @@ class Cudo(clouds.Cloud): f'{_INDENT_PREFIX} $ cudoctl init\n' f'{_INDENT_PREFIX}For more info: ' # pylint: disable=line-too-long - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html' - ) + 'https://docs.skypilot.co/en/latest/getting-started/installation.html') _PROJECT_HINT = ( 'Create a project and then set it as the default project,:\n' @@ -51,8 +50,7 @@ class Cudo(clouds.Cloud): f'{_INDENT_PREFIX} $ cudoctl init\n' f'{_INDENT_PREFIX}For more info: ' # pylint: disable=line-too-long - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html' - ) + 'https://docs.skypilot.co/en/latest/getting-started/installation.html') _CLOUD_UNSUPPORTED_FEATURES = { clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.', diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 8a28a35505e..c0f22cc860b 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -167,7 +167,7 @@ class GCP(clouds.Cloud): # ~/.config/gcloud/application_default_credentials.json. 
f'{_INDENT_PREFIX} $ gcloud auth application-default login\n' f'{_INDENT_PREFIX}For more info: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long ) _APPLICATION_CREDENTIAL_HINT = ( 'Run the following commands:\n' @@ -175,7 +175,7 @@ class GCP(clouds.Cloud): f'{_INDENT_PREFIX}Or set the environment variable GOOGLE_APPLICATION_CREDENTIALS ' 'to the path of your service account key file.\n' f'{_INDENT_PREFIX}For more info: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long ) _SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier) @@ -836,7 +836,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: 'The following permissions are not enabled for the current ' f'GCP identity ({identity_str}):\n ' f'{diffs}\n ' - 'For more details, visit: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long + 'For more details, visit: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long return True, None def get_credential_file_mounts(self) -> Dict[str, str]: diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 471639626eb..65b50042aba 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -395,7 +395,7 @@ def make_deploy_resources_variables( tpu_requested = True k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY else: - k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY + k8s_resource_key = kubernetes_utils.get_gpu_resource_key() port_mode = network_utils.get_port_mode(None) diff --git 
a/sky/clouds/oci.py b/sky/clouds/oci.py index 95f4efe95e3..d4ae6f298d2 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -390,7 +390,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: short_credential_help_str = ( 'For more details, refer to: ' # pylint: disable=line-too-long - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci' + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#oracle-cloud-infrastructure-oci' ) credential_help_str = ( 'To configure credentials, go to: ' diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index 69a0d69ca61..dc309d9c9dd 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -258,7 +258,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: return False, ( 'Failed to access Paperspace Cloud with credentials.\n ' 'To configure credentials, follow the instructions at: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#paperspace\n ' + 'https://docs.skypilot.co/en/latest/getting-started/installation.html#paperspace\n ' 'Generate API key and create a json at `~/.paperspace/config.json` with \n ' ' {"apiKey": "[YOUR API KEY]"}\n ' f'Reason: {str(e)}') diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index 6ddbfe0f1e9..b1cc016abd9 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -253,7 +253,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: ' Credentials can be set up by running: \n' f' $ pip install runpod \n' f' $ runpod config\n' - ' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long + ' For more information, see https://docs.skypilot.co/en/latest/getting-started/installation.html#runpod' # pylint: disable=line-too-long ) return True, None diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 
67c6e09b27e..29df92d7535 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -296,7 +296,7 @@ def _get_all_supported_regions_str() -> str: faq_msg = ( '\nIf a region is not included in the following ' 'list, please check the FAQ docs for how to fetch ' - 'its catalog info.\nhttps://skypilot.readthedocs.io' + 'its catalog info.\nhttps://docs.skypilot.co' '/en/latest/reference/faq.html#advanced-how-to-' 'make-skypilot-use-all-global-regions') error_msg += faq_msg + _get_all_supported_regions_str() diff --git a/sky/clouds/service_catalog/constants.py b/sky/clouds/service_catalog/constants.py index 1373fd86a03..a125258ac35 100644 --- a/sky/clouds/service_catalog/constants.py +++ b/sky/clouds/service_catalog/constants.py @@ -1,6 +1,6 @@ """Constants used for service catalog.""" HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long -CATALOG_SCHEMA_VERSION = 'v5' +CATALOG_SCHEMA_VERSION = 'v6' CATALOG_DIR = '~/.sky/catalogs' ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci', 'kubernetes', 'runpod', 'vsphere', 'cudo', 'fluidstack', diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 8521f6786cc..a83e00d8196 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -292,7 +292,9 @@ def get_instance_type_for_accelerator( if acc_name in _ACC_INSTANCE_TYPE_DICTS: df = _df[_df['InstanceType'].notna()] - instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name][acc_count] + instance_types = _ACC_INSTANCE_TYPE_DICTS[acc_name].get(acc_count, None) + if instance_types is None: + return None, [] df = df[df['InstanceType'].isin(instance_types)] # Check the cpus and memory specified by the user. 
diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 92e62a8a240..1fd76400c9f 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -266,7 +266,7 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: 'Run the following commands:' f'\n{cls._INDENT_PREFIX} $ pip install skypilot[vSphere]' f'\n{cls._INDENT_PREFIX}Credentials may also need to be set. ' - 'For more details. See https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vmware-vsphere' # pylint: disable=line-too-long + 'For more details. See https://docs.skypilot.co/en/latest/getting-started/installation.html#vmware-vsphere' # pylint: disable=line-too-long f'{common_utils.format_exception(e, use_bracket=True)}') required_keys = ['name', 'username', 'password', 'clusters'] diff --git a/sky/data/storage.py b/sky/data/storage.py index 897f2f96b94..d3d18a9d18f 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -1157,7 +1157,7 @@ def _validate(self): 'Storage \'store: s3\' specified, but ' \ 'AWS access is disabled. To fix, enable '\ 'AWS by running `sky check`. More info: '\ - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long ) @classmethod @@ -1588,7 +1588,7 @@ def _validate(self): 'Storage \'store: gcs\' specified, but ' 'GCP access is disabled. To fix, enable ' 'GCP by running `sky check`. ' - 'More info: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.') # pylint: disable=line-too-long + 'More info: https://docs.skypilot.co/en/latest/getting-started/installation.html.') # pylint: disable=line-too-long @classmethod def validate_name(cls, name: str) -> str: @@ -2110,7 +2110,7 @@ def _validate(self): 'Storage "store: azure" specified, but ' 'Azure access is disabled. To fix, enable ' 'Azure by running `sky check`. 
More info: ' - 'https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long ) @classmethod @@ -2813,7 +2813,7 @@ def _validate(self): 'Storage \'store: r2\' specified, but ' \ 'Cloudflare R2 access is disabled. To fix, '\ 'enable Cloudflare R2 by running `sky check`. '\ - 'More info: https://skypilot.readthedocs.io/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long + 'More info: https://docs.skypilot.co/en/latest/getting-started/installation.html.' # pylint: disable=line-too-long ) def initialize(self): diff --git a/sky/jobs/core.py b/sky/jobs/core.py index e675e2120d1..d47922d64ce 100644 --- a/sky/jobs/core.py +++ b/sky/jobs/core.py @@ -37,11 +37,12 @@ @timeline.event @usage_lib.entrypoint def launch( - task: Union['sky.Task', 'sky.Dag'], - name: Optional[str] = None, - stream_logs: bool = True, - detach_run: bool = False, - fast: bool = False, + task: Union['sky.Task', 'sky.Dag'], + name: Optional[str] = None, + stream_logs: bool = True, + detach_run: bool = False, + # TODO(cooperc): remove fast arg before 0.8.0 + fast: bool = True, # pylint: disable=unused-argument for compatibility ) -> None: # NOTE(dev): Keep the docstring consistent between the Python API and CLI. """Launch a managed job. @@ -53,9 +54,8 @@ def launch( managed job. name: Name of the managed job. detach_run: Whether to detach the run. - fast: Whether to use sky.launch(fast=True) for the jobs controller. If - True, the SkyPilot wheel and the cloud credentials may not be updated - on the jobs controller. + fast: [Deprecated] Does nothing, and will be removed soon. We will + always use fast mode as it's fully safe now. Raises: ValueError: cluster does not exist. Or, the entrypoint is not a valid @@ -147,7 +147,7 @@ def launch( idle_minutes_to_autostop=skylet_constants. 
CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP, retry_until_up=True, - fast=fast, + fast=True, _disable_controller_check=True) diff --git a/sky/provision/gcp/constants.py b/sky/provision/gcp/constants.py index 4f442709b0c..7b3fd4046b5 100644 --- a/sky/provision/gcp/constants.py +++ b/sky/provision/gcp/constants.py @@ -142,7 +142,7 @@ ] # A list of permissions required to run SkyPilot on GCP. -# Keep this in sync with https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long +# Keep this in sync with https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html # pylint: disable=line-too-long VM_MINIMAL_PERMISSIONS = [ 'compute.disks.create', 'compute.disks.list', diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 2b13e78fdf8..c431b023ab9 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -180,6 +180,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): # case we will need to update this logic. # TODO(Doyoung): Update the error message raised # with the multi-host TPU support. 
+ gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long if 'Insufficient google.com/tpu' in event_message: extra_msg = ( f'Verify if ' @@ -192,14 +193,15 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): pod, extra_msg, details=event_message)) - elif (('Insufficient nvidia.com/gpu' + elif ((f'Insufficient {gpu_resource_key}' in event_message) or ('didn\'t match Pod\'s node affinity/selector' in event_message)): extra_msg = ( - f'Verify if ' - f'{pod.spec.node_selector[label_key]}' - ' is available in the cluster.') + f'Verify if any node matching label ' + f'{pod.spec.node_selector[label_key]} and ' + f'sufficient resource {gpu_resource_key} ' + f'is available in the cluster.') raise config_lib.KubernetesError( _lack_resource_msg('GPU', pod, @@ -722,13 +724,13 @@ def _create_pods(region: str, cluster_name_on_cloud: str, 'Continuing without using nvidia RuntimeClass.\n' 'If you are on a K3s cluster, manually ' 'override runtimeClassName in ~/.sky/config.yaml. ' - 'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long + 'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long needs_gpus = False limits = pod_spec['spec']['containers'][0].get('resources', {}).get('limits') if limits is not None: - needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0 + needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0 # TPU pods provisioned on GKE use the default containerd runtime. 
# Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long @@ -879,27 +881,62 @@ def _terminate_node(namespace: str, context: Optional[str], pod_name: str) -> None: """Terminate a pod.""" logger.debug('terminate_instances: calling delete_namespaced_pod') - try: - kubernetes.core_api(context).delete_namespaced_service( - pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT) - kubernetes.core_api(context).delete_namespaced_service( - f'{pod_name}-ssh', - namespace, - _request_timeout=config_lib.DELETION_TIMEOUT) - except kubernetes.api_exception(): - pass + + def _delete_k8s_resource_with_retry(delete_func: Callable, + resource_type: str, + resource_name: str) -> None: + """Helper to delete Kubernetes resources with 404 handling and retries. + + Args: + delete_func: Function to call to delete the resource + resource_type: Type of resource being deleted (e.g. 'service'), + used in logging + resource_name: Name of the resource being deleted, used in logging + """ + max_retries = 3 + retry_delay = 5 # seconds + + for attempt in range(max_retries): + try: + delete_func() + return + except kubernetes.api_exception() as e: + if e.status == 404: + logger.warning( + f'terminate_instances: Tried to delete {resource_type} ' + f'{resource_name}, but the {resource_type} was not ' + 'found (404).') + return + elif attempt < max_retries - 1: + logger.warning(f'terminate_instances: Failed to delete ' + f'{resource_type} {resource_name} (attempt ' + f'{attempt + 1}/{max_retries}). Error: {e}. ' + f'Retrying in {retry_delay} seconds...') + time.sleep(retry_delay) + else: + raise + + # Delete services for the pod + for service_name in [pod_name, f'{pod_name}-ssh']: + _delete_k8s_resource_with_retry( + delete_func=lambda name=service_name: kubernetes.core_api( + context).delete_namespaced_service(name=name, + namespace=namespace, + _request_timeout=config_lib. 
+ DELETION_TIMEOUT), + resource_type='service', + resource_name=service_name) + # Note - delete pod after all other resources are deleted. # This is to ensure there are no leftover resources if this down is run # from within the pod, e.g., for autodown. - try: - kubernetes.core_api(context).delete_namespaced_pod( - pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT) - except kubernetes.api_exception() as e: - if e.status == 404: - logger.warning('terminate_instances: Tried to delete pod ' - f'{pod_name}, but the pod was not found (404).') - else: - raise + _delete_k8s_resource_with_retry( + delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod( + name=pod_name, + namespace=namespace, + _request_timeout=config_lib.DELETION_TIMEOUT), + resource_type='pod', + resource_name=pod_name) def terminate_instances( diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 7442c9be7a6..5150cc5860b 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -438,7 +438,7 @@ def detect_accelerator_resource( nodes = get_kubernetes_nodes(context) for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or + has_accelerator = (get_gpu_resource_key() in cluster_resources or TPU_RESOURCE_KEY in cluster_resources) return has_accelerator, cluster_resources @@ -972,7 +972,7 @@ def is_kubeconfig_exec_auth( '~/.sky/config.yaml:\n' ' kubernetes:\n' ' remote_identity: SERVICE_ACCOUNT\n' - ' More: https://skypilot.readthedocs.io/en/latest/' + ' More: https://docs.skypilot.co/en/latest/' 'reference/config.html') return True, exec_msg return False, None @@ -2253,10 +2253,11 @@ def get_node_accelerator_count(attribute_dict: dict) -> int: Number of accelerators allocated or available from the node. If no resource is found, it returns 0. 
""" - assert not (GPU_RESOURCE_KEY in attribute_dict and + gpu_resource_name = get_gpu_resource_key() + assert not (gpu_resource_name in attribute_dict and TPU_RESOURCE_KEY in attribute_dict) - if GPU_RESOURCE_KEY in attribute_dict: - return int(attribute_dict[GPU_RESOURCE_KEY]) + if gpu_resource_name in attribute_dict: + return int(attribute_dict[gpu_resource_name]) elif TPU_RESOURCE_KEY in attribute_dict: return int(attribute_dict[TPU_RESOURCE_KEY]) return 0 @@ -2415,3 +2416,18 @@ def process_skypilot_pods( num_pods = len(cluster.pods) cluster.resources_str = f'{num_pods}x {cluster.resources}' return list(clusters.values()), jobs_controllers, serve_controllers + + +def get_gpu_resource_key(): + """Get the GPU resource name to use in kubernetes. + The function first checks for an environment variable. + If defined, it uses its value; otherwise, it returns the default value. + Args: + name (str): Default GPU resource name, default is "nvidia.com/gpu". + Returns: + str: The selected GPU resource name. + """ + # Retrieve GPU resource name from environment variable, if set. + # Else use default. + # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc. 
+ return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY) diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index 121f96d8e8b..0770da28c43 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -186,6 +186,6 @@ def parse_readme(readme: str) -> str: 'Homepage': 'https://github.com/skypilot-org/skypilot', 'Issues': 'https://github.com/skypilot-org/skypilot/issues', 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions', - 'Documentation': 'https://skypilot.readthedocs.io/en/latest/', + 'Documentation': 'https://docs.skypilot.co/', }, ) diff --git a/sky/skypilot_config.py b/sky/skypilot_config.py index aae62afc616..e973754f4c9 100644 --- a/sky/skypilot_config.py +++ b/sky/skypilot_config.py @@ -238,7 +238,7 @@ def _try_load_config() -> None: _dict, schemas.get_config_schema(), f'Invalid config YAML ({config_path}). See: ' - 'https://skypilot.readthedocs.io/en/latest/reference/config.html. ' # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/reference/config.html. ' # pylint: disable=line-too-long 'Error: ', skip_none=False) diff --git a/sky/utils/kubernetes/deploy_remote_cluster.sh b/sky/utils/kubernetes/deploy_remote_cluster.sh index 94736474289..8d7ba3e5729 100755 --- a/sky/utils/kubernetes/deploy_remote_cluster.sh +++ b/sky/utils/kubernetes/deploy_remote_cluster.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Refer to https://skypilot.readthedocs.io/en/latest/reservations/existing-machines.html for details on how to use this script. +# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script. 
set -e # Colors for nicer UX diff --git a/sky/utils/kubernetes/gpu_labeler.py b/sky/utils/kubernetes/gpu_labeler.py index 14fbbdedca5..6877c94a2a8 100644 --- a/sky/utils/kubernetes/gpu_labeler.py +++ b/sky/utils/kubernetes/gpu_labeler.py @@ -101,7 +101,7 @@ def label(): # Get the list of nodes with GPUs gpu_nodes = [] for node in nodes: - if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity: + if kubernetes_utils.get_gpu_resource_key() in node.status.capacity: gpu_nodes.append(node) print(f'Found {len(gpu_nodes)} GPU nodes in the cluster') @@ -115,7 +115,7 @@ def label(): print('Continuing without using nvidia RuntimeClass. ' 'This may fail on K3s clusters. ' 'For more details, refer to K3s deployment notes at: ' - 'https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html') # pylint: disable=line-too-long + 'https://docs.skypilot.co/en/latest/reference/kubernetes/kubernetes-setup.html') # pylint: disable=line-too-long nvidia_exists = False if nvidia_exists: @@ -142,7 +142,7 @@ def label(): if len(gpu_nodes) == 0: print('No GPU nodes found in the cluster. If you have GPU nodes, ' 'please ensure that they have the label ' - f'`{kubernetes_utils.GPU_RESOURCE_KEY}: `') + f'`{kubernetes_utils.get_gpu_resource_key()}: `') else: print('GPU labeling started - this may take 10 min or more to complete.' '\nTo check the status of GPU labeling jobs, run ' diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 0255884ae30..18b82c649e7 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -1299,6 +1299,34 @@ def test_use_spot(generic_cloud: str): smoke_tests_utils.run_one_test(test) +@pytest.mark.azure +def test_azure_spot_instance_verification(): + """Test Azure spot instance provisioning with explicit verification. + This test verifies that when --use-spot is specified for Azure: + 1. The cluster launches successfully + 2. 
The instances are actually provisioned as spot instances + """ + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( + 'azure-spot-verification', + [ + f'sky launch -c {name} --cloud azure tests/test_yamls/minimal.yaml --use-spot -y', + f'sky logs {name} 1 --status', f'TARGET_VM_NAME="{name}"; ' + 'VM_INFO=$(az vm list --query "[?contains(name, \'$TARGET_VM_NAME\')].{Name:name, ResourceGroup:resourceGroup}" -o tsv); ' + '[[ -z "$VM_INFO" ]] && exit 1; ' + 'FULL_VM_NAME=$(echo "$VM_INFO" | awk \'{print $1}\'); ' + 'RESOURCE_GROUP=$(echo "$VM_INFO" | awk \'{print $2}\'); ' + 'VM_DETAILS=$(az vm list --resource-group "$RESOURCE_GROUP" ' + '--query "[?name==\'$FULL_VM_NAME\'].{Name:name, Location:location, Priority:priority}" -o table); ' + '[[ -z "$VM_DETAILS" ]] && exit 1; ' + 'echo "VM Details:"; echo "$VM_DETAILS"; ' + 'echo "$VM_DETAILS" | grep -qw "Spot" && exit 0 || exit 1' + ], + f'sky down -y {name}', + ) + smoke_tests_utils.run_one_test(test) + + @pytest.mark.gcp def test_stop_gcp_spot(): """Test GCP spot can be stopped, autostopped, restarted.""" diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_quick_tests_core.py similarity index 63% rename from tests/smoke_tests/test_pre_merge.py rename to tests/smoke_tests/test_quick_tests_core.py index 4890ac15ce4..48df4ef9a2b 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_quick_tests_core.py @@ -1,23 +1,27 @@ # Smoke tests for SkyPilot required before merging +# If the change includes an interface modification or touches the core API, +# the reviewer could decide it’s necessary to trigger a pre-merge test and +# leave a comment /quicktest-core will then trigger this test. 
+# # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/smoke_tests/test_pre_merge.py +# > pytest tests/smoke_tests/test_quick_tests_core.py # # Terminate failed clusters after test finishes -# > pytest tests/smoke_tests/test_pre_merge.py --terminate-on-failure +# > pytest tests/smoke_tests/test_quick_tests_core.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/smoke_tests/test_pre_merge.py::test_yaml_launch_and_mount +# > pytest tests/smoke_tests/test_quick_tests_core.py::test_yaml_launch_and_mount # # Only run test for AWS + generic tests -# > pytest tests/smoke_tests/test_pre_merge.py --aws +# > pytest tests/smoke_tests/test_quick_tests_core.py --aws # # Change cloud for generic tests to aws -# > pytest tests/smoke_tests/test_pre_merge.py --generic-cloud aws +# > pytest tests/smoke_tests/test_quick_tests_core.py --generic-cloud aws from smoke_tests import smoke_tests_utils @@ -29,7 +33,7 @@ def test_yaml_launch_and_mount(generic_cloud: str): test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', [ - f'sky launch -y -c {name} tests/test_yamls/minimal_test_pre_merge.yaml', + f'sky launch -y -c {name} tests/test_yamls/minimal_test_quick_tests_core.yaml', smoke_tests_utils. get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, diff --git a/tests/test_yamls/minimal_test_pre_merge.yaml b/tests/test_yamls/minimal_test_quick_tests_core.yaml similarity index 62% rename from tests/test_yamls/minimal_test_pre_merge.yaml rename to tests/test_yamls/minimal_test_quick_tests_core.yaml index 583575bee5c..15857e972dd 100644 --- a/tests/test_yamls/minimal_test_pre_merge.yaml +++ b/tests/test_yamls/minimal_test_quick_tests_core.yaml @@ -10,4 +10,4 @@ workdir: . 
def test_oci_circuit_breaker_logging():
    """Test that OCI circuit breaker logging is properly configured.

    A record-collecting handler is attached to the ``oci.circuit_breaker``
    logger. The test then checks that:

    * the logger's effective level is WARNING before the OCI module is
      touched,
    * the effective level is still WARNING after an attribute of the lazily
      imported OCI module has been accessed, and
    * no record whose message contains 'Circuit breaker' reached the
      handler in between.

    An ImportError/AttributeError raised while touching the OCI module is
    tolerated, which is the expected outcome when the OCI SDK is not
    installed.
    """

    class _RecordCollector(logging.Handler):
        """Handler that keeps every record it receives for later checks."""

        def __init__(self):
            super().__init__()
            self.records = []

        def emit(self, record):
            self.records.append(record)

    # Get the circuit breaker logger and start capturing its records.
    logger = logging.getLogger('oci.circuit_breaker')
    collector = _RecordCollector()
    logger.addHandler(collector)

    try:
        # Verify logger starts at WARNING level (set by adaptor initialization)
        initial_level = logger.getEffectiveLevel()
        print(
            f'Initial logger level: {initial_level} (WARNING={logging.WARNING})'
        )
        assert initial_level == logging.WARNING, (
            'OCI circuit breaker logger should be set to WARNING before initialization'
        )

        # Force OCI module import through LazyImport by accessing a module
        # attribute.
        print('Attempting to import OCI module...')
        try:
            # This will trigger LazyImport's load_module for the actual OCI
            # module.
            _ = oci.oci.config.DEFAULT_LOCATION
        except (ImportError, AttributeError) as e:
            # Expected when OCI SDK is not installed
            print(f'Import/Attribute error as expected: {e}')

        # Verify logger level after import attempt
        after_level = logger.getEffectiveLevel()
        print(
            f'Logger level after import: {after_level} (WARNING={logging.WARNING})'
        )
        assert after_level == logging.WARNING, (
            'OCI circuit breaker logger should remain at WARNING after initialization'
        )

        # Verify no circuit breaker logs were emitted
        circuit_breaker_logs = [
            record for record in collector.records
            if 'Circuit breaker' in record.getMessage()
        ]
        assert not circuit_breaker_logs, (
            'No circuit breaker logs should be emitted during initialization')
    finally:
        # Always detach the handler so the shared logger is left unchanged
        # for other tests.
        logger.removeHandler(collector)