Skip to content

Commit

Permalink
Merge branch 'master' into dev/zeping/show_logs_for_storage_mount
Browse files Browse the repository at this point in the history
  • Loading branch information
zpoint committed Dec 2, 2024
2 parents 5dc9a82 + 23f9821 commit a42f8b8
Show file tree
Hide file tree
Showing 52 changed files with 1,430 additions and 477 deletions.
74 changes: 74 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Ensure this configuration aligns with format.sh and requirements.txt
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files

- repo: https://github.com/psf/black
rev: 22.10.0 # Match the version from requirements
hooks:
- id: black
name: black (IBM specific)
files: "^sky/skylet/providers/ibm/.*" # Match only files in the IBM directory

- repo: https://github.com/pycqa/isort
rev: 5.12.0 # Match the version from requirements
hooks:
# First isort command
- id: isort
name: isort (general)
args:
- "--sg=build/**" # Matches "${ISORT_YAPF_EXCLUDES[@]}"
- "--sg=sky/skylet/providers/ibm/**"
files: "^(sky|tests|examples|llm|docs)/.*" # Only match these directories
# Second isort command
- id: isort
name: isort (IBM specific)
args:
- "--profile=black"
- "-l=88"
- "-m=3"
files: "^sky/skylet/providers/ibm/.*" # Only match IBM-specific directory

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.991 # Match the version from requirements
hooks:
- id: mypy
args:
# From tests/mypy_files.txt
- "sky"
- "--exclude"
- "sky/benchmark|sky/callbacks|sky/skylet/providers/azure|sky/resources.py|sky/backends/monkey_patches"
pass_filenames: false
additional_dependencies:
- types-PyYAML
- types-requests<2.31 # Match the condition in requirements.txt
- types-setuptools
- types-cachetools
- types-pyvmomi

- repo: https://github.com/google/yapf
rev: v0.32.0 # Match the version from requirements
hooks:
- id: yapf
name: yapf
exclude: (build/.*|sky/skylet/providers/ibm/.*) # Matches exclusions from the script
args: ['--recursive', '--parallel'] # Only necessary flags
additional_dependencies: [toml==0.10.2]

- repo: https://github.com/pylint-dev/pylint
rev: v2.14.5 # Match the version from requirements
hooks:
- id: pylint
additional_dependencies:
- pylint-quotes==0.2.3 # Match the version from requirements
name: pylint
args:
- --rcfile=.pylintrc # Use your custom pylint configuration
- --load-plugins=pylint_quotes # Load the pylint-quotes plugin
files: ^sky/ # Only include files from the 'sky/' directory
exclude: ^sky/skylet/providers/ibm/
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ It has some convenience features which you might find helpful (see [Dockerfile](
- If relevant, add tests for your changes. For changes that touch the core system, run the [smoke tests](#testing) and ensure they pass.
- Follow the [Google style guide](https://google.github.io/styleguide/pyguide.html).
- Ensure code is properly formatted by running [`format.sh`](https://github.com/skypilot-org/skypilot/blob/master/format.sh).
- [Optional] You can also install pre-commit hooks by running `pre-commit install` to automatically format your code on commit.
- Push your changes to your fork and open a pull request in the SkyPilot repository.
- In the PR description, write a `Tested:` section to describe relevant tests performed.

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile_k8s
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ARG DEBIAN_FRONTEND=noninteractive

# Initialize conda for root user, install ssh and other local dependencies
RUN apt update -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat curl -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse socat netcat-openbsd curl -y && \
rm -rf /var/lib/apt/lists/* && \
apt remove -y python3 && \
conda init
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile_k8s_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ ARG DEBIAN_FRONTEND=noninteractive
# We remove cuda lists to avoid conflicts with the cuda version installed by ray
RUN rm -rf /etc/apt/sources.list.d/cuda* && \
apt update -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat curl -y && \
apt install git gcc rsync sudo patch openssh-server pciutils nano fuse unzip socat netcat-openbsd curl -y && \
rm -rf /var/lib/apt/lists/*

# Setup SSH and generate hostkeys
Expand Down Expand Up @@ -36,6 +36,7 @@ SHELL ["/bin/bash", "-c"]

# Install conda and other dependencies
# Keep the conda and Ray versions below in sync with the ones in skylet.constants
# Keep this section in sync with the custom image optimization recommendations in our docs (kubernetes-getting-started.rst)
RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && \
bash Miniconda3-Linux-x86_64.sh -b && \
eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \
Expand Down
8 changes: 8 additions & 0 deletions docs/source/getting-started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,14 @@ The :code:`~/.oci/config` file should contain the following fields:
# Note that we should avoid using full home path for the key_file configuration, e.g. use ~/.oci instead of /home/username/.oci
key_file=~/.oci/oci_api_key.pem
By default, the provisioned nodes will be in the root `compartment <https://docs.oracle.com/en/cloud/foundation/cloud_architecture/governance/compartments.html>`__. To use a `compartment <https://docs.oracle.com/en/cloud/foundation/cloud_architecture/governance/compartments.html>`_ other than root, create or edit the file :code:`~/.sky/config.yaml` and put the compartment's OCID there, as follows:

.. code-block:: text
oci:
default:
compartment_ocid: ocid1.compartment.oc1..aaaaaaaa......
Lambda Cloud
~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 1 addition & 1 deletion docs/source/reference/comparison.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ SkyPilot provides faster iteration for interactive development. For example, a c
* :strong:`With SkyPilot, a single command (`:literal:`sky launch`:strong:`) takes care of everything.` Behind the scenes, SkyPilot provisions pods, installs all required dependencies, executes the job, returns logs, and provides SSH and VSCode access to debug.


.. figure:: https://blog.skypilot.co/ai-on-kubernetes/images/k8s_vs_skypilot_iterative_v2.png
.. figure:: https://i.imgur.com/xfCfz4N.png
:align: center
:width: 95%
:alt: Iterative Development with Kubernetes vs SkyPilot
Expand Down
29 changes: 29 additions & 0 deletions docs/source/reference/kubernetes/kubernetes-getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,32 @@ FAQs
type: Directory
For more details refer to :ref:`config-yaml`.

* **I am using a custom image. How can I speed up the pod startup time?**

You can pre-install SkyPilot dependencies in your custom image to speed up the pod startup time. Simply add these lines at the end of your Dockerfile:

.. code-block:: dockerfile
FROM <your base image>
# Install system dependencies
RUN apt update -y && \
apt install git gcc rsync sudo patch openssh-server pciutils fuse unzip socat netcat-openbsd curl -y && \
rm -rf /var/lib/apt/lists/*
# Install conda and other python dependencies
RUN curl https://repo.anaconda.com/miniconda/Miniconda3-py310_23.11.0-2-Linux-x86_64.sh -o Miniconda3-Linux-x86_64.sh && \
bash Miniconda3-Linux-x86_64.sh -b && \
eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true && conda activate base && \
grep "# >>> conda initialize >>>" ~/.bashrc || { conda init && source ~/.bashrc; } && \
rm Miniconda3-Linux-x86_64.sh && \
export PIP_DISABLE_PIP_VERSION_CHECK=1 && \
python3 -m venv ~/skypilot-runtime && \
PYTHON_EXEC=$(echo ~/skypilot-runtime)/bin/python && \
$PYTHON_EXEC -m pip install 'skypilot-nightly[remote,kubernetes]' 'ray[default]==2.9.3' 'pycryptodome==3.12.0' && \
$PYTHON_EXEC -m pip uninstall skypilot-nightly -y && \
curl -LO "https://dl.k8s.io/release/v1.28.11/bin/linux/amd64/kubectl" && \
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \
echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def update_current_kubernetes_clusters_from_registry():
def get_allowed_contexts():
"""Mock implementation of getting allowed kubernetes contexts."""
from sky.provision.kubernetes import utils
contexts = utils.get_all_kube_config_context_names()
contexts = utils.get_all_kube_context_names()
return contexts[:2]


Expand Down
4 changes: 2 additions & 2 deletions examples/oci/serve-qwen-7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ resources:
setup: |
conda create -n vllm python=3.12 -y
conda activate vllm
pip install vllm
pip install vllm-flash-attn
pip install vllm==0.6.3.post1
pip install vllm-flash-attn==2.6.2
run: |
conda activate vllm
Expand Down
44 changes: 34 additions & 10 deletions sky/adaptors/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@
# Timeout to use for API calls
API_TIMEOUT = 5

DEFAULT_IN_CLUSTER_REGION = 'in-cluster'
# The name for the environment variable that stores the in-cluster context name
# for Kubernetes clusters. This is used to associate a name with the current
# context when running with in-cluster auth. If not set, the context name is
# set to DEFAULT_IN_CLUSTER_REGION.
IN_CLUSTER_CONTEXT_NAME_ENV_VAR = 'SKYPILOT_IN_CLUSTER_CONTEXT_NAME'


def _decorate_methods(obj: Any, decorator: Callable, decoration_type: str):
for attr_name in dir(obj):
Expand Down Expand Up @@ -57,16 +64,8 @@ def wrapped(*args, **kwargs):

def _load_config(context: Optional[str] = None):
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
try:
# Load in-cluster config if running in a pod
# Kubernetes set environment variables for service discovery do not
# show up in SkyPilot tasks. For now, we work around by using
# DNS name instead of environment variables.
# See issue: https://github.com/skypilot-org/skypilot/issues/2287
os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
os.environ['KUBERNETES_SERVICE_PORT'] = '443'
kubernetes.config.load_incluster_config()
except kubernetes.config.config_exception.ConfigException:

def _load_config_from_kubeconfig(context: Optional[str] = None):
try:
kubernetes.config.load_kube_config(context=context)
except kubernetes.config.config_exception.ConfigException as e:
Expand All @@ -90,6 +89,21 @@ def _load_config(context: Optional[str] = None):
with ux_utils.print_exception_no_traceback():
raise ValueError(err_str) from None

if context == in_cluster_context_name() or context is None:
try:
# Load in-cluster config if running in a pod and the context is either
# None or the in-cluster context name.
# Environment variables that Kubernetes sets for service discovery do not
# show up in SkyPilot tasks. For now, we work around by using
# DNS name instead of environment variables.
# See issue: https://github.com/skypilot-org/skypilot/issues/2287
os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
os.environ['KUBERNETES_SERVICE_PORT'] = '443'
kubernetes.config.load_incluster_config()
except kubernetes.config.config_exception.ConfigException:
_load_config_from_kubeconfig()
else:
_load_config_from_kubeconfig(context)


@_api_logging_decorator('urllib3', logging.ERROR)
@functools.lru_cache()
Expand Down Expand Up @@ -154,3 +168,13 @@ def max_retry_error():

def stream():
return kubernetes.stream.stream


def in_cluster_context_name() -> Optional[str]:
    """Return the context name to use for in-cluster authentication.

    The name is read from the environment variable whose name is stored in
    IN_CLUSTER_CONTEXT_NAME_ENV_VAR. When that variable is unset or empty,
    DEFAULT_IN_CLUSTER_REGION is returned instead.
    """
    configured_name = os.environ.get(IN_CLUSTER_CONTEXT_NAME_ENV_VAR)
    if configured_name:
        return configured_name
    # Unset and empty-string values both fall back to the default name.
    return DEFAULT_IN_CLUSTER_REGION
4 changes: 2 additions & 2 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,8 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
context = config['provider'].get(
'context', kubernetes_utils.get_current_kube_config_context_name())
if context == kubernetes_utils.IN_CLUSTER_REGION:
# If the context is set to IN_CLUSTER_REGION, we are running in a pod
if context == kubernetes.in_cluster_context_name():
# If the context is an in-cluster context name, we are running in a pod
# with in-cluster configuration. We need to set the context to None
# to use the mounted service account.
context = None
Expand Down
42 changes: 26 additions & 16 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ def write_cluster_config(
resources_utils.ClusterName(
cluster_name,
cluster_name_on_cloud,
), region, zones, dryrun)
), region, zones, num_nodes, dryrun)
config_dict = {}

specific_reservations = set(
Expand Down Expand Up @@ -730,7 +730,12 @@ def write_cluster_config(
f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
'is not supported by this cloud. Remove the config or set: '
'`remote_identity: LOCAL_CREDENTIALS`.')
excluded_clouds.add(cloud)
if isinstance(cloud, clouds.Kubernetes):
if skypilot_config.get_nested(
('kubernetes', 'allowed_contexts'), None) is None:
excluded_clouds.add(cloud)
else:
excluded_clouds.add(cloud)

for cloud_str, cloud_obj in cloud_registry.CLOUD_REGISTRY.items():
remote_identity_config = skypilot_config.get_nested(
Expand Down Expand Up @@ -844,7 +849,11 @@ def write_cluster_config(
'{sky_wheel_hash}',
wheel_hash).replace('{cloud}',
str(cloud).lower())),

'skypilot_wheel_installation_commands':
constants.SKYPILOT_WHEEL_INSTALLATION_COMMANDS.replace(
'{sky_wheel_hash}',
wheel_hash).replace('{cloud}',
str(cloud).lower()),
# Port of Ray (GCS server).
# Ray's default port 6379 is conflicted with Redis.
'ray_port': constants.SKY_REMOTE_RAY_PORT,
Expand Down Expand Up @@ -1190,18 +1199,18 @@ def ssh_credential_from_yaml(


def parallel_data_transfer_to_nodes(
runners: List[command_runner.CommandRunner],
source: Optional[str],
target: str,
cmd: Optional[str],
run_rsync: bool,
*,
action_message: str,
# Advanced options.
log_path: str = os.devnull,
stream_logs: bool = False,
source_bashrc: bool = False,
):
runners: List[command_runner.CommandRunner],
source: Optional[str],
target: str,
cmd: Optional[str],
run_rsync: bool,
*,
action_message: str,
# Advanced options.
log_path: str = os.devnull,
stream_logs: bool = False,
source_bashrc: bool = False,
num_threads: Optional[int] = None):
"""Runs a command on all nodes and optionally runs rsync from src->dst.
Args:
Expand All @@ -1213,6 +1222,7 @@ def parallel_data_transfer_to_nodes(
log_path: str; Path to the log file
stream_logs: bool; Whether to stream logs to stdout
source_bashrc: bool; Source bashrc before running the command.
num_threads: Optional[int]; Number of threads to use.
"""
style = colorama.Style

Expand Down Expand Up @@ -1253,7 +1263,7 @@ def _sync_node(runner: 'command_runner.CommandRunner') -> None:
message = (f' {style.DIM}{action_message} (to {num_nodes} node{plural})'
f': {origin_source} -> {target}{style.RESET_ALL}')
logger.info(message)
subprocess_utils.run_in_parallel(_sync_node, runners)
subprocess_utils.run_in_parallel(_sync_node, runners, num_threads)


def check_local_gpus() -> bool:
Expand Down
Loading

0 comments on commit a42f8b8

Please sign in to comment.