Skip to content

Commit

Permalink
merge resolve
Browse files Browse the repository at this point in the history
  • Loading branch information
kristopolous committed Dec 12, 2024
1 parent 84441f3 commit 3010706
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 223 deletions.
137 changes: 12 additions & 125 deletions sky/setup_files/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,28 @@
import os
import platform
import re
import runpy
import subprocess
import sys
from typing import Dict, List

import setuptools

# __file__ is setup.py at the root of the repo. We shouldn't assume it's a
# symlink - e.g. in the sdist it's resolved to a normal file.
ROOT_DIR = os.path.dirname(__file__)
DEPENDENCIES_FILE_PATH = os.path.join(ROOT_DIR, 'sky', 'setup_files',
'dependencies.py')
INIT_FILE_PATH = os.path.join(ROOT_DIR, 'sky', '__init__.py')
_COMMIT_FAILURE_MESSAGE = (
'WARNING: SkyPilot fail to {verb} the commit hash in '
f'{INIT_FILE_PATH!r} (SkyPilot can still be normally used): '
'{error}')

# setuptools does not include the script dir on the search path, so we can't
# just do `import dependencies`. Instead, use runpy to manually load it. Note:
# dependencies here is a dict, not a module, so we access it by subscripting.
dependencies = runpy.run_path(DEPENDENCIES_FILE_PATH)

# Holds the original text of sky/__init__.py; presumably saved before the
# build stamps a commit hash into the file so it can be restored afterwards —
# TODO(review): confirm against the replace/restore helpers (not shown here).
original_init_content = None

# Platform name ('Linux', 'Darwin', 'Windows', ...) used for OS-specific
# handling further down in this file (not visible in this view).
system = platform.system()
Expand Down Expand Up @@ -130,128 +139,6 @@ def parse_readme(readme: str) -> str:
return readme


install_requires = [
'wheel',
'cachetools',
# NOTE: ray requires click>=7.0.
'click >= 7.0',
'colorama',
'cryptography',
# Jinja has a bug in older versions because of the lack of pinning
# the version of the underlying markupsafe package. See:
# https://github.com/pallets/jinja/issues/1585
'jinja2 >= 3.0',
'jsonschema',
'networkx',
'pandas>=1.3.0',
'pendulum',
# PrettyTable with version >=2.0.0 is required for the support of
# `add_rows` method.
'PrettyTable >= 2.0.0',
'python-dotenv',
'rich',
'tabulate',
# Light weight requirement, can be replaced with "typing" once
# we deprecate Python 3.7 (this will take a while).
'typing_extensions',
'filelock >= 3.6.0',
'packaging',
'psutil',
'pulp',
# Cython 3.0 release breaks PyYAML 5.4.* (https://github.com/yaml/pyyaml/issues/601)
# <= 3.13 may encounter https://github.com/ultralytics/yolov5/issues/414
'pyyaml > 3.13, != 5.4.*',
'requests',
]

local_ray = [
# Lower version of ray will cause dependency conflict for
# click/grpcio/protobuf.
# Excluded 2.6.0 as it has a bug in the cluster launcher:
# https://github.com/ray-project/ray/releases/tag/ray-2.6.1
'ray[default] >= 2.2.0, != 2.6.0',
]

remote = [
# Adopted from ray's setup.py: https://github.com/ray-project/ray/blob/ray-2.4.0/python/setup.py
# SkyPilot: != 1.48.0 is required to avoid the error where ray dashboard fails to start when
# ray start is called (#2054).
# Tracking issue: https://github.com/ray-project/ray/issues/30984
"grpcio >= 1.32.0, <= 1.49.1, != 1.48.0; python_version < '3.10' and sys_platform == 'darwin'", # noqa:E501
"grpcio >= 1.42.0, <= 1.49.1, != 1.48.0; python_version >= '3.10' and sys_platform == 'darwin'", # noqa:E501
# Original issue: https://github.com/ray-project/ray/issues/33833
"grpcio >= 1.32.0, <= 1.51.3, != 1.48.0; python_version < '3.10' and sys_platform != 'darwin'", # noqa:E501
"grpcio >= 1.42.0, <= 1.51.3, != 1.48.0; python_version >= '3.10' and sys_platform != 'darwin'", # noqa:E501
# Adopted from ray's setup.py:
# https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L343
'protobuf >= 3.15.3, != 3.19.5',
# Some pydantic versions are not compatible with ray. Adopted from ray's
# setup.py: https://github.com/ray-project/ray/blob/ray-2.9.3/python/setup.py#L254
'pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3',
]

# NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
# following packages dependencies are changed.
aws_dependencies = [
# botocore does not work with urllib3>=2.0.0, according to https://github.com/boto/botocore/issues/2926
# We have to explicitly pin the version to optimize the time for
# poetry install. See https://github.com/orgs/python-poetry/discussions/7937
'urllib3<2',
# NOTE: this installs CLI V1. To use AWS SSO (e.g., `aws sso login`), users
# should instead use CLI V2 which is not pip-installable. See
# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html.
'awscli>=1.27.10',
'botocore>=1.29.10',
'boto3>=1.26.1',
# NOTE: required by awscli. To avoid ray automatically installing
# the latest version.
'colorama < 0.4.5',
]

extras_require: Dict[str, List[str]] = {
'aws': aws_dependencies,
# TODO(zongheng): azure-cli is huge and takes a long time to install.
# Tracked in: https://github.com/Azure/azure-cli/issues/7387
# azure-identity is needed in node_provider.
# We need azure-identity>=1.13.0 to enable the customization of the
# timeout of AzureCliCredential.
'azure': [
'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
'azure-storage-blob>=12.23.1', 'msgraph-sdk'
] + local_ray,
# We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
# parameter for stopping instances.
# Reference: https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
'gcp': ['google-api-python-client>=2.69.0', 'google-cloud-storage'],
'ibm': [
'ibm-cloud-sdk-core', 'ibm-vpc', 'ibm-platform-services', 'ibm-cos-sdk'
] + local_ray,
'docker': ['docker'] + local_ray,
'lambda': local_ray,
'cloudflare': aws_dependencies,
'scp': local_ray,
'oci': ['oci'] + local_ray,
'kubernetes': ['kubernetes>=20.0.0'],
'remote': remote,
'runpod': ['runpod>=1.5.1'],
'fluidstack': [], # No dependencies needed for fluidstack
'cudo': ['cudo-compute>=0.1.10'],
'paperspace': [], # No dependencies needed for paperspace
'vast': ['vastai_sdk>=0.1.2'],
'vsphere': [
'pyvmomi==8.0.1.0.2',
# vsphere-automation-sdk is also required, but it does not have
# pypi release, which cause failure of our pypi release.
# https://peps.python.org/pep-0440/#direct-references
# We have the instruction for its installation in our
# docs instead.
# 'vsphere-automation-sdk @ git+https://github.com/vmware/[email protected]'
],
}

extras_require['all'] = sum(extras_require.values(), [])

long_description = ''
readme_filepath = 'README.md'
# When sky/backends/wheel_utils.py builds wheels, it will not contain the
Expand All @@ -278,8 +165,8 @@ def parse_readme(readme: str) -> str:
long_description_content_type='text/markdown',
setup_requires=['wheel'],
requires_python='>=3.7',
install_requires=install_requires,
extras_require=extras_require,
install_requires=dependencies['install_requires'],
extras_require=dependencies['extras_require'],
entry_points={
'console_scripts': ['sky = sky.cli:cli'],
},
Expand Down
158 changes: 60 additions & 98 deletions sky/utils/controller_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
from sky.jobs import utils as managed_job_utils
from sky.serve import constants as serve_constants
from sky.serve import serve_utils
from sky.setup_files import dependencies
from sky.skylet import constants
from sky.skylet import log_lib
from sky.utils import common_utils
from sky.utils import env_options
from sky.utils import rich_utils
Expand Down Expand Up @@ -187,79 +189,49 @@ def from_type(cls, controller_type: str) -> Optional['Controllers']:

# Install cli dependencies. Not using SkyPilot wheels because the wheel
# can be cleaned up by another process.
# TODO(zhwu): Keep the dependencies align with the ones in setup.py
def _get_cloud_dependencies_installation_commands(
controller: Controllers) -> List[str]:
# TODO(tian): Make dependency installation command a method of cloud
# class and get all installation command for enabled clouds.
commands = []
# We use <step>/<total> instead of strong formatting, as we need to update
# the <total> at the end of the for loop, and python does not support
# partial string formatting.
prefix_str = ('[<step>/<total>] Check & install cloud dependencies '
'on controller: ')
commands: List[str] = []
# This is to make sure the shorter checking message does not have junk
# characters from the previous message.
empty_str = ' ' * 10
aws_dependencies_installation = (
'pip list | grep boto3 > /dev/null 2>&1 || pip install '
'botocore>=1.29.10 boto3>=1.26.1; '
# Need to separate the installation of awscli from above because some
# other clouds will install boto3 but not awscli.
'pip list | grep awscli> /dev/null 2>&1 || pip install "urllib3<2" '
'awscli>=1.27.10 "colorama<0.4.5" > /dev/null 2>&1')
setup_clouds: List[str] = []
empty_str = ' ' * 20

# All python dependencies will be accumulated and then installed in one
# command at the end. This is very fast if the packages are already
# installed, so we don't check that.
python_packages: Set[str] = set()

step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
commands.append(f'echo -en "\\r{step_prefix}uv{empty_str}" &&'
f'{constants.SKY_UV_INSTALL_CMD} >/dev/null 2>&1')

for cloud in sky_check.get_cached_enabled_clouds_or_refresh():
if isinstance(
clouds,
(clouds.Lambda, clouds.SCP, clouds.Fluidstack, clouds.Paperspace)):
# no need to install any cloud dependencies for lambda, scp,
# fluidstack and paperspace
continue
if isinstance(cloud, clouds.AWS):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
commands.append(f'echo -en "\\r{step_prefix}AWS{empty_str}" && ' +
aws_dependencies_installation)
setup_clouds.append(str(cloud))
elif isinstance(cloud, clouds.Azure):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
commands.append(
f'echo -en "\\r{step_prefix}Azure{empty_str}" && '
'pip list | grep azure-cli > /dev/null 2>&1 || '
'pip install "azure-cli>=2.31.0" azure-core '
'"azure-identity>=1.13.0" azure-mgmt-network > /dev/null 2>&1')
# Have to separate this installation of az blob storage from above
# because this is newly-introduced and not part of azure-cli. We
# need a separate installed check for this.
cloud_python_dependencies: List[str] = copy.deepcopy(
dependencies.extras_require[cloud.canonical_name()])

if isinstance(cloud, clouds.Azure):
# azure-cli cannot be normally installed by uv.
# See comments in sky/skylet/constants.py.
cloud_python_dependencies.remove(dependencies.AZURE_CLI)

step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
commands.append(
'pip list | grep azure-storage-blob > /dev/null 2>&1 || '
'pip install azure-storage-blob msgraph-sdk > /dev/null 2>&1')
setup_clouds.append(str(cloud))
f'echo -en "\\r{step_prefix}azure-cli{empty_str}" &&'
f'{constants.SKY_UV_PIP_CMD} install --prerelease=allow '
f'"{dependencies.AZURE_CLI}" > /dev/null 2>&1')
elif isinstance(cloud, clouds.GCP):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
commands.append(
f'echo -en "\\r{step_prefix}GCP{empty_str}" && '
'pip list | grep google-api-python-client > /dev/null 2>&1 || '
'pip install "google-api-python-client>=2.69.0" '
'> /dev/null 2>&1')
# Have to separate the installation of google-cloud-storage from
# above because for a VM launched on GCP, the VM may have
# google-api-python-client installed alone.
commands.append(
'pip list | grep google-cloud-storage > /dev/null 2>&1 || '
'pip install google-cloud-storage > /dev/null 2>&1')
commands.append(f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
setup_clouds.append(str(cloud))
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
commands.append(f'echo -en "\\r{step_prefix}GCP SDK{empty_str}" &&'
f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
elif isinstance(cloud, clouds.Kubernetes):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
commands.append(
f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
'pip list | grep kubernetes > /dev/null 2>&1 || '
'pip install "kubernetes>=20.0.0" > /dev/null 2>&1 &&'
# Install k8s + skypilot dependencies
'sudo bash -c "if '
'! command -v curl &> /dev/null || '
Expand All @@ -275,61 +247,43 @@ def _get_cloud_dependencies_installation_commands(
'/bin/linux/amd64/kubectl" && '
'sudo install -o root -g root -m 0755 '
'kubectl /usr/local/bin/kubectl))')
setup_clouds.append(str(cloud))
elif isinstance(cloud, clouds.Cudo):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
commands.append(
f'echo -en "\\r{step_prefix}Cudo{empty_str}" && '
'pip list | grep cudo-compute > /dev/null 2>&1 || '
'pip install "cudo-compute>=0.1.10" > /dev/null 2>&1 && '
f'echo -en "\\r{step_prefix}cudoctl{empty_str}" && '
'wget https://download.cudo.org/compute/cudoctl-0.3.2-amd64.deb -O ~/cudoctl.deb > /dev/null 2>&1 && ' # pylint: disable=line-too-long
'sudo dpkg -i ~/cudoctl.deb > /dev/null 2>&1')
setup_clouds.append(str(cloud))
elif isinstance(cloud, clouds.RunPod):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
commands.append(f'echo -en "\\r{step_prefix}RunPod{empty_str}" && '
'pip list | grep runpod > /dev/null 2>&1 || '
'pip install "runpod>=1.5.1" > /dev/null 2>&1')
setup_clouds.append(str(cloud))
elif isinstance(cloud, clouds.OCI):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
commands.append(f'echo -en "\\r{prefix_str}OCI{empty_str}" && '
'pip list | grep oci > /dev/null 2>&1 || '
'pip install oci > /dev/null 2>&1')
setup_clouds.append(str(cloud))
elif isinstance(cloud, clouds.Vast):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
commands.append(f'echo -en "\\r{step_prefix}Vast{empty_str}" && '
'pip list | grep vastai_sdk > /dev/null 2>&1 || '
'pip install "vastai_sdk>=0.1.2" > /dev/null 2>&1')
setup_clouds.append(str(cloud))
if controller == Controllers.JOBS_CONTROLLER:
if isinstance(cloud, clouds.IBM):
step_prefix = prefix_str.replace('<step>',
str(len(setup_clouds) + 1))
commands.append(
f'echo -en "\\r{step_prefix}IBM{empty_str}" '
'&& pip list | grep ibm-cloud-sdk-core > /dev/null 2>&1 || '
'pip install ibm-cloud-sdk-core ibm-vpc '
'ibm-platform-services ibm-cos-sdk > /dev/null 2>&1')
setup_clouds.append(str(cloud))
elif isinstance(cloud, clouds.IBM):
if controller != Controllers.JOBS_CONTROLLER:
# We only need IBM deps on the jobs controller.
cloud_python_dependencies = []

python_packages.update(cloud_python_dependencies)

if (cloudflare.NAME
in storage_lib.get_cached_enabled_storage_clouds_or_refresh()):
step_prefix = prefix_str.replace('<step>', str(len(setup_clouds) + 1))
commands.append(
f'echo -en "\\r{step_prefix}Cloudflare{empty_str}" && ' +
aws_dependencies_installation)
setup_clouds.append(cloudflare.NAME)
python_packages.update(dependencies.extras_require['cloudflare'])

packages_string = ' '.join([f'"{package}"' for package in python_packages])
step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
commands.append(
f'echo -en "\\r{step_prefix}cloud python packages{empty_str}" && '
f'{constants.SKY_UV_PIP_CMD} install {packages_string} > /dev/null 2>&1'
)

total_commands = len(commands)
finish_prefix = prefix_str.replace('[<step>/<total>] ', ' ')
commands.append(f'echo -e "\\r{finish_prefix}done.{empty_str}"')

commands = [
command.replace('<total>', str(len(setup_clouds)))
for command in commands
command.replace('<total>', str(total_commands)) for command in commands
]
return commands

Expand Down Expand Up @@ -387,11 +341,19 @@ def download_and_stream_latest_job_log(
else:
log_dir = list(log_dirs.values())[0]
log_file = os.path.join(log_dir, 'run.log')

# Print the logs to the console.
# TODO(zhwu): refactor this into log_utils, along with the
# refactoring for the log_lib.tail_logs.
try:
with open(log_file, 'r', encoding='utf-8') as f:
print(f.read())
# Stream the logs to the console without reading the whole
# file into memory.
start_streaming = False
for line in f:
if log_lib.LOG_FILE_START_STREAMING_AT in line:
start_streaming = True
if start_streaming:
print(line, end='', flush=True)
except FileNotFoundError:
logger.error('Failed to find the logs for the user '
f'program at {log_file}.')
Expand Down

0 comments on commit 3010706

Please sign in to comment.