Commit b33edad: patch tests
asaiacai committed Dec 19, 2024 (parent: 61e0ae9)
Showing 4 changed files with 40 additions and 28 deletions.
5 changes: 0 additions & 5 deletions sky/provision/do/constants.py
@@ -8,8 +8,3 @@
     'gpu-h100x1-80gb': 'gpu-h100x1-base',
     'gpu-h100x8-640gb': 'gpu-h100x8-base',
 }
-
-INSTALL_DOCKER = ('#!/bin/bash\n'
-                  'if ! command -v docker &> /dev/null; then \n'
-                  'sudo apt install -y docker.io \n'
-                  'fi \n')
36 changes: 24 additions & 12 deletions sky/provision/do/utils.py
@@ -61,20 +61,33 @@ def _init_client():
             'no credentials file found from '
             f'the following paths {POSSIBLE_CREDENTIALS_PATHS}')
 
-    auth_contexts = common_utils.read_yaml(CREDENTIALS_PATH)['auth-contexts']
-    for context, api_token in auth_contexts.items():
+    # attempt default context
+    credentials = common_utils.read_yaml(CREDENTIALS_PATH)
+    default_token = credentials.get('access-token', None)
+    if default_token is not None:
         try:
-            test_client = do.pydo.Client(token=api_token)
+            test_client = do.pydo.Client(token=default_token)
             test_client.droplets.list()
-            logger.debug(f'using {context} context')
+            logger.debug('trying `default` context')
             _client = test_client
-            break
+            return _client
         except do.exceptions().HttpResponseError:
-            continue
-    else:
-        raise DigitalOceanError(
-            'no valid api tokens found try '
-            'setting a new API token with `doctl auth init`')
+            pass
+
+    auth_contexts = credentials.get('auth-contexts', None)
+    if auth_contexts is not None:
+        for context, api_token in auth_contexts.items():
+            try:
+                test_client = do.pydo.Client(token=api_token)
+                test_client.droplets.list()
+                logger.debug(f'using {context} context')
+                _client = test_client
+                break
+            except do.exceptions().HttpResponseError:
+                continue
+        else:
+            logger.debug('no valid api tokens found try '
+                         'setting a new API token with `doctl auth init`')
+    return _client
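For reference, the new fallback logic reads the doctl-style credentials YAML and looks up two keys: a top-level `access-token` (the default context) and an `auth-contexts` map of context names to API tokens. A minimal sketch of the parsed structure, assuming placeholder token values (the real file is whatever `doctl auth init` writes to one of POSSIBLE_CREDENTIALS_PATHS):

```python
# Minimal sketch of what common_utils.read_yaml(CREDENTIALS_PATH) is
# expected to return after this change; keys are taken from the diff
# above, token values are placeholders.
credentials = {
    'access-token': 'dop_v1_aaaa...',  # tried first, as the `default` context
    'auth-contexts': {                 # fallback: context name -> API token
        'work': 'dop_v1_bbbb...',
        'personal': 'dop_v1_cccc...',
    },
}
```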


@@ -167,7 +180,7 @@ def create_instance(region: str, cluster_name_on_cloud: str, instance_type: str,
     tags = [f'{key}:{value}' for key, value in tags.items()]
     default_image = constants.GPU_IMAGES.get(
         config.node_config['InstanceType'],
-        'ubuntu-22-04-x64',
+        'gpu-h100x1-base',
     )
     image_id = config.node_config['ImageId']
     image_id = image_id if image_id is not None else default_image
@@ -183,7 +196,6 @@ def create_instance(region: str, cluster_name_on_cloud: str, instance_type: str,
             config.authentication_config['ssh_public_key'])['fingerprint']
         ],
         'tags': tags,
-        'user_data': constants.INSTALL_DOCKER,
     }
     instance = _create_droplet(instance_request)
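Taken together with the constants.py hunk, the image-selection change means an instance type missing from GPU_IMAGES now falls back to 'gpu-h100x1-base' instead of plain 'ubuntu-22-04-x64'. A standalone sketch of that lookup (GPU_IMAGES entries copied from above; `pick_image` is a hypothetical helper, not a function in the patch):

```python
from typing import Optional

# Entries copied from the constants.py hunk above.
GPU_IMAGES = {
    'gpu-h100x1-80gb': 'gpu-h100x1-base',
    'gpu-h100x8-640gb': 'gpu-h100x8-base',
}


def pick_image(instance_type: str, image_id: Optional[str]) -> str:
    """Hypothetical helper mirroring the create_instance() logic."""
    # An explicit ImageId always wins; otherwise fall back to a GPU base
    # image, defaulting to 'gpu-h100x1-base' after this commit.
    default_image = GPU_IMAGES.get(instance_type, 'gpu-h100x1-base')
    return image_id if image_id is not None else default_image
```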
1 change: 1 addition & 0 deletions sky/setup_files/dependencies.py
@@ -127,6 +127,7 @@
     'fluidstack': [],  # No dependencies needed for fluidstack
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
+    'do': ['pydo>=0.3.0', 'azure-core>=1.24.0', 'azure-common'],
     'vsphere': [
         'pyvmomi==8.0.1.0.2',
         # vsphere-automation-sdk is also required, but it does not have
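The `do.pydo.Client(...)` and `do.exceptions()` call sites in utils.py imply a lazy-import adaptor, so pydo and its azure-core transport (both added to the 'do' extra here) are only imported when DigitalOcean is actually used. A hedged sketch of that pattern; this is not necessarily the project's actual adaptor module, just the shape its call sites suggest:

```python
# Hedged sketch of a lazy-import adaptor matching the do.pydo /
# do.exceptions() call sites above.
import importlib


class _DigitalOceanAdaptor:
    @property
    def pydo(self):
        # Deferred import: only fails if DigitalOcean is used without
        # the 'do' extra ('pydo', 'azure-core', 'azure-common') installed.
        return importlib.import_module('pydo')

    def exceptions(self):
        # pydo is built on azure-core, so HTTP errors such as
        # HttpResponseError (caught in utils.py above) come from here.
        return importlib.import_module('azure.core.exceptions')


do = _DigitalOceanAdaptor()

# Usage mirroring utils.py: do.pydo.Client(token=...), and
# do.exceptions().HttpResponseError in an except clause.
```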
26 changes: 15 additions & 11 deletions tests/smoke_tests/test_cluster_job.py
@@ -220,16 +220,18 @@ def test_scp_job_queue():
 @pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
 @pytest.mark.no_oci  # OCI Cloud does not have T4 gpus.
 @pytest.mark.no_kubernetes  # Kubernetes not support num_nodes > 1 yet
-def test_job_queue_multinode(generic_cloud: str):
+@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
+def test_job_queue_multinode(generic_cloud: str, accelerator: Dict[str, str]):
+    accelerator = accelerator.get(generic_cloud, 'T4')
     name = smoke_tests_utils.get_cluster_name()
     total_timeout_minutes = 30 if generic_cloud == 'azure' else 15
     test = smoke_tests_utils.Test(
         'job_queue_multinode',
         [
-            f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml',
-            f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml',
-            f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml',
-            f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml',
+            f'sky launch -y -c {name} --cloud {generic_cloud} --gpus {accelerator} examples/job_queue/cluster_multinode.yaml',
+            f'sky exec {name} -n {name}-1 -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
+            f'sky exec {name} -n {name}-2 -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
+            f'sky launch -c {name} -n {name}-3 --detach-setup -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)',
@@ -238,16 +238,16 @@ def test_job_queue_multinode(generic_cloud: str):
             'sleep 5',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP',
             f'sky cancel -y {name} 1 2 3',
-            f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml',
+            f'sky launch -c {name} -n {name}-4 --detach-setup -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
             # Test the job status is correctly set to SETTING_UP, during the setup is running,
             # and the job can be cancelled during the setup.
             'sleep 5',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)',
             f'sky cancel -y {name} 4',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)',
-            f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
-            f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
-            f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
+            f'sky exec {name} --gpus {accelerator}:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
+            f'sky exec {name} --gpus {accelerator}:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
+            f'sky exec {name} --gpus {accelerator}:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
             f'sky logs {name} 5 --status',
             f'sky logs {name} 6 --status',
             f'sky logs {name} 7 --status',
@@ -1238,12 +1240,14 @@ def test_cancel_azure():
 @pytest.mark.no_ibm  # IBM cloud currently doesn't provide public image with CUDA
 @pytest.mark.no_paperspace  # Paperspace has `gnome-shell` on nvidia-smi
 @pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
-def test_cancel_pytorch(generic_cloud: str):
+@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
+def test_cancel_pytorch(generic_cloud: str, accelerator: Dict[str, str]):
+    accelerator = accelerator.get(generic_cloud, 'T4')
     name = smoke_tests_utils.get_cluster_name()
     test = smoke_tests_utils.Test(
         'cancel-pytorch',
         [
-            f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d',
+            f'sky launch -c {name} --cloud {generic_cloud} --gpus {accelerator} examples/resnet_distributed_torch.yaml -y -d',
             # Wait the GPU process to start.
             'sleep 90',
             f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || '
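Both test changes follow the same pattern: parametrize the test with a per-cloud accelerator mapping, then resolve it to a concrete GPU with a T4 default (DigitalOcean maps to H100, consistent with the H100-only GPU_IMAGES table above). A self-contained sketch of that pattern, with a stand-in fixture replacing the suite's real `generic_cloud` fixture:

```python
# Self-contained sketch of the per-cloud accelerator override pattern
# used above; the fixture here stands in for the smoke-test suite's
# real generic_cloud fixture.
from typing import Dict

import pytest


@pytest.fixture(params=['aws', 'do'])
def generic_cloud(request) -> str:
    return request.param


@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
def test_accelerator_override(generic_cloud: str, accelerator: Dict[str, str]):
    # Clouds listed in the mapping get an override; everything else
    # falls back to the T4 default used by the smoke tests.
    gpu = accelerator.get(generic_cloud, 'T4')
    assert gpu == ('H100' if generic_cloud == 'do' else 'T4')
```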
