diff --git a/sky/provision/do/constants.py b/sky/provision/do/constants.py
index 3f15e02f842..0010646f873 100644
--- a/sky/provision/do/constants.py
+++ b/sky/provision/do/constants.py
@@ -8,8 +8,3 @@
     'gpu-h100x1-80gb': 'gpu-h100x1-base',
     'gpu-h100x8-640gb': 'gpu-h100x8-base',
 }
-
-INSTALL_DOCKER = ('#!/bin/bash\n'
-                  'if ! command -v docker &> /dev/null; then \n'
-                  'sudo apt install -y docker.io \n'
-                  'fi \n')
diff --git a/sky/provision/do/utils.py b/sky/provision/do/utils.py
index 5ffb6b24390..c489fef670f 100644
--- a/sky/provision/do/utils.py
+++ b/sky/provision/do/utils.py
@@ -61,20 +61,33 @@ def _init_client():
             'no credentials file found from '
             f'the following paths {POSSIBLE_CREDENTIALS_PATHS}')
 
-    auth_contexts = common_utils.read_yaml(CREDENTIALS_PATH)['auth-contexts']
-    for context, api_token in auth_contexts.items():
+    # Attempt the default context first.
+    credentials = common_utils.read_yaml(CREDENTIALS_PATH)
+    default_token = credentials.get('access-token', None)
+    if default_token is not None:
         try:
-            test_client = do.pydo.Client(token=api_token)
+            test_client = do.pydo.Client(token=default_token)
             test_client.droplets.list()
-            logger.debug(f'using {context} context')
+            logger.debug('using the default context')
             _client = test_client
-            break
+            return _client
         except do.exceptions().HttpResponseError:
-            continue
-    else:
-        raise DigitalOceanError(
-            'no valid api tokens found try '
-            'setting a new API token with `doctl auth init`')
+            pass
+
+    auth_contexts = credentials.get('auth-contexts', None)
+    if auth_contexts is not None:
+        for context, api_token in auth_contexts.items():
+            try:
+                test_client = do.pydo.Client(token=api_token)
+                test_client.droplets.list()
+                logger.debug(f'using {context} context')
+                _client = test_client
+                break
+            except do.exceptions().HttpResponseError:
+                continue
+        else:
+            logger.debug('no valid api tokens found; try '
+                         'setting a new API token with `doctl auth init`')
 
     return _client
@@ -167,7 +180,7 @@ def create_instance(region: str, cluster_name_on_cloud: str, instance_type: str,
     tags = [f'{key}:{value}' for key, value in tags.items()]
     default_image = constants.GPU_IMAGES.get(
         config.node_config['InstanceType'],
-        'ubuntu-22-04-x64',
+        'gpu-h100x1-base',
     )
     image_id = config.node_config['ImageId']
     image_id = image_id if image_id is not None else default_image
@@ -183,7 +196,6 @@ def create_instance(region: str, cluster_name_on_cloud: str, instance_type: str,
             config.authentication_config['ssh_public_key'])['fingerprint']
         ],
         'tags': tags,
-        'user_data': constants.INSTALL_DOCKER,
     }
 
     instance = _create_droplet(instance_request)
diff --git a/sky/setup_files/dependencies.py b/sky/setup_files/dependencies.py
index 18d2f5cdc08..16590a9fd0d 100644
--- a/sky/setup_files/dependencies.py
+++ b/sky/setup_files/dependencies.py
@@ -127,6 +127,7 @@
     'fluidstack': [],  # No dependencies needed for fluidstack
     'cudo': ['cudo-compute>=0.1.10'],
     'paperspace': [],  # No dependencies needed for paperspace
+    'do': ['pydo>=0.3.0', 'azure-core>=1.24.0', 'azure-common'],
    'vsphere': [
         'pyvmomi==8.0.1.0.2',
         # vsphere-automation-sdk is also required, but it does not have
diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py
index ac0b599f626..baf2ee1fae9 100644
--- a/tests/smoke_tests/test_cluster_job.py
+++ b/tests/smoke_tests/test_cluster_job.py
@@ -220,16 +220,18 @@ def test_scp_job_queue():
 @pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
 @pytest.mark.no_oci  # OCI Cloud does not have T4 gpus.
 @pytest.mark.no_kubernetes  # Kubernetes not support num_nodes > 1 yet
-def test_job_queue_multinode(generic_cloud: str):
+@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
+def test_job_queue_multinode(generic_cloud: str, accelerator: Dict[str, str]):
+    accelerator = accelerator.get(generic_cloud, 'T4')
     name = smoke_tests_utils.get_cluster_name()
     total_timeout_minutes = 30 if generic_cloud == 'azure' else 15
     test = smoke_tests_utils.Test(
         'job_queue_multinode',
         [
-            f'sky launch -y -c {name} --cloud {generic_cloud} examples/job_queue/cluster_multinode.yaml',
-            f'sky exec {name} -n {name}-1 -d examples/job_queue/job_multinode.yaml',
-            f'sky exec {name} -n {name}-2 -d examples/job_queue/job_multinode.yaml',
-            f'sky launch -c {name} -n {name}-3 --detach-setup -d examples/job_queue/job_multinode.yaml',
+            f'sky launch -y -c {name} --cloud {generic_cloud} --gpus {accelerator} examples/job_queue/cluster_multinode.yaml',
+            f'sky exec {name} -n {name}-1 -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
+            f'sky exec {name} -n {name}-2 -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
+            f'sky launch -c {name} -n {name}-3 --detach-setup -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-1 | grep RUNNING)',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-2 | grep RUNNING)',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-3 | grep PENDING)',
@@ -238,16 +240,16 @@ def test_job_queue_multinode(generic_cloud: str):
             'sleep 5',
             f's=$(sky queue {name}); echo "$s"; echo; echo; echo "$s" | grep {name}-3 | grep SETTING_UP',
             f'sky cancel -y {name} 1 2 3',
-            f'sky launch -c {name} -n {name}-4 --detach-setup -d examples/job_queue/job_multinode.yaml',
+            f'sky launch -c {name} -n {name}-4 --detach-setup -d --gpus {accelerator} examples/job_queue/job_multinode.yaml',
             # Test the job status is correctly set to SETTING_UP, during the setup is running,
             # and the job can be cancelled during the setup.
             'sleep 5',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep SETTING_UP)',
             f'sky cancel -y {name} 4',
             f's=$(sky queue {name}) && echo "$s" && (echo "$s" | grep {name}-4 | grep CANCELLED)',
-            f'sky exec {name} --gpus T4:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
-            f'sky exec {name} --gpus T4:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
-            f'sky exec {name} --gpus T4:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
+            f'sky exec {name} --gpus {accelerator}:0.2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
+            f'sky exec {name} --gpus {accelerator}:0.2 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
+            f'sky exec {name} --gpus {accelerator}:1 --num-nodes 2 "[[ \$SKYPILOT_NUM_GPUS_PER_NODE -eq 1 ]] || exit 1"',
             f'sky logs {name} 5 --status',
             f'sky logs {name} 6 --status',
             f'sky logs {name} 7 --status',
@@ -1238,12 +1240,14 @@ def test_cancel_azure():
 @pytest.mark.no_ibm  # IBM cloud currently doesn't provide public image with CUDA
 @pytest.mark.no_paperspace  # Paperspace has `gnome-shell` on nvidia-smi
 @pytest.mark.no_scp  # SCP does not support num_nodes > 1 yet
-def test_cancel_pytorch(generic_cloud: str):
+@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
+def test_cancel_pytorch(generic_cloud: str, accelerator: Dict[str, str]):
+    accelerator = accelerator.get(generic_cloud, 'T4')
     name = smoke_tests_utils.get_cluster_name()
     test = smoke_tests_utils.Test(
         'cancel-pytorch',
         [
-            f'sky launch -c {name} --cloud {generic_cloud} examples/resnet_distributed_torch.yaml -y -d',
+            f'sky launch -c {name} --cloud {generic_cloud} --gpus {accelerator} examples/resnet_distributed_torch.yaml -y -d',
             # Wait the GPU process to start.
             'sleep 90',
             f'sky exec {name} --num-nodes 2 "(nvidia-smi | grep python) || '
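
For reference, a minimal sketch of the credentials layout the new `_init_client()` lookup assumes: a doctl-style config file with a top-level `access-token` and an `auth-contexts` map, matching the two `credentials.get(...)` reads in the utils.py hunk above. The token values and context names are placeholders, not real doctl output.

    # Hypothetical dict returned by common_utils.read_yaml(CREDENTIALS_PATH)
    # for a doctl-style config file (placeholder tokens).
    credentials = {
        # Tried first; on HttpResponseError the code falls through.
        'access-token': 'dop_v1_0000000000000000',
        # Tried in order only when the default token is missing or invalid.
        'auth-contexts': {
            'team': 'dop_v1_1111111111111111',
            'personal': 'dop_v1_2222222222222222',
        },
    }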