From c19e94e35c8c386d8ee7111529210ea0ac650883 Mon Sep 17 00:00:00 2001 From: zpoint Date: Thu, 19 Dec 2024 10:07:50 +0800 Subject: [PATCH 1/3] Add tests for Azure spot instance (#4475) * verify azure spot instance * string style * echo * echo vm detail * bug fix * remove comment --- tests/smoke_tests/test_cluster_job.py | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py index 0255884ae30..18b82c649e7 100644 --- a/tests/smoke_tests/test_cluster_job.py +++ b/tests/smoke_tests/test_cluster_job.py @@ -1299,6 +1299,34 @@ def test_use_spot(generic_cloud: str): smoke_tests_utils.run_one_test(test) +@pytest.mark.azure +def test_azure_spot_instance_verification(): + """Test Azure spot instance provisioning with explicit verification. + This test verifies that when --use-spot is specified for Azure: + 1. The cluster launches successfully + 2. The instances are actually provisioned as spot instances + """ + name = smoke_tests_utils.get_cluster_name() + test = smoke_tests_utils.Test( + 'azure-spot-verification', + [ + f'sky launch -c {name} --cloud azure tests/test_yamls/minimal.yaml --use-spot -y', + f'sky logs {name} 1 --status', f'TARGET_VM_NAME="{name}"; ' + 'VM_INFO=$(az vm list --query "[?contains(name, \'$TARGET_VM_NAME\')].{Name:name, ResourceGroup:resourceGroup}" -o tsv); ' + '[[ -z "$VM_INFO" ]] && exit 1; ' + 'FULL_VM_NAME=$(echo "$VM_INFO" | awk \'{print $1}\'); ' + 'RESOURCE_GROUP=$(echo "$VM_INFO" | awk \'{print $2}\'); ' + 'VM_DETAILS=$(az vm list --resource-group "$RESOURCE_GROUP" ' + '--query "[?name==\'$FULL_VM_NAME\'].{Name:name, Location:location, Priority:priority}" -o table); ' + '[[ -z "$VM_DETAILS" ]] && exit 1; ' + 'echo "VM Details:"; echo "$VM_DETAILS"; ' + 'echo "$VM_DETAILS" | grep -qw "Spot" && exit 0 || exit 1' + ], + f'sky down -y {name}', + ) + smoke_tests_utils.run_one_test(test) + + @pytest.mark.gcp def test_stop_gcp_spot(): """Test GCP spot can be stopped, autostopped, restarted.""" From 745cf598c5b5f7e5cf035c462e93ff1a254bb107 Mon Sep 17 00:00:00 2001 From: zpoint Date: Thu, 19 Dec 2024 12:29:33 +0800 Subject: [PATCH 2/3] rename pre-merge test to quicktest-core (#4486) * rename to test core * rename file --- .buildkite/generate_pipeline.py | 14 +++++++------- ...est_pre_merge.py => test_quick_tests_core.py} | 16 ++++++++++------ ...e.yaml => minimal_test_quick_tests_core.yaml} | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) rename tests/smoke_tests/{test_pre_merge.py => test_quick_tests_core.py} (63%) rename tests/test_yamls/{minimal_test_pre_merge.yaml => minimal_test_quick_tests_core.yaml} (62%) diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py index 2b0f1cec788..99f29ee258a 100644 --- a/.buildkite/generate_pipeline.py +++ b/.buildkite/generate_pipeline.py @@ -5,7 +5,7 @@ tests/smoke_tests ├── test_*.py -> release pipeline -├── test_pre_merge.py -> pre-merge pipeline +├── test_quick_tests_core.py -> run quick tests on PR before merging run `PYTHONPATH=$(pwd)/tests:$PYTHONPATH python .buildkite/generate_pipeline.py` to generate the pipeline for testing. The CI will run this script as a pre-step, @@ -208,8 +208,8 @@ def _convert_release(test_files: List[str]): extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP}) -def _convert_pre_merge(test_files: List[str]): - yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml' +def _convert_quick_tests_core(test_files: List[str]): + yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml' output_file_pipelines = [] for test_file in test_files: print(f'Converting {test_file} to {yaml_file_path}') @@ -234,18 +234,18 @@ def _convert_pre_merge(test_files: List[str]): def main(): test_files = os.listdir('tests/smoke_tests') release_files = [] - pre_merge_files = [] + quick_tests_core_files = [] for test_file in test_files: if not test_file.startswith('test_'): continue test_file_path = os.path.join('tests/smoke_tests', test_file) - if "test_pre_merge" in test_file: - pre_merge_files.append(test_file_path) + if "test_quick_tests_core" in test_file: + quick_tests_core_files.append(test_file_path) else: release_files.append(test_file_path) _convert_release(release_files) - _convert_pre_merge(pre_merge_files) + _convert_quick_tests_core(quick_tests_core_files) if __name__ == '__main__': diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_quick_tests_core.py similarity index 63% rename from tests/smoke_tests/test_pre_merge.py rename to tests/smoke_tests/test_quick_tests_core.py index 4890ac15ce4..48df4ef9a2b 100644 --- a/tests/smoke_tests/test_pre_merge.py +++ b/tests/smoke_tests/test_quick_tests_core.py @@ -1,23 +1,27 @@ # Smoke tests for SkyPilot required before merging +# If the change includes an interface modification or touches the core API, +# the reviewer could decide it’s necessary to trigger a pre-merge test and +# leave a comment /quicktest-core will then trigger this test. +# # Default options are set in pyproject.toml # Example usage: # Run all tests except for AWS and Lambda Cloud -# > pytest tests/smoke_tests/test_pre_merge.py +# > pytest tests/smoke_tests/test_quick_tests_core.py # # Terminate failed clusters after test finishes -# > pytest tests/smoke_tests/test_pre_merge.py --terminate-on-failure +# > pytest tests/smoke_tests/test_quick_tests_core.py --terminate-on-failure # # Re-run last failed tests # > pytest --lf # # Run one of the smoke tests -# > pytest tests/smoke_tests/test_pre_merge.py::test_yaml_launch_and_mount +# > pytest tests/smoke_tests/test_quick_tests_core.py::test_yaml_launch_and_mount # # Only run test for AWS + generic tests -# > pytest tests/smoke_tests/test_pre_merge.py --aws +# > pytest tests/smoke_tests/test_quick_tests_core.py --aws # # Change cloud for generic tests to aws -# > pytest tests/smoke_tests/test_pre_merge.py --generic-cloud aws +# > pytest tests/smoke_tests/test_quick_tests_core.py --generic-cloud aws from smoke_tests import smoke_tests_utils @@ -29,7 +33,7 @@ def test_yaml_launch_and_mount(generic_cloud: str): test = smoke_tests_utils.Test( 'test_yaml_launch_and_mount', [ - f'sky launch -y -c {name} tests/test_yamls/minimal_test_pre_merge.yaml', + f'sky launch -y -c {name} tests/test_yamls/minimal_test_quick_tests_core.yaml', smoke_tests_utils. get_cmd_wait_until_job_status_contains_matching_job_id( cluster_name=name, diff --git a/tests/test_yamls/minimal_test_pre_merge.yaml b/tests/test_yamls/minimal_test_quick_tests_core.yaml similarity index 62% rename from tests/test_yamls/minimal_test_pre_merge.yaml rename to tests/test_yamls/minimal_test_quick_tests_core.yaml index 583575bee5c..15857e972dd 100644 --- a/tests/test_yamls/minimal_test_pre_merge.yaml +++ b/tests/test_yamls/minimal_test_quick_tests_core.yaml @@ -10,4 +10,4 @@ workdir: . num_nodes: 1 run: | - ls -l ~/aws/tests/test_yamls/minimal_test_pre_merge.yaml + ls -l ~/aws/tests/test_yamls/minimal_test_quick_tests_core.yaml From a73a9cb37152c5f05d06bd1afe29a685ef86f85c Mon Sep 17 00:00:00 2001 From: Lei Date: Thu, 19 Dec 2024 14:24:22 +0800 Subject: [PATCH 3/3] [k8s] support to use custom gpu resource name if it's not nvidia.com/gpu (#4337) * [k8s] support to use custom gpu resource name if it's not nvidia.com/gpu Signed-off-by: nkwangleiGIT * fix format issue Signed-off-by: nkwangleiGIT --------- Signed-off-by: nkwangleiGIT --- sky/clouds/kubernetes.py | 2 +- sky/provision/kubernetes/instance.py | 12 +++++++----- sky/provision/kubernetes/utils.py | 24 ++++++++++++++++++++---- sky/utils/kubernetes/gpu_labeler.py | 4 ++-- 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 471639626eb..65b50042aba 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -395,7 +395,7 @@ def make_deploy_resources_variables( tpu_requested = True k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY else: - k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY + k8s_resource_key = kubernetes_utils.get_gpu_resource_key() port_mode = network_utils.get_port_mode(None) diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py index 0682b105a4f..c431b023ab9 100644 --- a/sky/provision/kubernetes/instance.py +++ b/sky/provision/kubernetes/instance.py @@ -180,6 +180,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): # case we will need to update this logic. # TODO(Doyoung): Update the error message raised # with the multi-host TPU support. + gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long if 'Insufficient google.com/tpu' in event_message: extra_msg = ( f'Verify if ' @@ -192,14 +193,15 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes): pod, extra_msg, details=event_message)) - elif (('Insufficient nvidia.com/gpu' + elif ((f'Insufficient {gpu_resource_key}' in event_message) or ('didn\'t match Pod\'s node affinity/selector' in event_message)): extra_msg = ( - f'Verify if ' - f'{pod.spec.node_selector[label_key]}' - ' is available in the cluster.') + f'Verify if any node matching label ' + f'{pod.spec.node_selector[label_key]} and ' + f'sufficient resource {gpu_resource_key} ' + f'is available in the cluster.') raise config_lib.KubernetesError( _lack_resource_msg('GPU', pod, @@ -728,7 +730,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str, limits = pod_spec['spec']['containers'][0].get('resources', {}).get('limits') if limits is not None: - needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0 + needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0 # TPU pods provisioned on GKE use the default containerd runtime. # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 5a7db7e91d6..5150cc5860b 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -438,7 +438,7 @@ def detect_accelerator_resource( nodes = get_kubernetes_nodes(context) for node in nodes: cluster_resources.update(node.status.allocatable.keys()) - has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or + has_accelerator = (get_gpu_resource_key() in cluster_resources or TPU_RESOURCE_KEY in cluster_resources) return has_accelerator, cluster_resources @@ -2253,10 +2253,11 @@ def get_node_accelerator_count(attribute_dict: dict) -> int: Number of accelerators allocated or available from the node. If no resource is found, it returns 0. """ - assert not (GPU_RESOURCE_KEY in attribute_dict and + gpu_resource_name = get_gpu_resource_key() + assert not (gpu_resource_name in attribute_dict and TPU_RESOURCE_KEY in attribute_dict) - if GPU_RESOURCE_KEY in attribute_dict: - return int(attribute_dict[GPU_RESOURCE_KEY]) + if gpu_resource_name in attribute_dict: + return int(attribute_dict[gpu_resource_name]) elif TPU_RESOURCE_KEY in attribute_dict: return int(attribute_dict[TPU_RESOURCE_KEY]) return 0 @@ -2415,3 +2416,18 @@ def process_skypilot_pods( num_pods = len(cluster.pods) cluster.resources_str = f'{num_pods}x {cluster.resources}' return list(clusters.values()), jobs_controllers, serve_controllers + + +def get_gpu_resource_key(): + """Get the GPU resource name to use in kubernetes. + The function first checks for an environment variable. + If defined, it uses its value; otherwise, it returns the default value. + Args: + name (str): Default GPU resource name, default is "nvidia.com/gpu". + Returns: + str: The selected GPU resource name. + """ + # Retrieve GPU resource name from environment variable, if set. + # Else use default. + # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc. + return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY) diff --git a/sky/utils/kubernetes/gpu_labeler.py b/sky/utils/kubernetes/gpu_labeler.py index a9d899f4348..6877c94a2a8 100644 --- a/sky/utils/kubernetes/gpu_labeler.py +++ b/sky/utils/kubernetes/gpu_labeler.py @@ -101,7 +101,7 @@ def label(): # Get the list of nodes with GPUs gpu_nodes = [] for node in nodes: - if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity: + if kubernetes_utils.get_gpu_resource_key() in node.status.capacity: gpu_nodes.append(node) print(f'Found {len(gpu_nodes)} GPU nodes in the cluster') @@ -142,7 +142,7 @@ def label(): if len(gpu_nodes) == 0: print('No GPU nodes found in the cluster. If you have GPU nodes, ' 'please ensure that they have the label ' - f'`{kubernetes_utils.GPU_RESOURCE_KEY}: `') + f'`{kubernetes_utils.get_gpu_resource_key()}: `') else: print('GPU labeling started - this may take 10 min or more to complete.' '\nTo check the status of GPU labeling jobs, run '