From c19e94e35c8c386d8ee7111529210ea0ac650883 Mon Sep 17 00:00:00 2001
From: zpoint <zp0int@qq.com>
Date: Thu, 19 Dec 2024 10:07:50 +0800
Subject: [PATCH 1/3] Add tests for Azure spot instance (#4475)

* verify azure spot instance

* string style

* echo

* echo vm detail

* bug fix

* remove comment
---
 tests/smoke_tests/test_cluster_job.py | 28 +++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/smoke_tests/test_cluster_job.py b/tests/smoke_tests/test_cluster_job.py
index 0255884ae30..18b82c649e7 100644
--- a/tests/smoke_tests/test_cluster_job.py
+++ b/tests/smoke_tests/test_cluster_job.py
@@ -1299,6 +1299,34 @@ def test_use_spot(generic_cloud: str):
     smoke_tests_utils.run_one_test(test)
 
 
+@pytest.mark.azure
+def test_azure_spot_instance_verification():
+    """Test Azure spot instance provisioning with explicit verification.
+    This test verifies that when --use-spot is specified for Azure:
+    1. The cluster launches successfully
+    2. The instances are actually provisioned as spot instances
+    """
+    name = smoke_tests_utils.get_cluster_name()
+    test = smoke_tests_utils.Test(
+        'azure-spot-verification',
+        [
+            f'sky launch -c {name} --cloud azure tests/test_yamls/minimal.yaml --use-spot -y',
+            f'sky logs {name} 1 --status', f'TARGET_VM_NAME="{name}"; '
+            'VM_INFO=$(az vm list --query "[?contains(name, \'$TARGET_VM_NAME\')].{Name:name, ResourceGroup:resourceGroup}" -o tsv); '
+            '[[ -z "$VM_INFO" ]] && exit 1; '
+            'FULL_VM_NAME=$(echo "$VM_INFO" | awk \'{print $1}\'); '
+            'RESOURCE_GROUP=$(echo "$VM_INFO" | awk \'{print $2}\'); '
+            'VM_DETAILS=$(az vm list --resource-group "$RESOURCE_GROUP" '
+            '--query "[?name==\'$FULL_VM_NAME\'].{Name:name, Location:location, Priority:priority}" -o table); '
+            '[[ -z "$VM_DETAILS" ]] && exit 1; '
+            'echo "VM Details:"; echo "$VM_DETAILS"; '
+            'echo "$VM_DETAILS" | grep -qw "Spot" && exit 0 || exit 1'
+        ],
+        f'sky down -y {name}',
+    )
+    smoke_tests_utils.run_one_test(test)
+
+
 @pytest.mark.gcp
 def test_stop_gcp_spot():
     """Test GCP spot can be stopped, autostopped, restarted."""

From 745cf598c5b5f7e5cf035c462e93ff1a254bb107 Mon Sep 17 00:00:00 2001
From: zpoint <zp0int@qq.com>
Date: Thu, 19 Dec 2024 12:29:33 +0800
Subject: [PATCH 2/3] rename pre-merge test to quicktest-core (#4486)

* rename to test core

* rename file
---
 .buildkite/generate_pipeline.py                  | 14 +++++++-------
 ...est_pre_merge.py => test_quick_tests_core.py} | 16 ++++++++++------
 ...e.yaml => minimal_test_quick_tests_core.yaml} |  2 +-
 3 files changed, 18 insertions(+), 14 deletions(-)
 rename tests/smoke_tests/{test_pre_merge.py => test_quick_tests_core.py} (63%)
 rename tests/test_yamls/{minimal_test_pre_merge.yaml => minimal_test_quick_tests_core.yaml} (62%)

diff --git a/.buildkite/generate_pipeline.py b/.buildkite/generate_pipeline.py
index 2b0f1cec788..99f29ee258a 100644
--- a/.buildkite/generate_pipeline.py
+++ b/.buildkite/generate_pipeline.py
@@ -5,7 +5,7 @@
 
 tests/smoke_tests
 ├── test_*.py -> release pipeline
-├── test_pre_merge.py -> pre-merge pipeline
+├── test_quick_tests_core.py -> run quick tests on PR before merging
 
 run `PYTHONPATH=$(pwd)/tests:$PYTHONPATH python .buildkite/generate_pipeline.py`
 to generate the pipeline for testing. The CI will run this script as a pre-step,
@@ -208,8 +208,8 @@ def _convert_release(test_files: List[str]):
                            extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP})
 
 
-def _convert_pre_merge(test_files: List[str]):
-    yaml_file_path = '.buildkite/pipeline_smoke_tests_pre_merge.yaml'
+def _convert_quick_tests_core(test_files: List[str]):
+    yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml'
     output_file_pipelines = []
     for test_file in test_files:
         print(f'Converting {test_file} to {yaml_file_path}')
@@ -234,18 +234,18 @@ def _convert_pre_merge(test_files: List[str]):
 def main():
     test_files = os.listdir('tests/smoke_tests')
     release_files = []
-    pre_merge_files = []
+    quick_tests_core_files = []
     for test_file in test_files:
         if not test_file.startswith('test_'):
             continue
         test_file_path = os.path.join('tests/smoke_tests', test_file)
-        if "test_pre_merge" in test_file:
-            pre_merge_files.append(test_file_path)
+        if "test_quick_tests_core" in test_file:
+            quick_tests_core_files.append(test_file_path)
         else:
             release_files.append(test_file_path)
 
     _convert_release(release_files)
-    _convert_pre_merge(pre_merge_files)
+    _convert_quick_tests_core(quick_tests_core_files)
 
 
 if __name__ == '__main__':
diff --git a/tests/smoke_tests/test_pre_merge.py b/tests/smoke_tests/test_quick_tests_core.py
similarity index 63%
rename from tests/smoke_tests/test_pre_merge.py
rename to tests/smoke_tests/test_quick_tests_core.py
index 4890ac15ce4..48df4ef9a2b 100644
--- a/tests/smoke_tests/test_pre_merge.py
+++ b/tests/smoke_tests/test_quick_tests_core.py
@@ -1,23 +1,27 @@
 # Smoke tests for SkyPilot required before merging
+# If the change includes an interface modification or touches the core API,
+# the reviewer could decide it’s necessary to trigger a pre-merge test and
+# leave a comment /quicktest-core will then trigger this test.
+#
 # Default options are set in pyproject.toml
 # Example usage:
 # Run all tests except for AWS and Lambda Cloud
-# > pytest tests/smoke_tests/test_pre_merge.py
+# > pytest tests/smoke_tests/test_quick_tests_core.py
 #
 # Terminate failed clusters after test finishes
-# > pytest tests/smoke_tests/test_pre_merge.py --terminate-on-failure
+# > pytest tests/smoke_tests/test_quick_tests_core.py --terminate-on-failure
 #
 # Re-run last failed tests
 # > pytest --lf
 #
 # Run one of the smoke tests
-# > pytest tests/smoke_tests/test_pre_merge.py::test_yaml_launch_and_mount
+# > pytest tests/smoke_tests/test_quick_tests_core.py::test_yaml_launch_and_mount
 #
 # Only run test for AWS + generic tests
-# > pytest tests/smoke_tests/test_pre_merge.py --aws
+# > pytest tests/smoke_tests/test_quick_tests_core.py --aws
 #
 # Change cloud for generic tests to aws
-# > pytest tests/smoke_tests/test_pre_merge.py --generic-cloud aws
+# > pytest tests/smoke_tests/test_quick_tests_core.py --generic-cloud aws
 
 from smoke_tests import smoke_tests_utils
 
@@ -29,7 +33,7 @@ def test_yaml_launch_and_mount(generic_cloud: str):
     test = smoke_tests_utils.Test(
         'test_yaml_launch_and_mount',
         [
-            f'sky launch -y -c {name} tests/test_yamls/minimal_test_pre_merge.yaml',
+            f'sky launch -y -c {name} tests/test_yamls/minimal_test_quick_tests_core.yaml',
             smoke_tests_utils.
             get_cmd_wait_until_job_status_contains_matching_job_id(
                 cluster_name=name,
diff --git a/tests/test_yamls/minimal_test_pre_merge.yaml b/tests/test_yamls/minimal_test_quick_tests_core.yaml
similarity index 62%
rename from tests/test_yamls/minimal_test_pre_merge.yaml
rename to tests/test_yamls/minimal_test_quick_tests_core.yaml
index 583575bee5c..15857e972dd 100644
--- a/tests/test_yamls/minimal_test_pre_merge.yaml
+++ b/tests/test_yamls/minimal_test_quick_tests_core.yaml
@@ -10,4 +10,4 @@ workdir: .
 num_nodes: 1
 
 run: |
-  ls -l ~/aws/tests/test_yamls/minimal_test_pre_merge.yaml
+  ls -l ~/aws/tests/test_yamls/minimal_test_quick_tests_core.yaml

From a73a9cb37152c5f05d06bd1afe29a685ef86f85c Mon Sep 17 00:00:00 2001
From: Lei <nkwanglei@126.com>
Date: Thu, 19 Dec 2024 14:24:22 +0800
Subject: [PATCH 3/3] [k8s] support to use custom gpu resource name if it's not
 nvidia.com/gpu (#4337)

* [k8s] support to use custom gpu resource name if it's not nvidia.com/gpu

Signed-off-by: nkwangleiGIT <nkwanglei@126.com>

* fix format issue

Signed-off-by: nkwangleiGIT <nkwanglei@126.com>

---------

Signed-off-by: nkwangleiGIT <nkwanglei@126.com>
---
 sky/clouds/kubernetes.py             |  2 +-
 sky/provision/kubernetes/instance.py | 12 +++++++-----
 sky/provision/kubernetes/utils.py    | 24 ++++++++++++++++++++----
 sky/utils/kubernetes/gpu_labeler.py  |  4 ++--
 4 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index 471639626eb..65b50042aba 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -395,7 +395,7 @@ def make_deploy_resources_variables(
                 tpu_requested = True
                 k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
             else:
-                k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY
+                k8s_resource_key = kubernetes_utils.get_gpu_resource_key()
 
         port_mode = network_utils.get_port_mode(None)
 
diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index 0682b105a4f..c431b023ab9 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -180,6 +180,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                             #  case we will need to update this logic.
                             # TODO(Doyoung): Update the error message raised
                             # with the multi-host TPU support.
+                            gpu_resource_key = kubernetes_utils.get_gpu_resource_key()  # pylint: disable=line-too-long
                             if 'Insufficient google.com/tpu' in event_message:
                                 extra_msg = (
                                     f'Verify if '
@@ -192,14 +193,15 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                                                        pod,
                                                        extra_msg,
                                                        details=event_message))
-                            elif (('Insufficient nvidia.com/gpu'
+                            elif ((f'Insufficient {gpu_resource_key}'
                                    in event_message) or
                                   ('didn\'t match Pod\'s node affinity/selector'
                                    in event_message)):
                                 extra_msg = (
-                                    f'Verify if '
-                                    f'{pod.spec.node_selector[label_key]}'
-                                    ' is available in the cluster.')
+                                    f'Verify if any node matching label  '
+                                    f'{pod.spec.node_selector[label_key]} and '
+                                    f'sufficient resource {gpu_resource_key} '
+                                    f'is available in the cluster.')
                                 raise config_lib.KubernetesError(
                                     _lack_resource_msg('GPU',
                                                        pod,
@@ -728,7 +730,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     limits = pod_spec['spec']['containers'][0].get('resources',
                                                    {}).get('limits')
     if limits is not None:
-        needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
+        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
 
     # TPU pods provisioned on GKE use the default containerd runtime.
     # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 5a7db7e91d6..5150cc5860b 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -438,7 +438,7 @@ def detect_accelerator_resource(
     nodes = get_kubernetes_nodes(context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or
+    has_accelerator = (get_gpu_resource_key() in cluster_resources or
                        TPU_RESOURCE_KEY in cluster_resources)
 
     return has_accelerator, cluster_resources
@@ -2253,10 +2253,11 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
         Number of accelerators allocated or available from the node. If no
             resource is found, it returns 0.
     """
-    assert not (GPU_RESOURCE_KEY in attribute_dict and
+    gpu_resource_name = get_gpu_resource_key()
+    assert not (gpu_resource_name in attribute_dict and
                 TPU_RESOURCE_KEY in attribute_dict)
-    if GPU_RESOURCE_KEY in attribute_dict:
-        return int(attribute_dict[GPU_RESOURCE_KEY])
+    if gpu_resource_name in attribute_dict:
+        return int(attribute_dict[gpu_resource_name])
     elif TPU_RESOURCE_KEY in attribute_dict:
         return int(attribute_dict[TPU_RESOURCE_KEY])
     return 0
@@ -2415,3 +2416,18 @@ def process_skypilot_pods(
         num_pods = len(cluster.pods)
         cluster.resources_str = f'{num_pods}x {cluster.resources}'
     return list(clusters.values()), jobs_controllers, serve_controllers
+
+
+def get_gpu_resource_key():
+    """Get the GPU resource name to use in kubernetes.
+    The function first checks for an environment variable.
+    If defined, it uses its value; otherwise, it returns the default value.
+    Args:
+        name (str): Default GPU resource name, default is "nvidia.com/gpu".
+    Returns:
+        str: The selected GPU resource name.
+    """
+    # Retrieve GPU resource name from environment variable, if set.
+    # Else use default.
+    # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
+    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
diff --git a/sky/utils/kubernetes/gpu_labeler.py b/sky/utils/kubernetes/gpu_labeler.py
index a9d899f4348..6877c94a2a8 100644
--- a/sky/utils/kubernetes/gpu_labeler.py
+++ b/sky/utils/kubernetes/gpu_labeler.py
@@ -101,7 +101,7 @@ def label():
         # Get the list of nodes with GPUs
         gpu_nodes = []
         for node in nodes:
-            if kubernetes_utils.GPU_RESOURCE_KEY in node.status.capacity:
+            if kubernetes_utils.get_gpu_resource_key() in node.status.capacity:
                 gpu_nodes.append(node)
 
         print(f'Found {len(gpu_nodes)} GPU nodes in the cluster')
@@ -142,7 +142,7 @@ def label():
     if len(gpu_nodes) == 0:
         print('No GPU nodes found in the cluster. If you have GPU nodes, '
               'please ensure that they have the label '
-              f'`{kubernetes_utils.GPU_RESOURCE_KEY}: <number of GPUs>`')
+              f'`{kubernetes_utils.get_gpu_resource_key()}: <number of GPUs>`')
     else:
         print('GPU labeling started - this may take 10 min or more to complete.'
               '\nTo check the status of GPU labeling jobs, run '