Merge remote-tracking branch 'origin/master' into refactor-autoscaler
cblmemo committed Jan 15, 2025
2 parents 8fbb309 + d9bb51a commit 52f1453
Showing 135 changed files with 4,814 additions and 1,137 deletions.
181 changes: 92 additions & 89 deletions .buildkite/generate_pipeline.py
@@ -21,11 +21,13 @@
clouds are not supported yet, smoke tests for those clouds are not generated.
"""

-import ast
import os
+import random
import re
+import subprocess
from typing import Any, Dict, List, Optional

+import click
from conftest import cloud_to_pytest_keyword
from conftest import default_clouds_to_run
import yaml
@@ -36,7 +38,7 @@
QUEUE_GENERIC_CLOUD = 'generic_cloud'
QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve'
QUEUE_KUBERNETES = 'kubernetes'
-QUEUE_KUBERNETES_SERVE = 'kubernetes_serve'
+QUEUE_GKE = 'gke'
# Only aws, gcp, azure, and kubernetes are supported for now.
# Other clouds do not have credentials.
CLOUD_QUEUE_MAP = {
@@ -52,26 +54,18 @@
    'aws': QUEUE_GENERIC_CLOUD_SERVE,
    'gcp': QUEUE_GENERIC_CLOUD_SERVE,
    'azure': QUEUE_GENERIC_CLOUD_SERVE,
-    'kubernetes': QUEUE_KUBERNETES_SERVE
+    # Now we run kubernetes on a local cluster, so it should be fine to run
+    # serve tests on the same queue as kubernetes.
+    'kubernetes': QUEUE_KUBERNETES
}

GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by '
                       '.buildkite/generate_pipeline.py, Please do not '
                       'edit directly.\n')


-def _get_full_decorator_path(decorator: ast.AST) -> str:
-    """Recursively get the full path of a decorator."""
-    if isinstance(decorator, ast.Attribute):
-        return f'{_get_full_decorator_path(decorator.value)}.{decorator.attr}'
-    elif isinstance(decorator, ast.Name):
-        return decorator.id
-    elif isinstance(decorator, ast.Call):
-        return _get_full_decorator_path(decorator.func)
-    raise ValueError(f'Unknown decorator type: {type(decorator)}')
-
-
-def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]:
+def _extract_marked_tests(file_path: str,
+                          filter_marks: List[str]) -> Dict[str, List[str]]:
    """Extract test functions and filter clouds using pytest.mark
    from a Python test file.
@@ -85,80 +79,72 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]:
    rerun failures. Additionally, the parallelism would be controlled by pytest
    instead of the buildkite job queue.
    """
-    with open(file_path, 'r', encoding='utf-8') as file:
-        tree = ast.parse(file.read(), filename=file_path)
-
-    for node in ast.walk(tree):
-        for child in ast.iter_child_nodes(node):
-            setattr(child, 'parent', node)
-
+    cmd = f'pytest {file_path} --collect-only'
+    output = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+    matches = re.findall(r'Collected .+?\.py::(.+?) with marks: \[(.*?)\]',
+                         output.stdout)
+    function_name_marks_map = {}
+    for function_name, marks in matches:
+        function_name = re.sub(r'\[.*?\]', '', function_name)
+        marks = marks.replace('\'', '').split(',')
+        marks = [i.strip() for i in marks]
+        if function_name not in function_name_marks_map:
+            function_name_marks_map[function_name] = set(marks)
+        else:
+            function_name_marks_map[function_name].update(marks)
    function_cloud_map = {}
-    for node in ast.walk(tree):
-        if isinstance(node, ast.FunctionDef) and node.name.startswith('test_'):
-            class_name = None
-            if hasattr(node, 'parent') and isinstance(node.parent,
-                                                      ast.ClassDef):
-                class_name = node.parent.name
-
-            clouds_to_include = []
-            clouds_to_exclude = []
-            is_serve_test = False
-            for decorator in node.decorator_list:
-                if isinstance(decorator, ast.Call):
-                    # We only need to consider the decorator with no arguments
-                    # to extract clouds.
+    filter_marks = set(filter_marks)
+    for function_name, marks in function_name_marks_map.items():
+        if filter_marks and not filter_marks & marks:
+            continue
+        clouds_to_include = []
+        clouds_to_exclude = []
+        is_serve_test = 'serve' in marks
+        run_on_gke = 'requires_gke' in marks
+        for mark in marks:
+            if mark.startswith('no_'):
+                clouds_to_exclude.append(mark[3:])
+            else:
+                if mark not in PYTEST_TO_CLOUD_KEYWORD:
+                    # This mark does not specify a cloud, so we skip it.
                    continue
-                full_path = _get_full_decorator_path(decorator)
-                if full_path.startswith('pytest.mark.'):
-                    assert isinstance(decorator, ast.Attribute)
-                    suffix = decorator.attr
-                    if suffix.startswith('no_'):
-                        clouds_to_exclude.append(suffix[3:])
-                    else:
-                        if suffix == 'serve':
-                            is_serve_test = True
-                            continue
-                        if suffix not in PYTEST_TO_CLOUD_KEYWORD:
-                            # This mark does not specify a cloud, so we skip it.
-                            continue
-                        clouds_to_include.append(
-                            PYTEST_TO_CLOUD_KEYWORD[suffix])
-            clouds_to_include = (clouds_to_include if clouds_to_include else
-                                 DEFAULT_CLOUDS_TO_RUN)
-            clouds_to_include = [
-                cloud for cloud in clouds_to_include
-                if cloud not in clouds_to_exclude
-            ]
-            cloud_queue_map = SERVE_CLOUD_QUEUE_MAP if is_serve_test else CLOUD_QUEUE_MAP
-            final_clouds_to_include = [
-                cloud for cloud in clouds_to_include if cloud in cloud_queue_map
-            ]
-            if clouds_to_include and not final_clouds_to_include:
-                print(f'Warning: {file_path}:{node.name} '
-                      f'is marked to run on {clouds_to_include}, '
-                      f'but we do not have credentials for those clouds. '
-                      f'Skipped.')
-                continue
-            if clouds_to_include != final_clouds_to_include:
-                excluded_clouds = set(clouds_to_include) - set(
-                    final_clouds_to_include)
-                print(
-                    f'Warning: {file_path}:{node.name} '
-                    f'is marked to run on {clouds_to_include}, '
-                    f'but we only have credentials for {final_clouds_to_include}. '
-                    f'clouds {excluded_clouds} are skipped.')
-            function_name = (f'{class_name}::{node.name}'
-                             if class_name else node.name)
-            function_cloud_map[function_name] = (final_clouds_to_include, [
-                cloud_queue_map[cloud] for cloud in final_clouds_to_include
-            ])
+                clouds_to_include.append(PYTEST_TO_CLOUD_KEYWORD[mark])
+
+        clouds_to_include = (clouds_to_include
+                             if clouds_to_include else DEFAULT_CLOUDS_TO_RUN)
+        clouds_to_include = [
+            cloud for cloud in clouds_to_include
+            if cloud not in clouds_to_exclude
+        ]
+        cloud_queue_map = SERVE_CLOUD_QUEUE_MAP if is_serve_test else CLOUD_QUEUE_MAP
+        final_clouds_to_include = [
+            cloud for cloud in clouds_to_include if cloud in cloud_queue_map
+        ]
+        if clouds_to_include and not final_clouds_to_include:
+            print(
+                f'Warning: {function_name} is marked to run on {clouds_to_include}, '
+                f'but we do not have credentials for those clouds. Skipped.')
+            continue
+        if clouds_to_include != final_clouds_to_include:
+            excluded_clouds = set(clouds_to_include) - set(
+                final_clouds_to_include)
+            print(
+                f'Warning: {function_name} is marked to run on {clouds_to_include}, '
+                f'but we only have credentials for {final_clouds_to_include}. '
+                f'clouds {excluded_clouds} are skipped.')
+        function_cloud_map[function_name] = (final_clouds_to_include, [
+            QUEUE_GKE if run_on_gke else cloud_queue_map[cloud]
+            for cloud in final_clouds_to_include
+        ])
    return function_cloud_map
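To make the new collection path concrete, here is a minimal, self-contained sketch (not part of the diff) of how the regex above consumes the collector output. The sample line is fabricated to match the pattern; in the real pipeline, such `Collected ... with marks: [...]` lines are printed during `pytest --collect-only`, presumably by a hook in `tests/conftest.py`:

```python
import re

# Hypothetical collector output line; format inferred from the regex above.
sample = ("Collected tests/smoke_tests/test_basic.py::test_minimal[aws] "
          "with marks: ['aws', 'no_fluidstack']")

for function_name, marks in re.findall(
        r'Collected .+?\.py::(.+?) with marks: \[(.*?)\]', sample):
    # Strip parametrize ids so test_minimal[aws] and test_minimal[gcp]
    # collapse into a single entry.
    function_name = re.sub(r'\[.*?\]', '', function_name)
    marks = {m.strip().strip('\'') for m in marks.split(',')}
    print(function_name, marks)  # -> test_minimal {'aws', 'no_fluidstack'}
```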


-def _generate_pipeline(test_file: str) -> Dict[str, Any]:
+def _generate_pipeline(test_file: str,
+                       filter_marks: List[str],
+                       auto_retry: bool = False) -> Dict[str, Any]:
    """Generate a Buildkite pipeline from test files."""
    steps = []
-    function_cloud_map = _extract_marked_tests(test_file)
+    function_cloud_map = _extract_marked_tests(test_file, filter_marks)
    for test_function, clouds_and_queues in function_cloud_map.items():
        for cloud, queue in zip(*clouds_and_queues):
            step = {
@@ -172,6 +158,11 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]:
            },
            'if': f'build.env("{cloud}") == "1"'
        }
+        if auto_retry:
+            step['retry'] = {
+                # Automatically retry 2 times on any failure by default.
+                'automatic': True
+            }
        steps.append(step)
    return {'steps': steps}
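For reference, one iteration of the loop above assembles a step dict roughly like the following sketch. Only the `agents` fragment, the `if` condition, and the `retry` key are visible in this hunk, so the remaining fields and values are illustrative:

```python
# Hypothetical step for a single (test, cloud) pair; 'label' and 'queue'
# values are illustrative.
step = {
    'label': 'test_minimal on aws',
    'agents': {
        # Queue chosen via CLOUD_QUEUE_MAP / SERVE_CLOUD_QUEUE_MAP / QUEUE_GKE.
        'queue': 'generic_cloud'
    },
    # Only run when this cloud is enabled for the build.
    'if': 'build.env("aws") == "1"',
    # Added when auto_retry=True (used by the release pipeline below).
    'retry': {
        'automatic': True
    },
}
```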

@@ -194,12 +185,12 @@ def _dump_pipeline_to_file(yaml_file_path: str,
        yaml.dump(final_pipeline, file, default_flow_style=False)


-def _convert_release(test_files: List[str]):
+def _convert_release(test_files: List[str], filter_marks: List[str]):
    yaml_file_path = '.buildkite/pipeline_smoke_tests_release.yaml'
    output_file_pipelines = []
    for test_file in test_files:
        print(f'Converting {test_file} to {yaml_file_path}')
-        pipeline = _generate_pipeline(test_file)
+        pipeline = _generate_pipeline(test_file, filter_marks, auto_retry=True)
        output_file_pipelines.append(pipeline)
        print(f'Converted {test_file} to {yaml_file_path}\n\n')
    # Enable all clouds by default for release pipeline.
@@ -208,15 +199,15 @@ def _convert_release(test_files: List[str]):
                           extra_env={cloud: '1' for cloud in CLOUD_QUEUE_MAP})


-def _convert_quick_tests_core(test_files: List[str]):
+def _convert_quick_tests_core(test_files: List[str], filter_marks: List[str]):
    yaml_file_path = '.buildkite/pipeline_smoke_tests_quick_tests_core.yaml'
    output_file_pipelines = []
    for test_file in test_files:
        print(f'Converting {test_file} to {yaml_file_path}')
        # We want to enable all clouds by default for each test function
        # for pre-merge, and let the author control which clouds
        # to run via parameter.
-        pipeline = _generate_pipeline(test_file)
+        pipeline = _generate_pipeline(test_file, filter_marks)
        pipeline['steps'].append({
            'label': 'Backward compatibility test',
            'command': 'bash tests/backward_compatibility_tests.sh',
@@ -231,7 +222,12 @@ def _convert_quick_tests_core(test_files: List[str]):
                           extra_env={'SKYPILOT_SUPPRESS_SENSITIVE_LOG': '1'})


-def main():
+@click.command()
+@click.option(
+    '--filter-marks',
+    type=str,
+    help='Filter to include only a subset of pytest marks, e.g., managed_jobs')
+def main(filter_marks):
    test_files = os.listdir('tests/smoke_tests')
    release_files = []
    quick_tests_core_files = []
@@ -244,8 +240,15 @@ def main():
        else:
            release_files.append(test_file_path)

-    _convert_release(release_files)
-    _convert_quick_tests_core(quick_tests_core_files)
+    filter_marks = filter_marks or os.getenv('FILTER_MARKS')
+    if filter_marks:
+        filter_marks = filter_marks.split(',')
+        print(f'Filter marks: {filter_marks}')
+    else:
+        filter_marks = []
+
+    _convert_release(release_files, filter_marks)
+    _convert_quick_tests_core(quick_tests_core_files, filter_marks)


if __name__ == '__main__':
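With this change, the mark filter can be supplied either on the command line (e.g., `python .buildkite/generate_pipeline.py --filter-marks managed_jobs`, assuming the script is invoked from the repository root) or through the `FILTER_MARKS` environment variable as a comma-separated list (e.g., `FILTER_MARKS=aws,serve`); per the fallback logic above, the CLI flag takes precedence when both are set.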
63 changes: 0 additions & 63 deletions .github/workflows/test-poetry-build.yml

This file was deleted.

7 changes: 4 additions & 3 deletions CONTRIBUTING.md
@@ -50,9 +50,10 @@ pytest tests/test_smoke.py --generic-cloud azure

For profiling code, use:
```
-pip install tuna # Tuna is used for visualization of profiling data.
-python3 -m cProfile -o sky.prof -m sky.cli status # Or some other command
-tuna sky.prof
+pip install py-spy # py-spy is a sampling profiler for Python programs
+py-spy record -t -o sky.svg -- python -m sky.cli status # Or some other command
+py-spy top -- python -m sky.cli status # Get a live top view
+py-spy -h # For more options
```

#### Testing in a container
2 changes: 2 additions & 0 deletions README.md
@@ -199,6 +199,8 @@ Read the research:
- [Sky Computing vision paper](https://sigops.org/s/conferences/hotos/2021/papers/hotos21-s02-stoica.pdf) (HotOS 2021)
- [Policy for Managed Spot Jobs](https://www.usenix.org/conference/nsdi24/presentation/wu-zhanghao) (NSDI 2024)

+SkyPilot was initially started at the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley and has since gained many industry contributors. Read more about the project's origin [here](https://docs.skypilot.co/en/latest/sky-computing.html).
+
## Support and Questions
We are excited to hear your feedback!
* For issues and feature requests, please [open a GitHub issue](https://github.com/skypilot-org/skypilot/issues/new).