diff --git a/sky/cli.py b/sky/cli.py
index 1faf0003ff9..7da119b02fe 100644
--- a/sky/cli.py
+++ b/sky/cli.py
@@ -35,6 +35,7 @@
 import sys
 import textwrap
 import time
+import traceback
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 import webbrowser
@@ -72,6 +73,7 @@
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import dag_utils
+from sky.utils import env_options
 from sky.utils import log_utils
 from sky.utils import resources_utils
 from sky.utils import rich_utils
@@ -1398,8 +1400,12 @@ def _get_managed_jobs(
                 f'Details: {common_utils.format_exception(e, use_bracket=True)}'
             )
     except Exception as e:  # pylint: disable=broad-except
-        msg = ('Failed to query managed jobs: '
-               f'{common_utils.format_exception(e, use_bracket=True)}')
+        msg = ''
+        if env_options.Options.SHOW_DEBUG_INFO.get():
+            msg += traceback.format_exc()
+            msg += '\n'
+        msg += ('Failed to query managed jobs: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
         max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
                             if limit_num_jobs_to_show else None)
diff --git a/sky/jobs/constants.py b/sky/jobs/constants.py
index d5f32908317..99e1eda12d8 100644
--- a/sky/jobs/constants.py
+++ b/sky/jobs/constants.py
@@ -1,4 +1,7 @@
 """Constants used for Managed Jobs."""
+from typing import Dict, Union
+
+from sky.skylet import constants as skylet_constants
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
@@ -13,7 +16,11 @@
 # OOM (each vCPU can have 4 jobs controller processes as we set the CPU
 # requirement to 0.25, and 3 GB is barely enough for 4 job processes).
 # We use 50 GB disk size to reduce the cost.
-CONTROLLER_RESOURCES = {'cpus': '8+', 'memory': '3x', 'disk_size': 50}
+CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
+    'cpus': '8+',
+    'memory': '3x',
+    'disk_size': 50
+}
 
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
 # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
@@ -25,3 +32,13 @@
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
 MANAGED_JOBS_VERSION = 1
+
+DASHBOARD_SETUP_CMD = (
+    'ps aux | grep -v nohup | grep -v grep | grep -- "-m sky.spot.dashboard" | '
+    'awk \'{print $2}\' | xargs kill > /dev/null 2>&1 || true; '
+    'pip list | grep flask > /dev/null 2>&1 || '
+    'pip install flask > /dev/null 2>&1; '
+    '((ps aux | grep -v nohup | grep -v grep | '
+    'grep -q -- "-m sky.jobs.dashboard.dashboard") || '
+    f'(nohup {skylet_constants.SKY_PYTHON_CMD} -m sky.jobs.dashboard.dashboard '
+    '>> ~/.sky/job-dashboard.log 2>&1 &));')
diff --git a/sky/jobs/core.py b/sky/jobs/core.py
index 9cde3443816..d1bfb3b6fa5 100644
--- a/sky/jobs/core.py
+++ b/sky/jobs/core.py
@@ -119,6 +119,7 @@ def launch(
             'remote_user_config_path': remote_user_config_path,
             'modified_catalogs':
                 service_catalog_common.get_modified_catalog_file_mounts(),
+            'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
             **controller_utils.shared_controller_vars_to_fill(
                 controller_utils.Controllers.JOBS_CONTROLLER,
                 remote_user_config_path=remote_user_config_path,
@@ -255,7 +256,19 @@ def _maybe_restart_controller(
     rich_utils.force_update_status(
         ux_utils.spinner_message(f'{spinner_message} - restarting '
                                  'controller'))
-    handle = sky.start(jobs_controller_type.value.cluster_name)
+    # Make sure the dashboard is running when the controller is restarted.
+    task = task_lib.Task(
+        setup=managed_job_constants.DASHBOARD_SETUP_CMD,
+        envs={
+            skylet_constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
+        })
+    _, handle = sky.launch(
+        task=task,
+        cluster_name=jobs_controller_type.value.cluster_name,
+        idle_minutes_to_autostop=(
+            skylet_constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP),
+        retry_until_up=True,
+        _disable_controller_check=True)
     controller_status = status_lib.ClusterStatus.UP
     rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
 
diff --git a/sky/templates/jobs-controller.yaml.j2 b/sky/templates/jobs-controller.yaml.j2
index 45cdb5141d4..5a68d81878a 100644
--- a/sky/templates/jobs-controller.yaml.j2
+++ b/sky/templates/jobs-controller.yaml.j2
@@ -27,9 +27,7 @@ setup: |
   {% endif %}
 
   # Dashboard.
-  ps aux | grep -v nohup | grep -v grep | grep -- "-m sky.spot.dashboard" | awk '{print $2}' | xargs kill > /dev/null 2>&1 || true
-  pip list | grep flask > /dev/null 2>&1 || pip install flask 2>&1 > /dev/null
-  ((ps aux | grep -v nohup | grep -v grep | grep -q -- "-m sky.jobs.dashboard.dashboard") || (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &));
+  {{ dashboard_setup_cmd }}
 
 run: |
   {{ sky_activate_python_env }}