Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Jobs] Restart dashboard when refreshing the controller #4441

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import sys
import textwrap
import time
import traceback
import typing
from typing import Any, Dict, List, Optional, Tuple, Union
import webbrowser
Expand Down Expand Up @@ -72,6 +73,7 @@
from sky.utils import common_utils
from sky.utils import controller_utils
from sky.utils import dag_utils
from sky.utils import env_options
from sky.utils import log_utils
from sky.utils import resources_utils
from sky.utils import rich_utils
Expand Down Expand Up @@ -1398,8 +1400,12 @@ def _get_managed_jobs(
f'Details: {common_utils.format_exception(e, use_bracket=True)}'
)
except Exception as e: # pylint: disable=broad-except
msg = ('Failed to query managed jobs: '
f'{common_utils.format_exception(e, use_bracket=True)}')
msg = ''
if env_options.Options.SHOW_DEBUG_INFO.get():
msg += traceback.format_exc()
msg += '\n'
msg += ('Failed to query managed jobs: '
f'{common_utils.format_exception(e, use_bracket=True)}')
else:
max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
if limit_num_jobs_to_show else None)
Expand Down
19 changes: 18 additions & 1 deletion sky/jobs/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
"""Constants used for Managed Jobs."""
from typing import Dict, Union

from sky.skylet import constants as skylet_constants

JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
Expand All @@ -13,7 +16,11 @@
# OOM (each vCPU can have 4 jobs controller processes as we set the CPU
# requirement to 0.25, and 3 GB is barely enough for 4 job processes).
# We use 50 GB disk size to reduce the cost.
CONTROLLER_RESOURCES = {'cpus': '8+', 'memory': '3x', 'disk_size': 50}
# Resource request for the jobs controller, keyed by resource field name.
# Values are either a spec string ('8+', '3x') or a plain integer (disk size).
CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = dict(
    cpus='8+',
    memory='3x',
    disk_size=50,
)

# Max length of the cluster name for GCP is 35, the user hash to be attached is
# 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
Expand All @@ -25,3 +32,13 @@
# change for the jobs/utils, we need to bump this version and update
# jobs.utils.ManagedJobCodeGen to handle the version update.
MANAGED_JOBS_VERSION = 1

# Shell snippet that (re)starts the managed-jobs dashboard. It is a single
# string made of three `;`-separated steps so it can be embedded verbatim in a
# setup script or passed as a task's setup command.
DASHBOARD_SETUP_CMD = (
# Step 1: kill any process whose command line contains
# `-m sky.spot.dashboard` (presumably the dashboard under its previous module
# path -- confirm). `|| true` keeps the step from failing when nothing matches.
'ps aux | grep -v nohup | grep -v grep | grep -- "-m sky.spot.dashboard" | '
'awk \'{print $2}\' | xargs kill > /dev/null 2>&1 || true; '
# Step 2: install flask only if `pip list` does not already show it.
# NOTE(review): `2>&1 > /dev/null` discards stdout only; stderr is still
# emitted on the original stdout. Confirm that showing pip errors is intended,
# since the other redirects here use `> /dev/null 2>&1`.
'pip list | grep flask > /dev/null 2>&1 || '
'pip install flask 2>&1 > /dev/null; '
# Step 3: if no `-m sky.jobs.dashboard.dashboard` process is found, launch one
# in the background under nohup, appending its output to
# ~/.sky/job-dashboard.log.
'((ps aux | grep -v nohup | grep -v grep | '
'grep -q -- "-m sky.jobs.dashboard.dashboard") || '
f'(nohup {skylet_constants.SKY_PYTHON_CMD} -m sky.jobs.dashboard.dashboard '
'>> ~/.sky/job-dashboard.log 2>&1 &));')
15 changes: 14 additions & 1 deletion sky/jobs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def launch(
'remote_user_config_path': remote_user_config_path,
'modified_catalogs':
service_catalog_common.get_modified_catalog_file_mounts(),
'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
**controller_utils.shared_controller_vars_to_fill(
controller_utils.Controllers.JOBS_CONTROLLER,
remote_user_config_path=remote_user_config_path,
Expand Down Expand Up @@ -255,7 +256,19 @@ def _maybe_restart_controller(
rich_utils.force_update_status(
ux_utils.spinner_message(f'{spinner_message} - restarting '
'controller'))
handle = sky.start(jobs_controller_type.value.cluster_name)
# Make sure the dashboard is running when the controller is restarted.
task = task_lib.Task(
setup=managed_job_constants.DASHBOARD_SETUP_CMD,
envs={
skylet_constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
})
_, handle = sky.launch(
task=task,
cluster_name=jobs_controller_type.value.cluster_name,
idle_minutes_to_autostop=(
skylet_constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP),
retry_until_up=True,
_disable_controller_check=True)
controller_status = status_lib.ClusterStatus.UP
rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))

Expand Down
4 changes: 1 addition & 3 deletions sky/templates/jobs-controller.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,7 @@ setup: |
{% endif %}

# Dashboard.
ps aux | grep -v nohup | grep -v grep | grep -- "-m sky.spot.dashboard" | awk '{print $2}' | xargs kill > /dev/null 2>&1 || true
pip list | grep flask > /dev/null 2>&1 || pip install flask 2>&1 > /dev/null
((ps aux | grep -v nohup | grep -v grep | grep -q -- "-m sky.jobs.dashboard.dashboard") || (nohup {{ sky_python_cmd }} -m sky.jobs.dashboard.dashboard >> ~/.sky/job-dashboard.log 2>&1 &));
{{ dashboard_setup_cmd }}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also move this into the `run` commands, so that the dashboard is also started by `sky jobs launch --fast`?


run: |
{{ sky_activate_python_env }}
Expand Down
Loading