Summary of this patch:
  * constants.py: move JOBS_CONTROLLER_LOGS_DIR to '~/sky_logs/jobs_controller'.
  * scheduler.py: reword/trim docstrings and lower two scheduler log messages
    from info to debug.
  * state.py: set_job_controller_pid now issues a parameterized UPDATE and
    asserts that exactly one row was updated.

diff --git a/sky/jobs/constants.py b/sky/jobs/constants.py
index 50668f42862..7fc0ec694fb 100644
--- a/sky/jobs/constants.py
+++ b/sky/jobs/constants.py
@@ -2,7 +2,7 @@
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
-JOBS_CONTROLLER_LOGS_DIR = '~/sky_controller_logs'
+JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
 
 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 
diff --git a/sky/jobs/scheduler.py b/sky/jobs/scheduler.py
index 959d1516806..ba93fd82c5f 100644
--- a/sky/jobs/scheduler.py
+++ b/sky/jobs/scheduler.py
@@ -55,7 +55,7 @@
 
 
 def maybe_start_waiting_jobs() -> None:
-    """Determine if any managed jobs can be launched, and if so, launch them.
+    """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     For newly submitted jobs, this includes starting the job controller
     process. For jobs that are already alive but are waiting to launch a new
@@ -76,7 +76,6 @@ def maybe_start_waiting_jobs() -> None:
     the jobs controller. New job controller processes will be detached from the
     current process and there will not be a parent/child relationship - see
     launch_new_process_tree for more.
-
     """
     try:
         # We must use a global lock rather than a per-job lock to ensure correct
@@ -112,7 +111,7 @@ def maybe_start_waiting_jobs() -> None:
                     # Can't schedule anything, break from scheduling loop.
                     break
 
-                logger.info(f'Scheduling job {maybe_next_job["job_id"]}')
+                logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
                 state.scheduler_set_launching(maybe_next_job['job_id'],
                                               current_state)
 
@@ -136,7 +135,7 @@ def maybe_start_waiting_jobs() -> None:
                         run_cmd, log_output=log_path)
                     state.set_job_controller_pid(job_id, pid)
 
-                    logger.info(f'Job {job_id} started with pid {pid}')
+                    logger.debug(f'Job {job_id} started with pid {pid}')
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -203,7 +202,6 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
 
     The job could be in any terminal ManagedJobStatus. However, once DONE, it
     should never transition back to another state.
-
     """
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
diff --git a/sky/jobs/state.py b/sky/jobs/state.py
index de650106f26..aa5418b80bb 100644
--- a/sky/jobs/state.py
+++ b/sky/jobs/state.py
@@ -827,9 +827,11 @@ def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
 
 def set_job_controller_pid(job_id: int, pid: int):
     with db_utils.safe_cursor(_DB_PATH) as cursor:
-        # XXX cooperc
-        cursor.execute(
-            f'UPDATE job_info SET pid={pid} WHERE spot_job_id={job_id!r}')
+        updated_count = cursor.execute(
+            'UPDATE job_info SET '
+            'pid = (?) '
+            'WHERE spot_job_id = (?)', (pid, job_id)).rowcount
+        assert updated_count == 1, (job_id, updated_count)
 
 
 def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState: