// Patch summary: add a periodic "stuck job" check to the client.
//
// - ACTIVE_TASK gains three fields (zeroed in its constructor) that hold a
//   snapshot of fraction_done, elapsed_time and current_cpu_time taken at
//   the previous stuck check.
// - ACTIVE_TASK_SET::poll() runs the check at most once per
//   STUCK_CHECK_POLL_PERIOD. Tasks for which non_cpu_intensive() or
//   sporadic() is true are skipped. For the rest, once at least one more
//   period of elapsed time has accumulated since the snapshot, a
//   MSG_USER_ALERT message is issued if fraction_done is unchanged and
//   current_cpu_time advanced by less than 10; the snapshot is then renewed.
//
// NOTE(review): in the copy of this patch under review, the text between
// "for (i=0; i" and "non_cpu_intensive()" had been stripped. The loop bound
// and the `atp` declaration below were restored to match the hunk's
// added-line count (33); confirm both lines against the upstream change.
// Indentation was also reconstructed and may need --ignore-whitespace.
diff --git a/client/app.cpp b/client/app.cpp
index a0254c277f3..df288cebc56 100644
--- a/client/app.cpp
+++ b/client/app.cpp
@@ -114,6 +114,9 @@ ACTIVE_TASK::ACTIVE_TASK() {
     fraction_done_elapsed_time = 0;
     first_fraction_done = 0;
     first_fraction_done_elapsed_time = 0;
+    stuck_check_fraction_done = 0;
+    stuck_check_elapsed_time = 0;
+    stuck_check_cpu_time = 0;
     scheduler_state = CPU_SCHED_UNINITIALIZED;
     next_scheduler_state = CPU_SCHED_UNINITIALIZED;
     signal = 0;
diff --git a/client/app.h b/client/app.h
index 1999acaeb11..8a42cff9436 100644
--- a/client/app.h
+++ b/client/app.h
@@ -112,6 +112,12 @@ struct ACTIVE_TASK {
         // first frac done reported during this run of task
     double first_fraction_done_elapsed_time;
         // elapsed time when the above was reported
+    double stuck_check_fraction_done;
+        // fraction done as of the last stuck check
+    double stuck_check_elapsed_time;
+        // elapsed time as of the last stuck check (0 = no check yet)
+    double stuck_check_cpu_time;
+        // CPU time as of the last stuck check
     SCHEDULER_STATE scheduler_state;
     SCHEDULER_STATE next_scheduler_state; // temp
     int signal;
diff --git a/client/app_control.cpp b/client/app_control.cpp
index b03a9875db1..10e4fc18ed8 100644
--- a/client/app_control.cpp
+++ b/client/app_control.cpp
@@ -149,6 +149,39 @@ bool ACTIVE_TASK_SET::poll() {
             }
         }
     }
+
+    // check if a job is "stuck" (did not make progress in the last hour)
+    // and, if so, notify the user about the issue
+    // TODO: abort after some time (this check only alerts the user)
+    static double last_stuck_check_time = 0;
+    if (gstate.now - last_stuck_check_time > STUCK_CHECK_POLL_PERIOD) {
+        last_stuck_check_time = gstate.now;
+        for (i=0; i<active_tasks.size(); i++) {
+            ACTIVE_TASK* atp = active_tasks[i];
+            if (atp->non_cpu_intensive()) continue;
+            if (atp->sporadic()) continue;
+            if (atp->stuck_check_elapsed_time == 0) {
+                // first pass: no snapshot yet, so just record a baseline
+                atp->stuck_check_elapsed_time = atp->elapsed_time;
+                atp->stuck_check_fraction_done = atp->fraction_done;
+                atp->stuck_check_cpu_time = atp->current_cpu_time;
+                continue;
+            }
+            if (atp->elapsed_time < atp->stuck_check_elapsed_time + STUCK_CHECK_POLL_PERIOD) continue;
+            if (atp->stuck_check_fraction_done == atp->fraction_done &&
+                (atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) {
+                // fraction done is unchanged and CPU time advanced by <10: alert the user
+                msg_printf(atp->result->project, MSG_USER_ALERT,
+                    "[task] has not made progress in last hour, consider aborting task %s",
+                    atp->result->name
+                );
+            }
+            atp->stuck_check_elapsed_time = atp->elapsed_time;
+            atp->stuck_check_fraction_done = atp->fraction_done;
+            atp->stuck_check_cpu_time = atp->current_cpu_time;
+        }
+    }
+
     if (action) {
         gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll");
     }
diff --git a/client/client_state.h b/client/client_state.h
index c1bbb2b9e91..61513628415 100644
--- a/client/client_state.h
+++ b/client/client_state.h
@@ -601,6 +601,9 @@ extern THREAD throttle_thread;
 #define MEMORY_USAGE_PERIOD 10
     // computer memory usage and check for exclusive apps this often
 
+#define STUCK_CHECK_POLL_PERIOD 3600
+    // check for stuck jobs this often (one hour)
+
 //////// WORK FETCH
 
 #define WORK_FETCH_PERIOD 60