Skip to content

Commit

Permalink
make sure to not keep unstucking the same job
Browse files Browse the repository at this point in the history
  • Loading branch information
elasticroentgen committed Jul 29, 2024
1 parent 2bc8032 commit 36e7a24
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 38 deletions.
27 changes: 27 additions & 0 deletions Database/DbContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,33 @@ protected override void OnModelCreating(ModelBuilder modelBuilder)
.HasForeignKey<Job>(j => j.RunnerId)
.IsRequired(false);
}

/// <summary>
/// Links the job identified by <paramref name="jobId"/> to the runner whose hostname is
/// <paramref name="runnerName"/>, stamps the job's in-progress time, appends a
/// <c>Processing</c> lifecycle event to the runner and saves the changes.
/// </summary>
/// <param name="jobId">GitHub job id; matched against <c>Job.GithubJobId</c>.</param>
/// <param name="runnerName">Runner hostname; matched against <c>Runner.Hostname</c>.</param>
/// <returns>
/// The linked runner, or <c>null</c> when the job or runner cannot be found or the
/// changes cannot be saved. This method is best-effort and does not throw.
/// </returns>
public async Task<Runner> LinkJobToRunner(long jobId, string runnerName)
{
    // Without a runner name there is nothing to link; querying with null/empty could
    // otherwise match a runner row that simply has no hostname recorded.
    if (string.IsNullOrEmpty(runnerName))
    {
        return null;
    }

    try
    {
        var job = await Jobs.Include(x => x.Runner).FirstOrDefaultAsync(x => x.GithubJobId == jobId);
        if (job == null)
        {
            return null;
        }

        var runner = await Runners.Include(x => x.Job).Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.Hostname == runnerName);
        if (runner == null)
        {
            return null;
        }

        // Both entities are known to exist before anything is mutated, so a failed lookup
        // never leaves a half-linked tracked entity behind in this context.
        var now = DateTime.UtcNow;
        runner.Job = job;
        job.Runner = runner;
        job.InProgressTime = now;
        runner.Lifecycle.Add(new()
        {
            Event = $"Runner got picked by job {jobId}",
            Status = RunnerStatus.Processing,
            EventTimeUtc = now
        });
        await SaveChangesAsync();
        return runner;
    }
    catch
    {
        // Deliberate best-effort: callers treat a null result as "unable to link".
        return null;
    }
}

}

// Runners provisioned over time
Expand Down
2 changes: 1 addition & 1 deletion PoolManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
{
var db = new ActionsRunnerContext();
var stuckTime = DateTime.UtcNow - TimeSpan.FromMinutes(10);
var stuckJobs = await db.Jobs.Where(x => x.RunnerId == null && x.QueueTime < stuckTime).ToListAsync();
var stuckJobs = await db.Jobs.Where(x => x.State == JobState.Queued && x.RunnerId == null && x.QueueTime < stuckTime).ToListAsync();
foreach (var stuckJob in stuckJobs)
{
_logger.LogWarning($"Found stuck Job: {stuckJob.JobId} in {stuckJob.Repository}. Starting new runner to compensate...");
Expand Down
74 changes: 37 additions & 37 deletions Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ private static async Task<IResult> GithubWebhookHandler(HttpRequest request, [Fr
dbWorkflowComplete.State = JobState.Completed;
dbWorkflowComplete.CompleteTime = DateTime.UtcNow;
await db.SaveChangesAsync();
await JobCompleted(logger, jobId, poolMgr, repoName, orgName);
await JobCompleted(logger, jobId, poolMgr, repoName, orgName, workflowJson);
break;
default:
logger.LogWarning("Unknown action. Ignoring");
Expand Down Expand Up @@ -359,7 +359,7 @@ private static async Task<IResult> AddRunnerManuallyHandler(HttpRequest request,
return Results.StatusCode(201);
}

private static async Task JobCompleted(ILogger<Program> logger, long jobId, RunnerQueue poolMgr, string repoName, string orgName)
private static async Task JobCompleted(ILogger<Program> logger, long jobId, RunnerQueue poolMgr, string repoName, string orgName, JsonElement workflowJson)
{
var db = new ActionsRunnerContext();
var job = await db.Jobs
Expand All @@ -385,34 +385,46 @@ await db.Jobs.AddAsync(new Job

logger.LogInformation(
$"Workflow Job {jobId} in repo {repoName} has completed. Queuing deletion of VM associated with Job.");


Runner jobRunner = null;
if (job.Runner == null)
{
logger.LogError($"No VM on record for JobID: {jobId}");
// Retroactively assign runner to job
string runnerName = workflowJson.GetProperty("runner_name").GetString();
logger.LogError($"No VM on record for JobID: {jobId}. Trying to re-link to {runnerName}.");
jobRunner = await db.LinkJobToRunner(jobId, runnerName);

if (jobRunner == null)
{
logger.LogError("Unable to link runner. aborting");
return;
}
}
else
{
// record event in DB
job.Runner.Lifecycle.Add(new()
{
Status = RunnerStatus.DeletionQueued,
EventTimeUtc = DateTime.UtcNow,
Event = $"Workflow Job {jobId} in repo {repoName} has completed."
});
job.Runner.IsOnline = false;
await db.SaveChangesAsync();

// Sent to pool manager to delete
poolMgr.DeleteTasks.Enqueue(new DeleteRunnerTask
{
ServerId = job.Runner.CloudServerId,
RunnerDbId = job.Runner.RunnerId
});
ProcessedJobCount.Labels(job.Owner, job.Runner.Size).Inc();

double secondsAlive = (DateTime.UtcNow - job.Runner.CreateTime).TotalSeconds;
TotalMachineTime.Labels(job.Owner, job.Runner.Size).Inc(secondsAlive);
jobRunner = job.Runner;
}

// record event in DB
jobRunner.Lifecycle.Add(new()
{
Status = RunnerStatus.DeletionQueued,
EventTimeUtc = DateTime.UtcNow,
Event = $"Workflow Job {jobId} in repo {repoName} has completed."
});
jobRunner.IsOnline = false;
await db.SaveChangesAsync();

// Send to pool manager to delete
poolMgr.DeleteTasks.Enqueue(new DeleteRunnerTask
{
ServerId = jobRunner.CloudServerId,
RunnerDbId = jobRunner.RunnerId
});
ProcessedJobCount.Labels(job.Owner, jobRunner.Size).Inc();

double secondsAlive = (DateTime.UtcNow - jobRunner.CreateTime).TotalSeconds;
TotalMachineTime.Labels(job.Owner, jobRunner.Size).Inc(secondsAlive);
}

private static async Task JobInProgress(JsonElement workflowJson, ILogger<Program> logger, long jobId,
Expand All @@ -423,19 +435,7 @@ private static async Task JobInProgress(JsonElement workflowJson, ILogger<Progra

// Make the connection between the job and the runner in the DB
var db = new ActionsRunnerContext();
var job = await db.Jobs.Include(x => x.Runner).FirstOrDefaultAsync(x => x.GithubJobId == jobId);
var runner = await db.Runners.Include(x => x.Job).Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.Hostname == runnerName);
runner.Job = job;
job.Runner = runner;
job.InProgressTime = DateTime.UtcNow;
runner.Lifecycle.Add(new()
{
Event = $"Runner got picked by job {jobId}",
Status = RunnerStatus.Processing,
EventTimeUtc = DateTime.UtcNow
});

await db.SaveChangesAsync();
Runner runner = await db.LinkJobToRunner(jobId, runnerName);

// Metrics
PickedJobCount.Labels(orgName, runner.Size).Inc();
Expand Down

0 comments on commit 36e7a24

Please sign in to comment.