From 36e7a24a5c580915a58042f298bcbd243c20c685 Mon Sep 17 00:00:00 2001 From: Markus Keil Date: Mon, 29 Jul 2024 13:04:07 +0200 Subject: [PATCH] make sure to not keep unstucking the same job --- Database/DbContext.cs | 27 ++++++++++++++++ PoolManager.cs | 2 +- Program.cs | 74 +++++++++++++++++++++---------------------- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/Database/DbContext.cs b/Database/DbContext.cs index bf8989f..d03a34f 100644 --- a/Database/DbContext.cs +++ b/Database/DbContext.cs @@ -29,6 +29,33 @@ protected override void OnModelCreating(ModelBuilder modelBuilder) .HasForeignKey(j => j.RunnerId) .IsRequired(false); } + + public async Task LinkJobToRunner(long jobId, string runnerName) + { + try + { + var job = await Jobs.Include(x => x.Runner).FirstOrDefaultAsync(x => x.GithubJobId == jobId); + var runner = await Runners.Include(x => x.Job).Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.Hostname == runnerName); + runner.Job = job; + job.Runner = runner; + job.InProgressTime = DateTime.UtcNow; + runner.Lifecycle.Add(new() + { + Event = $"Runner got picked by job {jobId}", + Status = RunnerStatus.Processing, + EventTimeUtc = DateTime.UtcNow + }); + await SaveChangesAsync(); + return runner; + } + catch + { + // unable to link + return null; + } + + } + } // Runners provisioned over time diff --git a/PoolManager.cs b/PoolManager.cs index 1aa7155..c5a8657 100644 --- a/PoolManager.cs +++ b/PoolManager.cs @@ -224,7 +224,7 @@ private async Task CheckForStuckJobs(List targetConfi { var db = new ActionsRunnerContext(); var stuckTime = DateTime.UtcNow - TimeSpan.FromMinutes(10); - var stuckJobs = await db.Jobs.Where(x => x.RunnerId == null && x.QueueTime < stuckTime).ToListAsync(); + var stuckJobs = await db.Jobs.Where(x => x.State == JobState.Queued && x.RunnerId == null && x.QueueTime < stuckTime).ToListAsync(); foreach (var stuckJob in stuckJobs) { _logger.LogWarning($"Found stuck Job: {stuckJob.JobId} in {stuckJob.Repository}. 
Starting new runner to compensate..."); diff --git a/Program.cs b/Program.cs index 9e7750a..6ccd358 100644 --- a/Program.cs +++ b/Program.cs @@ -245,7 +245,7 @@ private static async Task GithubWebhookHandler(HttpRequest request, [Fr dbWorkflowComplete.State = JobState.Completed; dbWorkflowComplete.CompleteTime = DateTime.UtcNow; await db.SaveChangesAsync(); - await JobCompleted(logger, jobId, poolMgr, repoName, orgName); + await JobCompleted(logger, jobId, poolMgr, repoName, orgName, workflowJson); break; default: logger.LogWarning("Unknown action. Ignoring"); @@ -359,7 +359,7 @@ private static async Task AddRunnerManuallyHandler(HttpRequest request, return Results.StatusCode(201); } - private static async Task JobCompleted(ILogger logger, long jobId, RunnerQueue poolMgr, string repoName, string orgName) + private static async Task JobCompleted(ILogger logger, long jobId, RunnerQueue poolMgr, string repoName, string orgName, JsonElement workflowJson) { var db = new ActionsRunnerContext(); var job = await db.Jobs @@ -385,34 +385,46 @@ await db.Jobs.AddAsync(new Job logger.LogInformation( $"Workflow Job {jobId} in repo {repoName} has completed. Queuing deletion of VM associated with Job."); - + + Runner jobRunner = null; if (job.Runner == null) { - logger.LogError($"No VM on record for JobID: {jobId}"); + // Retroactively assign runner to job + string runnerName = workflowJson.GetProperty("runner_name").GetString(); + logger.LogError($"No VM on record for JobID: {jobId}. Trying to re-link to {runnerName}."); + jobRunner = await db.LinkJobToRunner(jobId, runnerName); + + if (jobRunner == null) + { + logger.LogError("Unable to link runner. aborting"); + return; + } } else { - // record event in DB - job.Runner.Lifecycle.Add(new() - { - Status = RunnerStatus.DeletionQueued, - EventTimeUtc = DateTime.UtcNow, - Event = $"Workflow Job {jobId} in repo {repoName} has completed." 
- }); - job.Runner.IsOnline = false; - await db.SaveChangesAsync(); - - // Sent to pool manager to delete - poolMgr.DeleteTasks.Enqueue(new DeleteRunnerTask - { - ServerId = job.Runner.CloudServerId, - RunnerDbId = job.Runner.RunnerId - }); - ProcessedJobCount.Labels(job.Owner, job.Runner.Size).Inc(); - - double secondsAlive = (DateTime.UtcNow - job.Runner.CreateTime).TotalSeconds; - TotalMachineTime.Labels(job.Owner, job.Runner.Size).Inc(secondsAlive); + jobRunner = job.Runner; } + + // record event in DB + jobRunner.Lifecycle.Add(new() + { + Status = RunnerStatus.DeletionQueued, + EventTimeUtc = DateTime.UtcNow, + Event = $"Workflow Job {jobId} in repo {repoName} has completed." + }); + jobRunner.IsOnline = false; + await db.SaveChangesAsync(); + + // Sent to pool manager to delete + poolMgr.DeleteTasks.Enqueue(new DeleteRunnerTask + { + ServerId = jobRunner.CloudServerId, + RunnerDbId = jobRunner.RunnerId + }); + ProcessedJobCount.Labels(job.Owner, jobRunner.Size).Inc(); + + double secondsAlive = (DateTime.UtcNow - jobRunner.CreateTime).TotalSeconds; + TotalMachineTime.Labels(job.Owner, jobRunner.Size).Inc(secondsAlive); } private static async Task JobInProgress(JsonElement workflowJson, ILogger logger, long jobId, @@ -423,19 +435,7 @@ private static async Task JobInProgress(JsonElement workflowJson, ILogger x.Runner).FirstOrDefaultAsync(x => x.GithubJobId == jobId); - var runner = await db.Runners.Include(x => x.Job).Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.Hostname == runnerName); - runner.Job = job; - job.Runner = runner; - job.InProgressTime = DateTime.UtcNow; - runner.Lifecycle.Add(new() - { - Event = $"Runner got picked by job {jobId}", - Status = RunnerStatus.Processing, - EventTimeUtc = DateTime.UtcNow - }); - - await db.SaveChangesAsync(); + Runner runner = await db.LinkJobToRunner(jobId, runnerName); // Metrics PickedJobCount.Labels(orgName, runner.Size).Inc();