From 9c63003aaab03a4f4b53a996b6427b35c7c18fd2 Mon Sep 17 00:00:00 2001 From: Markus Keil Date: Fri, 25 Oct 2024 09:33:41 +0200 Subject: [PATCH] make stuck jobs and the HTZ interface more resilient --- .gitignore | 2 + Database/DbContext.cs | 2 + PoolManager.cs | 103 ++++++++++++++++++--------------- Program.cs | 3 + Properties/launchSettings.json | 4 +- 5 files changed, 65 insertions(+), 49 deletions(-) diff --git a/.gitignore b/.gitignore index 39c9242..0148ec5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.DS_Store +*.user bin/ obj/ /packages/ diff --git a/Database/DbContext.cs b/Database/DbContext.cs index e028f7a..6ea8718 100644 --- a/Database/DbContext.cs +++ b/Database/DbContext.cs @@ -106,6 +106,8 @@ public DateTime LastStateTime return Lifecycle.MaxBy(x => x.EventTimeUtc).EventTimeUtc; } } + + public bool StuckJobReplacement { get; set; } = false; } public enum RunnerStatus diff --git a/PoolManager.cs b/PoolManager.cs index 56668c2..1a44e86 100644 --- a/PoolManager.cs +++ b/PoolManager.cs @@ -50,9 +50,6 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) List targetConfig = Program.Config.TargetConfigs; - // Cull runners - List allHtzSrvs = await _cc.GetAllServersFromCsp(); - await CleanUpRunners(targetConfig); await StartPoolRunners(targetConfig); _logger.LogInformation("Poolmanager init done."); @@ -297,7 +294,9 @@ private async Task CheckForStuckJobs(List targetConfi Arch = arch, IPv4 = string.Empty, IsCustom = profile != "default", - Owner = stuckJob.Owner + Owner = stuckJob.Owner, + StuckJobReplacement = true + }; await db.Runners.AddAsync(newRunner); await db.SaveChangesAsync(); @@ -422,55 +421,64 @@ private async Task CleanUpRunners(List targetConfigs) } // Remove every VM that's not in the github registered runners - List remainingHtzServer = await _cc.GetAllServersFromCsp(); - foreach (Server htzSrv in remainingHtzServer) + try { - if (registeredServerNames.Contains(htzSrv.Name)) + List remainingHtzServer = 
await _cc.GetAllServersFromCsp(); + foreach (Server htzSrv in remainingHtzServer) { - // If we know the server in github, skip - continue; - } - _logger.LogInformation($"{htzSrv.Name} is a candidate to be killed from Hetzner"); + if (registeredServerNames.Contains(htzSrv.Name)) + { + // If we know the server in github, skip + continue; + } - var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.CloudServerId == htzSrv.Id); - if (runner == null) - { - _logger.LogInformation($"{htzSrv.Name} is not found in the database"); - continue; - } - if (runner.Lifecycle.Any(x => x.Status == RunnerStatus.DeletionQueued)) - { - runner.Lifecycle.Add(new() + _logger.LogInformation($"{htzSrv.Name} is a candidate to be killed from Hetzner"); + + var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.CloudServerId == htzSrv.Id); + if (runner == null) { - Status = RunnerStatus.DeletionQueued, - Event = "Don't queue deletion due to Github registration. 
Runner already queued for deletion.", - EventTimeUtc = DateTime.UtcNow - }); - await db.SaveChangesAsync(); - - } - else if ((runner.LastState >= RunnerStatus.Provisioned && DateTime.UtcNow - runner.LastStateTime > TimeSpan.FromMinutes(5)) || - (runner.LastState != RunnerStatus.Processing && DateTime.UtcNow - htzSrv.Created.ToUniversalTime() > TimeSpan.FromMinutes(40))) - { - _logger.LogInformation($"Removing VM that is not in any GitHub registration: {htzSrv.Name} created at {htzSrv.Created:u}"); - runner.IsOnline = false; - runner.Lifecycle.Add(new() + _logger.LogInformation($"{htzSrv.Name} is not found in the database"); + continue; + } + + if (runner.Lifecycle.Any(x => x.Status == RunnerStatus.DeletionQueued)) { - Status = RunnerStatus.DeletionQueued, - Event = "Removing as VM not longer in any GitHub registration", - EventTimeUtc = DateTime.UtcNow - }); - await db.SaveChangesAsync(); - _queues.DeleteTasks.Enqueue(new() + runner.Lifecycle.Add(new() + { + Status = RunnerStatus.DeletionQueued, + Event = "Don't queue deletion due to Github registration. 
Runner already queued for deletion.", + EventTimeUtc = DateTime.UtcNow + }); + await db.SaveChangesAsync(); + + } + else if ((runner.LastState >= RunnerStatus.Provisioned && DateTime.UtcNow - runner.LastStateTime > TimeSpan.FromMinutes(5)) || + (runner.LastState != RunnerStatus.Processing && DateTime.UtcNow - htzSrv.Created.ToUniversalTime() > TimeSpan.FromMinutes(40))) { - RunnerDbId = runner.RunnerId, - ServerId = htzSrv.Id - }); - + _logger.LogInformation($"Removing VM that is not in any GitHub registration: {htzSrv.Name} created at {htzSrv.Created:u}"); + runner.IsOnline = false; + runner.Lifecycle.Add(new() + { + Status = RunnerStatus.DeletionQueued, + Event = "Removing as VM not longer in any GitHub registration", + EventTimeUtc = DateTime.UtcNow + }); + await db.SaveChangesAsync(); + _queues.DeleteTasks.Enqueue(new() + { + RunnerDbId = runner.RunnerId, + ServerId = htzSrv.Id + }); + + } + } - } - + catch (Exception ex) + { + _logger.LogError($"Failed during cleanup from CSP: {ex.Message}"); + } + } private async Task DeleteRunner(DeleteRunnerTask rt) @@ -564,13 +572,14 @@ private async Task CreateRunner(CreateRunnerTask rt) Event = $"Unable to create runner [{runner.Size} on {runner.Arch} | Retry: {rt.RetryCount}]: {ex.Message}" }); rt.RetryCount += 1; - if (rt.RetryCount < 3) + // Don't retry stuck job runners - the stuck job detector will create retry servers + if (rt.RetryCount < 3 && !runner.StuckJobReplacement) { _queues.CreateTasks.Enqueue(rt); } else { - _logger.LogError($"Retries exceeded for {runner.Size} on {runner.Arch}. giving up."); + _logger.LogError(runner.StuckJobReplacement ? $"Retries exceeded for {runner.Size} on {runner.Arch}. giving up. (Stuck job replacement)" : $"Retries exceeded for {runner.Size} on {runner.Arch}. 
giving up."); runner.Lifecycle.Add(new RunnerLifecycle { Status = RunnerStatus.Failure, diff --git a/Program.cs b/Program.cs index cff2cc0..ffcbc71 100644 --- a/Program.cs +++ b/Program.cs @@ -76,6 +76,7 @@ public static void Main(string[] args) Log.Error($"Hetzner cloud token not set in {configPath}"); return; } + Log.Information($"Loaded {Config.TargetConfigs.Count} targets and {Config.Sizes.Count} sizes."); @@ -111,6 +112,7 @@ public static void Main(string[] args) builder.Services.AddHostedService(); // Add services to the container. + // Learn more about configuring Swagger/OpenAPI at https://aka.ms/aspnetcore/swashbuckle builder.Services.AddCors(options => @@ -490,6 +492,7 @@ await db.Jobs.AddAsync(new Job double secondsAlive = (DateTime.UtcNow - jobRunner.CreateTime).TotalSeconds; TotalMachineTime.Labels(job.Owner, jobRunner.Size).Inc(secondsAlive); + } private static async Task JobInProgress(JsonElement workflowJson, ILogger logger, long jobId, diff --git a/Properties/launchSettings.json b/Properties/launchSettings.json index c50ea9f..9721902 100644 --- a/Properties/launchSettings.json +++ b/Properties/launchSettings.json @@ -17,8 +17,8 @@ "applicationUrl": "http://localhost:5178", "environmentVariables": { "ASPNETCORE_ENVIRONMENT": "Development", - "CONFIG_DIR": "/Users/markuskeil/dev/tmp", - "PERSIST_DIR": "/Users/markuskeil/dev/tmp" + "CONFIG_DIR": "/Users/markus/dev/tmp", + "PERSIST_DIR": "/Users/markus/dev/tmp" } }, "https": {