Skip to content

Commit

Permalink
make stuck jobs and the HTZ interface more resilient
Browse files Browse the repository at this point in the history
  • Loading branch information
elasticroentgen committed Oct 25, 2024
1 parent 8f069f6 commit 9c63003
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 49 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.DS_Store
*.user
bin/
obj/
/packages/
Expand Down
2 changes: 2 additions & 0 deletions Database/DbContext.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ public DateTime LastStateTime
return Lifecycle.MaxBy(x => x.EventTimeUtc).EventTimeUtc;
}
}

/// <summary>
/// Flags a runner that was created as a replacement for a stuck job.
/// Defaults to <c>false</c>; the stuck-job check sets it to <c>true</c> on the
/// runner it adds, and runner creation skips its own retry for flagged runners.
/// </summary>
public bool StuckJobReplacement { get; set; }
}

public enum RunnerStatus
Expand Down
103 changes: 56 additions & 47 deletions PoolManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,6 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken)

List<GithubTargetConfiguration> targetConfig = Program.Config.TargetConfigs;

// Cull runners
List<Server> allHtzSrvs = await _cc.GetAllServersFromCsp();

await CleanUpRunners(targetConfig);
await StartPoolRunners(targetConfig);
_logger.LogInformation("Poolmanager init done.");
Expand Down Expand Up @@ -297,7 +294,9 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
Arch = arch,
IPv4 = string.Empty,
IsCustom = profile != "default",
Owner = stuckJob.Owner
Owner = stuckJob.Owner,
StuckJobReplacement = true

};
await db.Runners.AddAsync(newRunner);
await db.SaveChangesAsync();
Expand Down Expand Up @@ -422,55 +421,64 @@ private async Task CleanUpRunners(List<GithubTargetConfiguration> targetConfigs)
}

// Remove every VM that's not in the github registered runners
List<Server> remainingHtzServer = await _cc.GetAllServersFromCsp();
foreach (Server htzSrv in remainingHtzServer)
try
{
if (registeredServerNames.Contains(htzSrv.Name))
List<Server> remainingHtzServer = await _cc.GetAllServersFromCsp();
foreach (Server htzSrv in remainingHtzServer)
{
// If we know the server in github, skip
continue;
}
_logger.LogInformation($"{htzSrv.Name} is a candidate to be killed from Hetzner");
if (registeredServerNames.Contains(htzSrv.Name))
{
// If we know the server in github, skip
continue;
}

var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.CloudServerId == htzSrv.Id);
if (runner == null)
{
_logger.LogInformation($"{htzSrv.Name} is not found in the database");
continue;
}
if (runner.Lifecycle.Any(x => x.Status == RunnerStatus.DeletionQueued))
{
runner.Lifecycle.Add(new()
_logger.LogInformation($"{htzSrv.Name} is a candidate to be killed from Hetzner");

var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.CloudServerId == htzSrv.Id);
if (runner == null)
{
Status = RunnerStatus.DeletionQueued,
Event = "Don't queue deletion due to Github registration. Runner already queued for deletion.",
EventTimeUtc = DateTime.UtcNow
});
await db.SaveChangesAsync();

}
else if ((runner.LastState >= RunnerStatus.Provisioned && DateTime.UtcNow - runner.LastStateTime > TimeSpan.FromMinutes(5)) ||
(runner.LastState != RunnerStatus.Processing && DateTime.UtcNow - htzSrv.Created.ToUniversalTime() > TimeSpan.FromMinutes(40)))
{
_logger.LogInformation($"Removing VM that is not in any GitHub registration: {htzSrv.Name} created at {htzSrv.Created:u}");
runner.IsOnline = false;
runner.Lifecycle.Add(new()
_logger.LogInformation($"{htzSrv.Name} is not found in the database");
continue;
}

if (runner.Lifecycle.Any(x => x.Status == RunnerStatus.DeletionQueued))
{
Status = RunnerStatus.DeletionQueued,
Event = "Removing as VM not longer in any GitHub registration",
EventTimeUtc = DateTime.UtcNow
});
await db.SaveChangesAsync();
_queues.DeleteTasks.Enqueue(new()
runner.Lifecycle.Add(new()
{
Status = RunnerStatus.DeletionQueued,
Event = "Don't queue deletion due to Github registration. Runner already queued for deletion.",
EventTimeUtc = DateTime.UtcNow
});
await db.SaveChangesAsync();

}
else if ((runner.LastState >= RunnerStatus.Provisioned && DateTime.UtcNow - runner.LastStateTime > TimeSpan.FromMinutes(5)) ||
(runner.LastState != RunnerStatus.Processing && DateTime.UtcNow - htzSrv.Created.ToUniversalTime() > TimeSpan.FromMinutes(40)))
{
RunnerDbId = runner.RunnerId,
ServerId = htzSrv.Id
});

_logger.LogInformation($"Removing VM that is not in any GitHub registration: {htzSrv.Name} created at {htzSrv.Created:u}");
runner.IsOnline = false;
runner.Lifecycle.Add(new()
{
Status = RunnerStatus.DeletionQueued,
Event = "Removing as VM not longer in any GitHub registration",
EventTimeUtc = DateTime.UtcNow
});
await db.SaveChangesAsync();
_queues.DeleteTasks.Enqueue(new()
{
RunnerDbId = runner.RunnerId,
ServerId = htzSrv.Id
});

}

}

}

catch (Exception ex)
{
_logger.LogError($"Failed during cleanup from CSP: {ex.Message}");
}

}

private async Task<bool> DeleteRunner(DeleteRunnerTask rt)
Expand Down Expand Up @@ -564,13 +572,14 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
Event = $"Unable to create runner [{runner.Size} on {runner.Arch} | Retry: {rt.RetryCount}]: {ex.Message}"
});
rt.RetryCount += 1;
if (rt.RetryCount < 3)
// Don't retry stuck job runners - the stuck job detector will create retry servers
if (rt.RetryCount < 3 && !runner.StuckJobReplacement)
{
_queues.CreateTasks.Enqueue(rt);
}
else
{
_logger.LogError($"Retries exceeded for {runner.Size} on {runner.Arch}. giving up.");
_logger.LogError(runner.StuckJobReplacement ? $"Retries exceeded for {runner.Size} on {runner.Arch}. giving up. (Stuck job replacement)" : $"Retries exceeded for {runner.Size} on {runner.Arch}. giving up.");
runner.Lifecycle.Add(new RunnerLifecycle
{
Status = RunnerStatus.Failure,
Expand Down
3 changes: 3 additions & 0 deletions Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ public static void Main(string[] args)
Log.Error($"Hetzner cloud token not set in {configPath}");
return;
}


Log.Information($"Loaded {Config.TargetConfigs.Count} targets and {Config.Sizes.Count} sizes.");

Expand Down Expand Up @@ -111,6 +112,7 @@ public static void Main(string[] args)
builder.Services.AddHostedService<PoolManager>();

// Add services to the container.


// Learn more about configuring Swagger/OpenAPI at https://aka.ms/aspnetcore/swashbuckle
builder.Services.AddCors(options =>
Expand Down Expand Up @@ -490,6 +492,7 @@ await db.Jobs.AddAsync(new Job

double secondsAlive = (DateTime.UtcNow - jobRunner.CreateTime).TotalSeconds;
TotalMachineTime.Labels(job.Owner, jobRunner.Size).Inc(secondsAlive);

}

private static async Task JobInProgress(JsonElement workflowJson, ILogger<Program> logger, long jobId,
Expand Down
4 changes: 2 additions & 2 deletions Properties/launchSettings.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
"applicationUrl": "http://localhost:5178",
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development",
"CONFIG_DIR": "/Users/markuskeil/dev/tmp",
"PERSIST_DIR": "/Users/markuskeil/dev/tmp"
"CONFIG_DIR": "/Users/markus/dev/tmp",
"PERSIST_DIR": "/Users/markus/dev/tmp"
}
},
"https": {
Expand Down

0 comments on commit 9c63003

Please sign in to comment.