Skip to content

Commit

Permalink
fix deletion race condition and better metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
elasticroentgen committed Aug 6, 2024
1 parent f2826ff commit 2f7d011
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 17 deletions.
4 changes: 4 additions & 0 deletions CloudController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -179,4 +179,8 @@ public async Task<List<Server>> GetAllServersFromCsp()
return srvs;
}

/// <summary>
/// Asynchronously counts the servers reported by the cloud client.
/// </summary>
/// <returns>
/// The <c>Count</c> of the collection produced by awaiting <c>_client.Server.Get()</c>.
/// </returns>
/// <remarks>
/// NOTE(review): this presumably retrieves the full server list just to count it;
/// confirm the cost against the client library if this is called frequently.
/// </remarks>
public async Task<int> GetServerCountFromCsp()
{
    var servers = await _client.Server.Get();
    return servers.Count;
}
}
53 changes: 39 additions & 14 deletions PoolManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,17 @@ public class PoolManager : BackgroundService
private readonly CloudController _cc;
private readonly ILogger<PoolManager> _logger;
private static readonly Counter MachineCreatedCount = Metrics
.CreateCounter("github_machines_created", "Number of created machines", labelNames: ["org","size"]);
private static readonly Gauge QueueSize = Metrics
.CreateGauge("github_queue", "Number of queued runner tasks");
.CreateCounter("github_autoscaler_machines_created", "Number of created machines", labelNames: ["org","size"]);
private static readonly Gauge CreateQueueSize = Metrics
.CreateGauge("github_autoscaler_create_queue", "Number of queued runner create tasks");
private static readonly Gauge GithubRunnersGauge = Metrics
.CreateGauge("github_registered_runners", "Number of runners registered to github actions", labelNames: ["org", "status"]);
private static readonly Gauge DeleteQueueSize = Metrics
.CreateGauge("github_autoscaler_delete_queue", "Number of queued runner delete tasks");
private static readonly Gauge ProvisionQueueSize = Metrics
.CreateGauge("github_autoscaler_runners_provisioning", "Number of runners currently provisioning");
private static readonly Gauge CspRunnerCount = Metrics
.CreateGauge("github_autoscaler_csp_runners", "Number of runners currently on the CSP", labelNames: ["csp"]);

private readonly RunnerQueue _queues;

Expand Down Expand Up @@ -48,13 +54,17 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken)

DateTime crudeTimer = DateTime.UtcNow;
DateTime crudeStatsTimer = DateTime.UtcNow;
int cullMinutes = 3;
int cullMinutes = 5;
int statsSeconds = 10;

while (!stoppingToken.IsCancellationRequested)
{
// Grab some stats
QueueSize.Set(_queues.CreateTasks.Count + _queues.DeleteTasks.Count);
CreateQueueSize.Set(_queues.CreateTasks.Count);
DeleteQueueSize.Set(_queues.DeleteTasks.Count);
ProvisionQueueSize.Set(_queues.CreatedRunners.Count);
CspRunnerCount.Labels("htz").Set(await _cc.GetServerCountFromCsp());


if (DateTime.UtcNow - crudeStatsTimer > TimeSpan.FromSeconds(statsSeconds))
{
Expand Down Expand Up @@ -439,23 +449,38 @@ private async Task<bool> DeleteRunner(DeleteRunnerTask rt)

try
{
await _cc.DeleteRunner(rt.ServerId);
runner.IsOnline = false;
runner.Lifecycle.Add(new()
if (runner.Lifecycle.Any(x => x.Status == RunnerStatus.DeletionQueued))
{
Status = RunnerStatus.Deleted,
EventTimeUtc = DateTime.UtcNow,
Event = "Runner was successfully deleted from CSP"
});
runner.IsOnline = false;
runner.Lifecycle.Add(new()
{
Status = RunnerStatus.Deleted,
EventTimeUtc = DateTime.UtcNow,
Event = "Runner already queued for deletion."
});
}
else
{

await _cc.DeleteRunner(rt.ServerId);
runner.IsOnline = false;
runner.Lifecycle.Add(new()
{
Status = RunnerStatus.Deleted,
EventTimeUtc = DateTime.UtcNow,
Event = "Runner was successfully deleted from CSP"
});
}
await db.SaveChangesAsync();

return true;
}
catch (Exception ex)
{
_logger.LogError(
$"Unable to delete runner [{rt.ServerId} | Retry: {rt.RetryCount}]: {ex.Message}");
rt.RetryCount += 1;
if (rt.RetryCount < 10)
if (rt.RetryCount < 3)
{
_queues.DeleteTasks.Enqueue(rt);
runner.Lifecycle.Add(new RunnerLifecycle
Expand Down Expand Up @@ -522,7 +547,7 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
Event = $"Unable to create runner [{runner.Size} on {runner.Arch} | Retry: {rt.RetryCount}]: {ex.Message}"
});
rt.RetryCount += 1;
if (rt.RetryCount < 10)
if (rt.RetryCount < 3)
{
_queues.CreateTasks.Enqueue(rt);
}
Expand Down
6 changes: 3 additions & 3 deletions Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ public class Program

private static readonly Counter MachineFailedCount = Metrics
.CreateCounter("github_autoscaler_machine_failed", "Number of machines failed to provision",
labelNames: ["arch", "size"]);
labelNames: ["org", "size"]);

private static readonly Counter MachineSuccessCount = Metrics
.CreateCounter("github_autoscaler_machine_success", "Number of machines provisioned fine",
labelNames: ["arch", "size"]);
labelNames: ["org", "size"]);

private static readonly Counter TotalMachineTime = Metrics
.CreateCounter("github_total_machine_time", "Number of seconds machines were alive",
.CreateCounter("github_autoscaler_total_machine_time", "Number of seconds machines were alive",
labelNames: ["org", "size"]);

public static void Main(string[] args)
Expand Down

0 comments on commit 2f7d011

Please sign in to comment.