From 7df9cd2e70fc6d6053cb727484abdaee8cae5537 Mon Sep 17 00:00:00 2001 From: Markus Keil Date: Tue, 6 Aug 2024 16:13:42 +0200 Subject: [PATCH] better metrics --- PoolManager.cs | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/PoolManager.cs b/PoolManager.cs index ed585a8..c142855 100644 --- a/PoolManager.cs +++ b/PoolManager.cs @@ -23,6 +23,14 @@ public class PoolManager : BackgroundService .CreateGauge("github_autoscaler_runners_provisioning", "Number of runners currently provisioning"); private static readonly Gauge CspRunnerCount = Metrics .CreateGauge("github_autoscaler_csp_runners", "Number of runners currently on the CSP", labelNames: ["csp"]); + private static readonly Gauge StuckJobsCount = Metrics + .CreateGauge("github_autoscaler_job_stuck", "Number of jobs not picked up after 15min"); + private static readonly Gauge QueuedJobsCount = Metrics + .CreateGauge("github_autoscaler_job_queued", "Total Number of jobs queued"); + private static readonly Gauge CompletedJobsCount = Metrics + .CreateGauge("github_autoscaler_job_completed", "Total Number of jobs completed"); + private static readonly Gauge InProgressJobsCount = Metrics + .CreateGauge("github_autoscaler_job_inprogress", "Total Number of jobs inprogress"); private readonly RunnerQueue _queues; @@ -59,15 +67,11 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) while (!stoppingToken.IsCancellationRequested) { - // Grab some stats - CreateQueueSize.Set(_queues.CreateTasks.Count); - DeleteQueueSize.Set(_queues.DeleteTasks.Count); - ProvisionQueueSize.Set(_queues.CreatedRunners.Count); - CspRunnerCount.Labels("htz").Set(await _cc.GetServerCountFromCsp()); if (DateTime.UtcNow - crudeStatsTimer > TimeSpan.FromSeconds(statsSeconds)) { + // Grab some stats await ProcessStats(targetConfig); crudeStatsTimer = DateTime.UtcNow; } @@ -122,6 +126,26 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) private 
async Task ProcessStats(List targetConfig) { + CreateQueueSize.Set(_queues.CreateTasks.Count); + DeleteQueueSize.Set(_queues.DeleteTasks.Count); + ProvisionQueueSize.Set(_queues.CreatedRunners.Count); + CspRunnerCount.Labels("htz").Set(await _cc.GetServerCountFromCsp()); + + // Grab job state counts (the context is disposed when ProcessStats returns) + using var db = new ActionsRunnerContext(); + var stuckTime = DateTime.UtcNow - TimeSpan.FromMinutes(15); + var stuckJobs = await db.Jobs.CountAsync(x => x.State == JobState.Queued && x.RunnerId == null && x.QueueTime < stuckTime); + StuckJobsCount.Set(stuckJobs); + + var jobsByState = await db.Jobs.GroupBy(x => x.State).Select(x => new { x.Key, Count = x.Count() }).ToListAsync(); + // A state with no jobs produces no group, so report 0 instead of dereferencing null + QueuedJobsCount.Set(jobsByState.FirstOrDefault(x => x.Key == JobState.Queued)?.Count ?? 0); + CompletedJobsCount.Set(jobsByState.FirstOrDefault(x => x.Key == JobState.Completed)?.Count ?? 0); + InProgressJobsCount.Set(jobsByState.FirstOrDefault(x => x.Key == JobState.InProgress)?.Count ?? 0); + + // grab runner state counts + + // Github runner stats try { foreach (GithubTargetConfiguration tgt in targetConfig)