From a5b67eaecb36c2a3f13f23dc135c8e8738be2537 Mon Sep 17 00:00:00 2001
From: Matthew Johnson
Date: Thu, 29 Aug 2024 20:37:12 -0400
Subject: [PATCH] initial polling delay (#136)

System metrics publishing was occurring too close to the minute
boundary. Coupled with changes to the Atlas Aggregators, which added a
small amount of delay, this caused updates to drift across minute
boundaries, resulting in some minutes without updates and some minutes
with 2X value updates. This perturbation of the metrics triggered a
number of different alarms in the environment.

Given that spectatord publishes every 5 seconds with a small amount of
variability, and that most system metrics gather cycles complete in
around 100ms, we want the system agent to publish metrics between 10
and 50 seconds after the minute, to ensure the stability of system
metrics values.

This change adds a function which calculates the initial polling delay
that the service will apply at startup. Once that wait period is
complete, all subsequent updates should safely occur within the middle
of a minute.

The metric publishing log messages produced at info level by an older
build of the system agent were helpful in determining the cause of this
issue, so the log message is restored to info level, with some delay
timing data added.

Resolves #135.
---
 bin/atlas-agent.cc | 68 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 8 deletions(-)
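Reviewer note (not part of the commit): the delay calculation is plain
integer arithmetic on milliseconds since the epoch. Below is a minimal
standalone sketch of the same branch logic, using a hypothetical
polling_delay_for() helper with made-up timestamps in place of the
live clock, so the three cases are easy to exercise:

    #include <cstdio>

    // Same constants and branches as initial_polling_delay(), with the
    // timestamp passed in as a parameter instead of read from the clock.
    long polling_delay_for(long epoch_ms) {
      const long step_size = 60000;  // 60 seconds, atlas step size
      const long offset = 10000;     // 10 seconds, twice the spectatord publish interval

      long step_boundary = epoch_ms / step_size * step_size;  // previous minute boundary
      long delay = epoch_ms - step_boundary;                  // ms elapsed past that boundary

      if (delay < offset) return offset;                  // first 10s: push start to 10-20s in
      if (delay > step_size - offset) return 2 * offset;  // last 10s: push into the next minute
      return 0;                                           // already 10-50s in: no wait needed
    }

    int main() {
      printf("%ld\n", polling_delay_for(3000));   // :03 -> 10000ms wait, first publish at :13
      printf("%ld\n", polling_delay_for(30000));  // :30 -> 0ms wait, already in the safe window
      printf("%ld\n", polling_delay_for(55000));  // :55 -> 20000ms wait, first publish at :15
    }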
diff --git a/bin/atlas-agent.cc b/bin/atlas-agent.cc
index 5da976b..8bab0e9 100644
--- a/bin/atlas-agent.cc
+++ b/bin/atlas-agent.cc
@@ -19,6 +19,7 @@
 #include "spectator/registry.h"
 #include
 #include
+#include <fmt/chrono.h>
 #include
 
 using atlasagent::GetLogger;
@@ -55,7 +56,6 @@ static void gather_peak_titus_metrics(CGroup* cGroup) {
 }
 
 static void gather_slow_titus_metrics(CGroup* cGroup, Proc* proc, Disk* disk, Aws* aws) {
-  Logger()->debug("Gather Titus system metrics");
   aws->update_stats();
   cGroup->cpu_stats();
   cGroup->memory_stats_v2();
@@ -75,7 +75,6 @@ static void gather_scaling_metrics(CpuFreq* cpufreq) { cpufreq->Stats(); }
 
 static void gather_slow_system_metrics(Proc* proc, Disk* disk, Ethtool* ethtool, Ntp* ntp,
                                        PressureStall* pressureStall, Aws* aws) {
-  Logger()->debug("Gather EC2 system metrics");
   aws->update_stats();
   disk->disk_stats();
   ethtool->update_stats();
@@ -148,9 +147,34 @@ static void init_signals() {
   sigaction(SIGTERM, &sa, nullptr);
 }
 
+long initial_polling_delay() {
+  auto now = std::chrono::system_clock::now();
+  auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
+  auto epoch = now_ms.time_since_epoch();
+
+  long step_size = 60000;  // 60 seconds, atlas step size
+  long offset = 10000;  // 10 seconds, twice the spectatord publish interval
+
+  // previous boundary, by integer arithmetic
+  long step_boundary = epoch.count() / step_size * step_size;
+  long delay = epoch.count() - step_boundary;
+
+  Logger()->debug("epoch={} step_boundary={} delay={}", epoch.count(), step_boundary, delay);
+
+  if (delay < offset) {
+    return offset;
+  } else if (delay > step_size - offset) {
+    return 2 * offset;
+  } else {
+    return 0;
+  }
+}
+
 #if defined(TITUS_SYSTEM_SERVICE)
 void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvidia_lib,
                            spectator::Tags net_tags) {
+  using std::chrono::duration_cast;
+  using std::chrono::milliseconds;
   using std::chrono::seconds;
   using std::chrono::system_clock;
 
@@ -162,9 +186,17 @@ void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvid
 
   auto gpu = init_gpu(registry, std::move(nvidia_lib));
 
-  // the first call to this gather function takes >1 second, so it must
-  // be done before we start calculating times to wait for peak metrics
+  // initial polling delay, to prevent publishing too close to a minute boundary
+  auto delay = initial_polling_delay();
+  Logger()->info("Initial polling delay is {}ms", delay);
+  if (delay > 0) {
+    runner.wait_for(milliseconds(delay));
+  }
+
+  // the first call to this gather function takes ~100ms, so it must be
+  // done before we start calculating times to wait for peak metrics
   gather_slow_titus_metrics(&cGroup, &proc, &disk, &aws);
+  Logger()->info("Published slow Titus metrics (first iteration)");
 
   auto now = system_clock::now();
   auto next_run = now;
@@ -172,15 +204,20 @@ void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvid
   std::chrono::nanoseconds time_to_sleep;
 
   do {
+    auto start = system_clock::now();
     gather_peak_titus_metrics(&cGroup);
-    if (system_clock::now() >= next_slow_run) {
+
+    if (start >= next_slow_run) {
       gather_slow_titus_metrics(&cGroup, &proc, &disk, &aws);
       perf_metrics.collect();
       if (gpu) {
         gpu->gpu_metrics();
       }
+      auto elapsed = duration_cast<milliseconds>(system_clock::now() - start);
+      Logger()->info("Published Titus metrics (delay={})", elapsed);
       next_slow_run += seconds(60);
     }
+
     next_run += seconds(1);
     time_to_sleep = next_run - system_clock::now();
   } while (runner.wait_for(time_to_sleep));
@@ -188,6 +225,8 @@ void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvid
 #else
 void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvidia_lib,
                             spectator::Tags net_tags) {
+  using std::chrono::duration_cast;
+  using std::chrono::milliseconds;
   using std::chrono::seconds;
   using std::chrono::system_clock;
 
@@ -202,9 +241,17 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvi
 
   auto gpu = init_gpu(registry, std::move(nvidia_lib));
 
-  // the first call to this gather function takes >1 second, so it must
-  // be done before we start calculating times to wait for peak metrics
+  // initial polling delay, to prevent publishing too close to a minute boundary
+  auto delay = initial_polling_delay();
+  Logger()->info("Initial polling delay is {}ms", delay);
+  if (delay > 0) {
+    runner.wait_for(milliseconds(delay));
+  }
+
+  // the first call to this gather function takes ~100ms, so it must be
+  // done before we start calculating times to wait for peak metrics
   gather_slow_system_metrics(&proc, &disk, &ethtool, &ntp, &pressureStall, &aws);
+  Logger()->info("Published slow system metrics (first iteration)");
 
   auto now = system_clock::now();
   auto next_run = now;
@@ -212,16 +259,21 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvi
   std::chrono::nanoseconds time_to_sleep;
 
   do {
+    auto start = system_clock::now();
     gather_peak_system_metrics(&proc);
     gather_scaling_metrics(&cpufreq);
-    if (system_clock::now() >= next_slow_run) {
+
+    if (start >= next_slow_run) {
       gather_slow_system_metrics(&proc, &disk, &ethtool, &ntp, &pressureStall, &aws);
       perf_metrics.collect();
       if (gpu) {
         gpu->gpu_metrics();
       }
+      auto elapsed = duration_cast<milliseconds>(system_clock::now() - start);
+      Logger()->info("Published system metrics (delay={})", elapsed);
       next_slow_run += seconds(60);
     }
+
     next_run += seconds(1);
     time_to_sleep = next_run - system_clock::now();
   } while (runner.wait_for(time_to_sleep));
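
Aside for reviewers (not part of the commit): the restored info-level
log lines pass `elapsed`, a std::chrono::milliseconds value, straight
to the logger. Assuming Logger() is backed by spdlog/fmt, as elsewhere
in the agent, this renders with a unit suffix once fmt's chrono support
is included (hence the <fmt/chrono.h> include above). A minimal sketch:

    #include <fmt/chrono.h>

    #include <chrono>

    int main() {
      auto elapsed = std::chrono::milliseconds{142};
      // with <fmt/chrono.h>, fmt renders durations with their unit suffix
      fmt::print("Published system metrics (delay={})\n", elapsed);  // delay=142ms
    }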