From a5b67eaecb36c2a3f13f23dc135c8e8738be2537 Mon Sep 17 00:00:00 2001
From: Matthew Johnson
Date: Thu, 29 Aug 2024 20:37:12 -0400
Subject: [PATCH] initial polling delay (#136)

System metrics publishing was occurring too close to the minute
boundary. Coupled with changes to the Atlas Aggregators, which added a
small amount of delay, this caused updates to drift across minute
boundaries, resulting in some minutes without updates and some minutes
with 2X value updates. This perturbation of the metrics triggered a
number of different alarms in the environment.

Given that spectatord publishes every 5 seconds with a small amount of
variability, and that most system metrics gather cycles complete in
around 100ms, we want the system agent to publish metrics between 10
and 50 seconds after the minute, to ensure the stability of system
metrics values.

This change adds a function which calculates the initial polling delay
that the service will apply at startup. Once that wait period is
complete, all subsequent updates should safely occur within the middle
of a minute.

The metric publishing log messages produced at info level by an older
build of the system agent were helpful in determining the cause of this
issue, so the log message is restored to info level, with some delay
timing data added.

Resolves #135.
---
 bin/atlas-agent.cc | 68 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 8 deletions(-)
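Reviewer note (not part of the commit): the delay calculation is plain
integer arithmetic on milliseconds since the epoch. Below is a minimal
standalone sketch of the same branch logic, using a hypothetical
polling_delay_for() helper with made-up timestamps in place of the
live clock, so the three cases are easy to exercise:

    #include <cstdio>

    // Same constants and branches as initial_polling_delay(), with the
    // timestamp passed in as a parameter instead of read from the clock.
    long polling_delay_for(long epoch_ms) {
      const long step_size = 60000;  // 60 seconds, atlas step size
      const long offset = 10000;     // 10 seconds, twice the spectatord publish interval

      long step_boundary = epoch_ms / step_size * step_size;  // previous minute boundary
      long delay = epoch_ms - step_boundary;                  // ms elapsed past that boundary

      if (delay < offset) return offset;                  // first 10s: push start to 10-20s in
      if (delay > step_size - offset) return 2 * offset;  // last 10s: push into the next minute
      return 0;                                           // already 10-50s in: no wait needed
    }

    int main() {
      printf("%ld\n", polling_delay_for(3000));   // :03 -> 10000ms wait, first publish at :13
      printf("%ld\n", polling_delay_for(30000));  // :30 -> 0ms wait, already in the safe window
      printf("%ld\n", polling_delay_for(55000));  // :55 -> 20000ms wait, first publish at :15
    }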
diff --git a/bin/atlas-agent.cc b/bin/atlas-agent.cc
index 5da976b..8bab0e9 100644
--- a/bin/atlas-agent.cc
+++ b/bin/atlas-agent.cc
@@ -19,6 +19,7 @@
 #include "spectator/registry.h"
 #include
 #include
+#include <fmt/chrono.h>
 #include
 
 using atlasagent::GetLogger;
@@ -55,7 +56,6 @@ static void gather_peak_titus_metrics(CGroup* cGroup) {
 }
 
 static void gather_slow_titus_metrics(CGroup* cGroup, Proc* proc, Disk* disk, Aws* aws) {
-  Logger()->debug("Gather Titus system metrics");
   aws->update_stats();
   cGroup->cpu_stats();
   cGroup->memory_stats_v2();
@@ -75,7 +75,6 @@ static void gather_scaling_metrics(CpuFreq* cpufreq) { cpufreq->Stats(); }
 
 static void gather_slow_system_metrics(Proc* proc, Disk* disk, Ethtool* ethtool, Ntp* ntp,
                                        PressureStall* pressureStall, Aws* aws) {
-  Logger()->debug("Gather EC2 system metrics");
   aws->update_stats();
   disk->disk_stats();
   ethtool->update_stats();
@@ -148,9 +147,34 @@ static void init_signals() {
   sigaction(SIGTERM, &sa, nullptr);
 }
 
+long initial_polling_delay() {
+  auto now = std::chrono::system_clock::now();
+  auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
+  auto epoch = now_ms.time_since_epoch();
+
+  long step_size = 60000;  // 60 seconds, atlas step size
+  long offset = 10000;  // 10 seconds, twice the spectatord publish interval
+
+  // previous boundary, by integer arithmetic
+  long step_boundary = epoch.count() / step_size * step_size;
+  long delay = epoch.count() - step_boundary;
+
+  Logger()->debug("epoch={} step_boundary={} delay={}", epoch.count(), step_boundary, delay);
+
+  if (delay < offset) {
+    return offset;
+  } else if (delay > step_size - offset) {
+    return 2 * offset;
+  } else {
+    return 0;
+  }
+}
+
 #if defined(TITUS_SYSTEM_SERVICE)
 void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvidia_lib,
                            spectator::Tags net_tags) {
+  using std::chrono::duration_cast;
+  using std::chrono::milliseconds;
   using std::chrono::seconds;
   using std::chrono::system_clock;
 
@@ -162,9 +186,17 @@ void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvid
 
   auto gpu = init_gpu(registry, std::move(nvidia_lib));
 
-  // the first call to this gather function takes >1 second, so it must
-  // be done before we start calculating times to wait for peak metrics
+  // initial polling delay, to prevent publishing too close to a minute boundary
+  auto delay = initial_polling_delay();
+  Logger()->info("Initial polling delay is {}ms", delay);
+  if (delay > 0) {
+    runner.wait_for(milliseconds(delay));
+  }
+
+  // the first call to this gather function takes ~100ms, so it must be
+  // done before we start calculating times to wait for peak metrics
   gather_slow_titus_metrics(&cGroup, &proc, &disk, &aws);
+  Logger()->info("Published slow Titus metrics (first iteration)");
 
   auto now = system_clock::now();
   auto next_run = now;
@@ -172,15 +204,20 @@ void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvid
   std::chrono::nanoseconds time_to_sleep;
 
   do {
+    auto start = system_clock::now();
     gather_peak_titus_metrics(&cGroup);
-    if (system_clock::now() >= next_slow_run) {
+
+    if (start >= next_slow_run) {
       gather_slow_titus_metrics(&cGroup, &proc, &disk, &aws);
       perf_metrics.collect();
       if (gpu) {
         gpu->gpu_metrics();
       }
+      auto elapsed = duration_cast<milliseconds>(system_clock::now() - start);
+      Logger()->info("Published Titus metrics (delay={})", elapsed);
       next_slow_run += seconds(60);
     }
+
     next_run += seconds(1);
     time_to_sleep = next_run - system_clock::now();
   } while (runner.wait_for(time_to_sleep));
@@ -188,6 +225,8 @@ void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvid
 #else
 void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvidia_lib,
                             spectator::Tags net_tags) {
+  using std::chrono::duration_cast;
+  using std::chrono::milliseconds;
   using std::chrono::seconds;
   using std::chrono::system_clock;
 
@@ -202,9 +241,17 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvi
 
   auto gpu = init_gpu(registry, std::move(nvidia_lib));
 
-  // the first call to this gather function takes >1 second, so it must
-  // be done before we start calculating times to wait for peak metrics
+  // initial polling delay, to prevent publishing too close to a minute boundary
+  auto delay = initial_polling_delay();
+  Logger()->info("Initial polling delay is {}ms", delay);
+  if (delay > 0) {
+    runner.wait_for(milliseconds(delay));
+  }
+
+  // the first call to this gather function takes ~100ms, so it must be
+  // done before we start calculating times to wait for peak metrics
   gather_slow_system_metrics(&proc, &disk, &ethtool, &ntp, &pressureStall, &aws);
+  Logger()->info("Published slow system metrics (first iteration)");
 
   auto now = system_clock::now();
   auto next_run = now;
@@ -212,16 +259,21 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvi
   std::chrono::nanoseconds time_to_sleep;
 
   do {
+    auto start = system_clock::now();
     gather_peak_system_metrics(&proc);
     gather_scaling_metrics(&cpufreq);
-    if (system_clock::now() >= next_slow_run) {
+
+    if (start >= next_slow_run) {
       gather_slow_system_metrics(&proc, &disk, &ethtool, &ntp, &pressureStall, &aws);
       perf_metrics.collect();
       if (gpu) {
         gpu->gpu_metrics();
       }
+      auto elapsed = duration_cast<milliseconds>(system_clock::now() - start);
+      Logger()->info("Published system metrics (delay={})", elapsed);
       next_slow_run += seconds(60);
     }
+
     next_run += seconds(1);
     time_to_sleep = next_run - system_clock::now();
   } while (runner.wait_for(time_to_sleep));
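
Aside for reviewers (not part of the commit): the restored info-level
log lines pass `elapsed`, a std::chrono::milliseconds value, straight
to the logger. Assuming Logger() is backed by spdlog/fmt, as elsewhere
in the agent, this renders with a unit suffix once fmt's chrono support
is included (hence the <fmt/chrono.h> include above). A minimal sketch:

    #include <fmt/chrono.h>

    #include <chrono>

    int main() {
      auto elapsed = std::chrono::milliseconds{142};
      // with <fmt/chrono.h>, fmt renders durations with their unit suffix
      fmt::print("Published system metrics (delay={})\n", elapsed);  // delay=142ms
    }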