initial polling delay (#136)
System metrics publishing was occurring too close to the minute boundary.
Coupled with some changes to the Atlas Aggregators, which added a small amount
of delay, this caused updates to drift between minutes. The result was some
minutes without updates and some minutes with 2X values, and this perturbation
of metrics triggered a number of different alarms in the environment.

Given that spectatord publishes every 5 seconds with a small amount of
variability, and that most system metrics gather cycles complete in around
100ms, we want the system agent to publish metrics between 10 and 50 seconds
after the minute boundary, to ensure the stability of system metrics values.

This change adds a function that calculates the initial polling delay the
service should apply at startup. Once that wait period is complete, all
subsequent updates should safely occur within the middle of a minute.
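
As a rough illustration of the boundary arithmetic (a standalone sketch with
a hypothetical helper name and sample values, not code from this commit):

    // illustrative helper; mirrors the logic of initial_polling_delay()
    long polling_delay_for(long epoch_ms) {
      long step_size = 60000;  // 60 seconds, atlas step size
      long offset = 10000;     // 10 seconds, twice the spectatord publish interval
      long delay = epoch_ms % step_size;                  // ms past the previous minute boundary
      if (delay < offset) return offset;                  // first 10s of the minute: wait 10s
      if (delay > step_size - offset) return 2 * offset;  // last 10s of the minute: wait 20s
      return 0;                                           // already 10-50s past the boundary
    }

Starting at 5s past the minute yields a 10s delay (landing at 15s past the
minute), starting at 55s yields a 20s delay (landing at 15s past the next
minute), and starting at 30s yields no delay at all.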

The metric publishing log messages produced at info level by an older build
of the system agent were helpful in determining the cause of this issue, so
these log messages are restored to info level, with delay timing data added.
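
With fmt/chrono included, the elapsed duration renders with its unit, so the
restored message looks like the following (illustrative value, not captured
output):

    Published system metrics (delay=102ms)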

Resolves #135.
copperlight authored Aug 30, 2024
1 parent caad3c7 commit a5b67ea
Showing 1 changed file with 60 additions and 8 deletions.

bin/atlas-agent.cc
@@ -19,6 +19,7 @@
#include "spectator/registry.h"
#include <condition_variable>
#include <csignal>
#include <fmt/chrono.h>
#include <getopt.h>

using atlasagent::GetLogger;
@@ -55,7 +56,6 @@ static void gather_peak_titus_metrics(CGroup* cGroup) {
}

static void gather_slow_titus_metrics(CGroup* cGroup, Proc* proc, Disk* disk, Aws* aws) {
Logger()->debug("Gather Titus system metrics");
aws->update_stats();
cGroup->cpu_stats();
cGroup->memory_stats_v2();
@@ -75,7 +75,6 @@ static void gather_scaling_metrics(CpuFreq* cpufreq) { cpufreq->Stats(); }

static void gather_slow_system_metrics(Proc* proc, Disk* disk, Ethtool* ethtool, Ntp* ntp,
                                       PressureStall* pressureStall, Aws* aws) {
  Logger()->debug("Gather EC2 system metrics");
  aws->update_stats();
  disk->disk_stats();
  ethtool->update_stats();
@@ -148,9 +147,34 @@ static void init_signals() {
  sigaction(SIGTERM, &sa, nullptr);
}

long initial_polling_delay() {
  auto now = std::chrono::system_clock::now();
  auto now_ms = std::chrono::time_point_cast<std::chrono::milliseconds>(now);
  auto epoch = now_ms.time_since_epoch();

  long step_size = 60000;  // 60 seconds, atlas step size
  long offset = 10000;     // 10 seconds, twice the spectatord publish interval

  // previous boundary, by integer arithmetic
  long step_boundary = epoch.count() / step_size * step_size;
  long delay = epoch.count() - step_boundary;

  Logger()->debug("epoch={} step_boundary={} delay={}", epoch.count(), step_boundary, delay);

  if (delay < offset) {
    return offset;
  } else if (delay > step_size - offset) {
    return 2 * offset;
  } else {
    return 0;
  }
}

#if defined(TITUS_SYSTEM_SERVICE)
void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvidia_lib,
                           spectator::Tags net_tags) {
  using std::chrono::duration_cast;
  using std::chrono::milliseconds;
  using std::chrono::seconds;
  using std::chrono::system_clock;

@@ -162,32 +186,47 @@ void collect_titus_metrics(TaggingRegistry* registry, std::unique_ptr<Nvml> nvid

  auto gpu = init_gpu(registry, std::move(nvidia_lib));

  // the first call to this gather function takes >1 second, so it must
  // be done before we start calculating times to wait for peak metrics
  // initial polling delay, to prevent publishing too close to a minute boundary
  auto delay = initial_polling_delay();
  Logger()->info("Initial polling delay is {}ms", delay);
  if (delay > 0) {
    runner.wait_for(milliseconds(delay));
  }

  // the first call to this gather function takes ~100ms, so it must be
  // done before we start calculating times to wait for peak metrics
  gather_slow_titus_metrics(&cGroup, &proc, &disk, &aws);
  Logger()->info("Published slow Titus metrics (first iteration)");

  auto now = system_clock::now();
  auto next_run = now;
  auto next_slow_run = now + seconds(60);
  std::chrono::nanoseconds time_to_sleep;

  do {
    auto start = system_clock::now();
    gather_peak_titus_metrics(&cGroup);
    if (system_clock::now() >= next_slow_run) {

    if (start >= next_slow_run) {
      gather_slow_titus_metrics(&cGroup, &proc, &disk, &aws);
      perf_metrics.collect();
      if (gpu) {
        gpu->gpu_metrics();
      }
      auto elapsed = duration_cast<milliseconds>(system_clock::now() - start);
      Logger()->info("Published Titus metrics (delay={})", elapsed);
      next_slow_run += seconds(60);
    }

    next_run += seconds(1);
    time_to_sleep = next_run - system_clock::now();
  } while (runner.wait_for(time_to_sleep));
}
#else
void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<atlasagent::Nvml> nvidia_lib,
                            spectator::Tags net_tags) {
  using std::chrono::duration_cast;
  using std::chrono::milliseconds;
  using std::chrono::seconds;
  using std::chrono::system_clock;

@@ -202,26 +241,39 @@ void collect_system_metrics(TaggingRegistry* registry, std::unique_ptr<atlasagen

  auto gpu = init_gpu(registry, std::move(nvidia_lib));

  // the first call to this gather function takes >1 second, so it must
  // be done before we start calculating times to wait for peak metrics
  // initial polling delay, to prevent publishing too close to a minute boundary
  auto delay = initial_polling_delay();
  Logger()->info("Initial polling delay is {}ms", delay);
  if (delay > 0) {
    runner.wait_for(milliseconds(delay));
  }

  // the first call to this gather function takes ~100ms, so it must be
  // done before we start calculating times to wait for peak metrics
  gather_slow_system_metrics(&proc, &disk, &ethtool, &ntp, &pressureStall, &aws);
  Logger()->info("Published slow system metrics (first iteration)");

  auto now = system_clock::now();
  auto next_run = now;
  auto next_slow_run = now + seconds(60);
  std::chrono::nanoseconds time_to_sleep;

  do {
    auto start = system_clock::now();
    gather_peak_system_metrics(&proc);
    gather_scaling_metrics(&cpufreq);
    if (system_clock::now() >= next_slow_run) {

    if (start >= next_slow_run) {
      gather_slow_system_metrics(&proc, &disk, &ethtool, &ntp, &pressureStall, &aws);
      perf_metrics.collect();
      if (gpu) {
        gpu->gpu_metrics();
      }
      auto elapsed = duration_cast<milliseconds>(system_clock::now() - start);
      Logger()->info("Published system metrics (delay={})", elapsed);
      next_slow_run += seconds(60);
    }

    next_run += seconds(1);
    time_to_sleep = next_run - system_clock::now();
  } while (runner.wait_for(time_to_sleep));
