Skip to content

Commit

Permalink
Merge POC branch, rename from super agent to agent control, update co…
Browse files Browse the repository at this point in the history
…nfiguration to match current spec
  • Loading branch information
tippmar-nr committed Jan 23, 2025
2 parents 37dfdd6 + b91f8f5 commit f42929e
Show file tree
Hide file tree
Showing 28 changed files with 635 additions and 100 deletions.
95 changes: 92 additions & 3 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/AgentHealthReporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
using System.Linq;
using System.Net;
using System.Threading;
using System.IO;
using System.Security.Policy;
using System.Runtime.InteropServices;
using Grpc.Core;

namespace NewRelic.Agent.Core.AgentHealth
{
Expand All @@ -38,10 +42,27 @@ public class AgentHealthReporter : ConfigurationBasedService, IAgentHealthReport
private InterlockedCounter _traceContextCreateSuccessCounter;
private InterlockedCounter _traceContextAcceptSuccessCounter;

private readonly HealthCheck _healthCheck;

public AgentHealthReporter(IMetricBuilder metricBuilder, IScheduler scheduler)
{
_healthCheck = new()
{
IsHealthy = true,
Status = "Agent starting",
LastError = string.Empty
};

_metricBuilder = metricBuilder;
_scheduler = scheduler;

// Want this to start immediately and write out the first health check - only if enabled
if (_configuration.AgentControlEnabled)
{
Log.Info(">>>>>>>>>AgentHealthReporter: Starting health check");
_scheduler.ExecuteEvery(PublishAgentControlHealthCheck, TimeSpan.FromSeconds(_configuration.HealthFrequency), TimeSpan.Zero);
}

_scheduler.ExecuteEvery(LogPeriodicReport, _timeBetweenExecutions);
var agentHealthEvents = Enum.GetValues(typeof(AgentHealthEvent)) as AgentHealthEvent[];
foreach (var agentHealthEvent in agentHealthEvents)
Expand Down Expand Up @@ -258,9 +279,9 @@ public void ReportIfHostIsLinuxOs()
{
#if NETSTANDARD2_0

bool isLinux = System.Runtime.InteropServices.RuntimeInformation.IsOSPlatform(System.Runtime.InteropServices.OSPlatform.Linux);
var metric =_metricBuilder.TryBuildLinuxOsMetric(isLinux);
TrySend(metric);
bool isLinux = System.Runtime.InteropServices.RuntimeInformation.IsOSPlatform(System.Runtime.InteropServices.OSPlatform.Linux);
var metric = _metricBuilder.TryBuildLinuxOsMetric(isLinux);
TrySend(metric);
#endif
}

Expand Down Expand Up @@ -667,6 +688,72 @@ public void ReportLogForwardingConfiguredValues()

#endregion

#region Agent Control

/// <summary>
/// Emits the "agent control health enabled" supportability count metric,
/// but only when agent control is switched on in the current configuration.
/// </summary>
private void ReportIfAgentControlHealthEnabled()
{
    // Nothing to report when the feature is off.
    if (!_configuration.AgentControlEnabled)
    {
        return;
    }

    ReportSupportabilityCountMetric(MetricNames.SupportabilityAgentControlHealthEnabled);
}

/// <summary>
/// Records a new health status on the internal health check when agent control is enabled.
/// </summary>
/// <param name="healthStatus">The health tuple (healthy flag, code, status text) to record.</param>
/// <param name="statusParams">Optional values forwarded to the health check for status formatting.</param>
public void SetAgentControlStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
{
    // Agent control disabled: health state is not tracked at all.
    if (!_configuration.AgentControlEnabled)
    {
        return;
    }

    var isHealthyShutdown = healthStatus.Equals(HealthCodes.AgentShutdownHealthy);
    if (!isHealthyShutdown)
    {
        _healthCheck.TrySetHealth(healthStatus, statusParams);
        return;
    }

    // A healthy-shutdown status is only applied while the agent is currently healthy,
    // so that an existing error status is not overwritten on shutdown.
    if (_healthCheck.IsHealthy)
    {
        _healthCheck.TrySetHealth(healthStatus);
    }
}

/// <summary>
/// Writes the current health check, rendered as YAML, to a file inside the directory
/// named by the configured health delivery location.
/// </summary>
/// <remarks>
/// Does nothing when agent control is disabled or a previous attempt has already failed.
/// Any failure (location is not a valid file URL, directory missing, file not writeable)
/// is logged once and sets <c>_healthChecksFailed</c> so later invocations are skipped.
/// This method does not throw for those failures.
/// </remarks>
public void PublishAgentControlHealthCheck()
{
    if (!_configuration.AgentControlEnabled || _healthChecksFailed)
        return;

    // TryCreate rather than the Uri constructor: a missing or malformed location must be
    // treated as a configuration problem, not an exception escaping the caller.
    if (!Uri.TryCreate(_configuration.HealthDeliveryLocation, UriKind.Absolute, out var fileUri)
        || fileUri.Scheme != Uri.UriSchemeFile)
    {
        Log.Debug("The provided agent_control.health.delivery_location is not a file URL, skipping agent control health check: " + _configuration.HealthDeliveryLocation);
        _healthChecksFailed = true;
        return;
    }

    // Ensure the path is cleaned up for Windows by removing a possible leading slash
    var cleanedPath = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? fileUri.LocalPath.TrimStart('/') : fileUri.LocalPath;

    // Verify the directory exists before attempting to write into it
    if (!Directory.Exists(cleanedPath))
    {
        Log.Warn("Agent Control is enabled but the path specified in agent_control.health.delivery_location does not exist.");
        _healthChecksFailed = true;
        // Stop here: attempting the write anyway would only fail and log a second, misleading warning.
        return;
    }

    try
    {
        // Synchronous write: this method is synchronous, so blocking on WriteAsync gains nothing.
        using var writer = new StreamWriter(Path.Combine(cleanedPath, _healthCheck.FileName));
        writer.Write(_healthCheck.ToYaml());
    }
    catch (Exception ex)
    {
        Log.Warn(ex, "Agent Control is enabled but the path specified in agent_control.health.delivery_location is not writeable.");
        _healthChecksFailed = true;
    }
}

#endregion

public void ReportSupportabilityPayloadsDroppeDueToMaxPayloadSizeLimit(string endpoint)
{
TrySend(_metricBuilder.TryBuildSupportabilityPayloadsDroppedDueToMaxPayloadLimit(endpoint));
Expand All @@ -686,6 +773,7 @@ private void CollectOneTimeMetrics()
ReportIfInstrumentationIsDisabled();
ReportIfGCSamplerV2IsEnabled();
ReportIfAwsAccountIdProvided();
ReportIfAgentControlHealthEnabled();
}

public void CollectMetrics()
Expand Down Expand Up @@ -761,6 +849,7 @@ private bool TryGetCount(InterlockedLongCounter counter, out long metricCount)
}

private ConcurrentBag<DestinationInteractionSample> _externalApiDataUsageSamples = new ConcurrentBag<DestinationInteractionSample>();
private bool _healthChecksFailed;

public void ReportSupportabilityDataUsage(string api, string apiArea, long dataSent, long dataReceived)
{
Expand Down
58 changes: 58 additions & 0 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/HealthCheck.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright 2020 New Relic, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

using System;
using NewRelic.Agent.Core.Utilities;

namespace NewRelic.Agent.Core.AgentHealth
{
/// <summary>
/// Holds the agent's current health state and renders it as a YAML document.
/// </summary>
public class HealthCheck
{
    private const int NanoSecondsPerMillisecond = 1000000;

    /// <summary>Whether the most recently recorded status was a healthy one.</summary>
    public bool IsHealthy { get; internal set; }

    /// <summary>Human-readable status text (with any format parameters already applied).</summary>
    public string Status { get; internal set; }

    /// <summary>The code of the most recently recorded health status.</summary>
    public string LastError { get; internal set; }

    /// <summary>UTC time at which this instance was created.</summary>
    public DateTime StartTime { get; } = DateTime.UtcNow;

    /// <summary>UTC time of the most recent <see cref="ToYaml"/> call.</summary>
    public DateTime StatusTime { get; internal set; }

    /// <summary>File name for this instance's health file; unique per instance via a GUID.</summary>
    public string FileName { get; } = "health-" + System.Guid.NewGuid().ToString("N") + ".yml";

    /// <summary>
    /// Set the health status of the agent, but only update changed values.
    /// </summary>
    /// <param name="healthStatus">The health tuple (healthy flag, code, status text or format template) to record.</param>
    /// <param name="statusParams">
    /// Optional values substituted into the <c>{0}</c>, <c>{1}</c>, ... placeholders of
    /// <paramref name="healthStatus"/>'s status template. When null or empty the status text is used as-is.
    /// </param>
    public void TrySetHealth((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams)
    {
        // NOTE: no synchronization is performed here; callers on different threads may
        // interleave with each other and with ToYaml().
        if (IsHealthy != healthStatus.IsHealthy)
        {
            IsHealthy = healthStatus.IsHealthy;
        }

        // Format the INCOMING template (not the currently stored status text) so that
        // parameterised statuses are actually rendered with their arguments.
        var newStatus = statusParams != null && statusParams.Length > 0
            ? string.Format(healthStatus.Status, (object[])statusParams)
            : healthStatus.Status;

        // Static string.Equals is null-safe, unlike Status.Equals(...) on an uninitialised property.
        if (!string.Equals(Status, newStatus, StringComparison.Ordinal))
        {
            Status = newStatus;
        }

        if (!string.Equals(LastError, healthStatus.Code, StringComparison.OrdinalIgnoreCase))
        {
            LastError = healthStatus.Code;
        }
    }

    /// <summary>
    /// Stamps <see cref="StatusTime"/> with the current UTC time and returns the health state as YAML text.
    /// </summary>
    /// <returns>
    /// Newline-separated <c>healthy</c>, <c>status</c>, <c>last_error</c>, <c>status_time_unix_nano</c>
    /// and <c>start_time_unix_nano</c> entries. The nanosecond values are millisecond timestamps
    /// multiplied up, so they carry millisecond precision only.
    /// </returns>
    public string ToYaml()
    {
        StatusTime = DateTime.UtcNow;
        return $"healthy: {IsHealthy}\nstatus: {Status}\nlast_error: {LastError}\nstatus_time_unix_nano: {StatusTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}\nstart_time_unix_nano: {StartTime.ToUnixTimeMilliseconds() * NanoSecondsPerMillisecond}";
    }
}
}
84 changes: 84 additions & 0 deletions src/Agent/NewRelic/Agent/Core/AgentHealth/HealthCodes.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2020 New Relic, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

namespace NewRelic.Agent.Core.AgentHealth
{
/// <summary>
/// Catalogue of predefined agent health statuses. Each entry is a tuple of
/// healthy flag, status code and status text; some texts are format templates
/// containing <c>{0}</c>/<c>{1}</c> placeholders.
/// </summary>
public static class HealthCodes
{
    /// <summary>NR-APM-000: healthy.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) Healthy =
        (IsHealthy: true, Code: "NR-APM-000", Status: "Healthy");

    /// <summary>NR-APM-001: invalid license key (HTTP status code 401).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyInvalid =
        (IsHealthy: false, Code: "NR-APM-001", Status: "Invalid license key (HTTP status code 401)");

    /// <summary>NR-APM-002: license key missing in configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) LicenseKeyMissing =
        (IsHealthy: false, Code: "NR-APM-002", Status: "License key missing in configuration");

    /// <summary>NR-APM-003: forced disconnect received from New Relic (HTTP status code 410).</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ForceDisconnect =
        (IsHealthy: false, Code: "NR-APM-003", Status: "Forced disconnect received from New Relic (HTTP status code 410)");

    /// <summary>
    /// NR-APM-004: HTTP error response received from New Relic while sending data.
    /// Template: <c>{0}</c> is the response code, <c>{1}</c> is the data type.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpError =
        (IsHealthy: false, Code: "NR-APM-004", Status: "HTTP error response code {0} received from New Relic while sending data type {1}");

    /// <summary>NR-APM-005: missing application name in agent configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) ApplicationNameMissing =
        (IsHealthy: false, Code: "NR-APM-005", Status: "Missing application name in agent configuration");

    /// <summary>NR-APM-006: the maximum number of configured app names (3) was exceeded.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) MaxApplicationNamesExceeded =
        (IsHealthy: false, Code: "NR-APM-006", Status: "The maximum number of configured app names (3) exceeded");

    /// <summary>
    /// NR-APM-007: HTTP proxy configuration error.
    /// Template: <c>{0}</c> is the response code.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) HttpProxyError =
        (IsHealthy: false, Code: "NR-APM-007", Status: "HTTP Proxy configuration error; response code {0}");

    /// <summary>NR-APM-008: agent is disabled via configuration.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentDisabledByConfiguration =
        (IsHealthy: false, Code: "NR-APM-008", Status: "Agent is disabled via configuration");

    /// <summary>NR-APM-009: failed to connect to the New Relic data collector.</summary>
    public static readonly (bool IsHealthy, string Code, string Status) FailedToConnect =
        (IsHealthy: false, Code: "NR-APM-009", Status: "Failed to connect to New Relic data collector");

    /// <summary>
    /// NR-APM-099: the agent has shut down.
    /// Only to be reported if the agent is "healthy" on shutdown.
    /// If the agent status is not Healthy on agent shutdown, the existing error MUST NOT be overwritten.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownHealthy =
        (IsHealthy: true, Code: "NR-APM-099", Status: "Agent has shutdown");

    // Agent health codes for the .NET agent are 200-299

    /// <summary>
    /// NR-APM-200: the agent has shut down with an exception.
    /// Template: <c>{0}</c> is the exception detail.
    /// </summary>
    public static readonly (bool IsHealthy, string Code, string Status) AgentShutdownError =
        (IsHealthy: false, Code: "NR-APM-200", Status: "Agent has shutdown with exception {0}");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -151,5 +151,7 @@ public interface IAgentHealthReporter : IOutOfBandMetricSource
void ReportLogForwardingEnabledWithFramework(string logFramework);
void ReportByteMetric(string metricName, long totalBytes, long? exclusiveBytes = null);
void ReportLoggingEventsEmpty(int count = 1);
void SetAgentControlStatus((bool IsHealthy, string Code, string Status) healthStatus, params string[] statusParams);
void PublishAgentControlHealthCheck();
}
}
13 changes: 12 additions & 1 deletion src/Agent/NewRelic/Agent/Core/AgentManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

using NewRelic.Agent.Api;
using NewRelic.Agent.Configuration;
using NewRelic.Agent.Core.AgentHealth;
using NewRelic.Agent.Core.Commands;
using NewRelic.Agent.Core.Config;
using NewRelic.Agent.Core.Configuration;
Expand Down Expand Up @@ -81,6 +82,7 @@ public static IAgentManager Instance
private IConfiguration Configuration { get { return _configurationSubscription.Configuration; } }
private ThreadProfilingService _threadProfilingService;
private readonly IWrapperService _wrapperService;
private readonly IAgentHealthReporter _agentHealthReporter;

private volatile bool _shutdownEventReceived;
private volatile bool _isInitialized;
Expand Down Expand Up @@ -154,6 +156,9 @@ private AgentManager()
var agentApi = _container.Resolve<IAgentApi>();
_wrapperService = _container.Resolve<IWrapperService>();

// Start the AgentHealthReporter early so that we can potentially report health issues during startup
_agentHealthReporter = _container.Resolve<IAgentHealthReporter>();

// Attempt to auto start the agent once all services have resolved, except in serverless mode
if (!bootstrapConfig.ServerlessModeEnabled)
_container.Resolve<IConnectionManager>().AttemptAutoStart();
Expand Down Expand Up @@ -288,6 +293,9 @@ private void LogInitialized()
"NEW_RELIC_SEND_DATA_ON_EXIT",
"NEW_RELIC_SEND_DATA_ON_EXIT_THRESHOLD_MS",
"NEW_RELIC_AZURE_FUNCTION_MODE_ENABLED",
"NEW_RELIC_AGENT_CONTROL_ENABLED",
"NEW_RELIC_AGENT_CONTROL_HEALTH_DELIVERY_LOCATION",
"NEW_RELIC_AGENT_CONTROL_HEALTH_FREQUENCY"
};

List<string> environmentVariablesSensitive = new List<string> {
Expand Down Expand Up @@ -409,7 +417,7 @@ public ITracer GetTracerImpl(string tracerFactoryName, uint tracerArguments, str
/// <summary>
/// Event handler that logs receipt of a ProcessExit event and then
/// requests a clean shutdown of the agent.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void ProcessExit(object sender, EventArgs e)
{
    Log.Debug("Received a ProcessExit CLR event for the application domain. About to shut down the .NET Agent...");
    Shutdown(cleanShutdown: true);
}

Expand Down Expand Up @@ -437,13 +445,16 @@ private void Shutdown(bool cleanShutdown)

Log.Debug("Shutting down public agent services...");
StopServices();
_agentHealthReporter?.SetAgentControlStatus(HealthCodes.AgentShutdownHealthy);
}
catch (Exception e)
{
_agentHealthReporter?.SetAgentControlStatus(HealthCodes.AgentShutdownError, e.Message);
Log.Info(e, "Unexpected exception during agent shutdown");
}
finally
{
_agentHealthReporter?.PublishAgentControlHealthCheck();
Log.Debug("Shutting down internal agent services...");
Dispose();

Expand Down
Loading

0 comments on commit f42929e

Please sign in to comment.