-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Investigation hanging audit instance #4758
base: otel-metrics
Are you sure you want to change the base?
Changes from 4 commits
3835af4
6a26fdc
24c4ef3
d8d5730
251f167
4f78bdf
4c00e6e
76f26ec
4c1b13c
a2f471e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,11 @@ | ||
namespace ServiceControl.Audit.Persistence.UnitOfWork | ||
{ | ||
using System.Threading; | ||
using System.Threading.Tasks; | ||
|
||
public interface IAuditIngestionUnitOfWorkFactory | ||
{ | ||
ValueTask<IAuditIngestionUnitOfWork> StartNew(int batchSize); //Throws if not enough space or some other problem preventing from writing data | ||
ValueTask<IAuditIngestionUnitOfWork> StartNew(int batchSize, CancellationToken cancellationToken = default); //Throws if not enough space or some other problem preventing from writing data | ||
bool CanIngestMore(); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,15 +52,27 @@ public AuditIngestion( | |
|
||
errorHandlingPolicy = new AuditIngestionFaultPolicy(failedImportsStorage, settings.LoggingSettings, OnCriticalError); | ||
|
||
watchdog = new Watchdog("audit message ingestion", EnsureStarted, EnsureStopped, ingestionState.ReportError, ingestionState.Clear, settings.TimeToRestartAuditIngestionAfterFailure, logger); | ||
|
||
ingestionWorker = Task.Run(() => Loop(), CancellationToken.None); | ||
watchdog = new Watchdog( | ||
"audit message ingestion", | ||
EnsureStarted, | ||
EnsureStopped, | ||
ingestionState.ReportError, | ||
ingestionState.Clear, | ||
settings.TimeToRestartAuditIngestionAfterFailure, | ||
logger | ||
); | ||
} | ||
|
||
public Task StartAsync(CancellationToken _) => watchdog.Start(() => applicationLifetime.StopApplication()); | ||
public async Task StartAsync(CancellationToken cancellationToken) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @andreasohlund @danielmarbach An alternative is to use BackgroundService.ExecuteAsync, assuming we can safely terminate ASAP: protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
await watchdog.Start(() => applicationLifetime.StopApplication());
await Loop(stoppingToken);
// Intentionally not invoking the following to shut down ASAP
// watchdog.Stop();
// channel.Writer.Complete();
// if (transportInfrastructure != null)
// {
// await transportInfrastructure.Shutdown(stoppingToken);
// }
} As everything is at-least-once processing and idempotent, this would be the fastest method to shut down. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it makes sense to switch to a BackgroundService here and inline the loop into the execute method. Be aware though of dotnet/runtime#36063 and https://blog.stephencleary.com/2020/05/backgroundservice-gotcha-startup.html for more context. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I already tested that and it works. As the loop pretty much immediately does IO, it's not affected, which is why I removed the |
||
{ | ||
stopSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); | ||
ingestionWorker = Loop(stopSource.Token); | ||
await watchdog.Start(() => applicationLifetime.StopApplication()); | ||
} | ||
|
||
public async Task StopAsync(CancellationToken cancellationToken) | ||
{ | ||
await stopSource.CancelAsync(); | ||
await watchdog.Stop(); | ||
channel.Writer.Complete(); | ||
await ingestionWorker; | ||
|
@@ -117,7 +129,7 @@ async Task EnsureStarted(CancellationToken cancellationToken = default) | |
|
||
queueIngestor = transportInfrastructure.Receivers[inputEndpoint]; | ||
|
||
await auditIngestor.VerifyCanReachForwardingAddress(); | ||
await auditIngestor.VerifyCanReachForwardingAddress(cancellationToken); | ||
|
||
await queueIngestor.StartReceive(cancellationToken); | ||
|
||
|
@@ -197,56 +209,74 @@ async Task OnMessage(MessageContext messageContext, CancellationToken cancellati | |
await taskCompletionSource.Task; | ||
} | ||
|
||
async Task Loop() | ||
async Task Loop(CancellationToken cancellationToken) | ||
{ | ||
var contexts = new List<MessageContext>(transportSettings.MaxConcurrency.Value); | ||
|
||
while (await channel.Reader.WaitToReadAsync()) | ||
try | ||
{ | ||
// will only enter here if there is something to read. | ||
try | ||
var contexts = new List<MessageContext>(transportSettings.MaxConcurrency.Value); | ||
|
||
while (await channel.Reader.WaitToReadAsync(cancellationToken)) | ||
{ | ||
// as long as there is something to read this will fetch up to MaximumConcurrency items | ||
while (channel.Reader.TryRead(out var context)) | ||
// will only enter here if there is something to read. | ||
try | ||
{ | ||
contexts.Add(context); | ||
auditMessageSize.Record(context.Body.Length / 1024.0); | ||
// as long as there is something to read this will fetch up to MaximumConcurrency items | ||
while (channel.Reader.TryRead(out var context)) | ||
{ | ||
contexts.Add(context); | ||
auditMessageSize.Record(context.Body.Length / 1024D); | ||
} | ||
|
||
auditBatchSize.Record(contexts.Count); | ||
var sw = Stopwatch.StartNew(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not relevant for this PR, but we should be using a ValueStopwatch here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is that included somewhere or do we copy https://github.com/dotnet/aspnetcore/blob/main/src/Shared/ValueStopwatch/ValueStopwatch.cs ? |
||
|
||
await auditIngestor.Ingest(contexts, cancellationToken); | ||
auditBatchDuration.Record(sw.ElapsedMilliseconds); | ||
} | ||
|
||
auditBatchSize.Record(contexts.Count); | ||
var sw = Stopwatch.StartNew(); | ||
|
||
await auditIngestor.Ingest(contexts); | ||
auditBatchDuration.Record(sw.ElapsedMilliseconds); | ||
} | ||
catch (OperationCanceledException) | ||
{ | ||
//Do nothing as we are shutting down | ||
continue; | ||
} | ||
catch (Exception e) // show must go on | ||
{ | ||
if (logger.IsInfoEnabled) | ||
catch (OperationCanceledException e) when (e.CancellationToken == cancellationToken) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @andreasohlund please review today's commits separately and then combined. |
||
{ | ||
logger.Info("Ingesting messages failed", e); | ||
logger.Debug("Cancelled by host"); | ||
return; // No point in continuing as WaitToReadAsync will throw OCE | ||
} | ||
|
||
// signal all message handling tasks to terminate | ||
foreach (var context in contexts) | ||
catch (Exception e) // show must go on | ||
{ | ||
context.GetTaskCompletionSource().TrySetException(e); | ||
if (logger.IsInfoEnabled) | ||
{ | ||
logger.Info("Ingesting messages failed", e); | ||
} | ||
|
||
// signal all message handling tasks to terminate | ||
foreach (var context in contexts) | ||
{ | ||
if (!context.GetTaskCompletionSource().TrySetException(e)) | ||
{ | ||
logger.Error("Loop TrySetException failed"); | ||
} | ||
} | ||
} | ||
finally | ||
{ | ||
contexts.Clear(); | ||
} | ||
} | ||
finally | ||
{ | ||
contexts.Clear(); | ||
} | ||
// will fall out here when writer is completed | ||
} | ||
catch (OperationCanceledException e) when (e.CancellationToken == cancellationToken) | ||
{ | ||
logger.Debug("Cancelled by host"); | ||
} | ||
catch (Exception e) | ||
{ | ||
// Might the next exception scope throw an exception, consider this fatal as that cannot be an OCE | ||
logger.Fatal("Loop interrupted", e); | ||
applicationLifetime.StopApplication(); | ||
throw; | ||
} | ||
// will fall out here when writer is completed | ||
} | ||
|
||
TransportInfrastructure transportInfrastructure; | ||
IMessageReceiver queueIngestor; | ||
Task ingestionWorker; | ||
|
||
readonly SemaphoreSlim startStopSemaphore = new(1); | ||
readonly string inputEndpoint; | ||
|
@@ -262,9 +292,10 @@ async Task Loop() | |
readonly Histogram<double> auditMessageSize = AuditMetrics.Meter.CreateHistogram<double>($"{AuditMetrics.Prefix}.audit_message_size", unit: "kilobytes"); | ||
readonly Counter<long> receivedAudits = AuditMetrics.Meter.CreateCounter<long>($"{AuditMetrics.Prefix}.received_audits"); | ||
readonly Watchdog watchdog; | ||
readonly Task ingestionWorker; | ||
readonly IHostApplicationLifetime applicationLifetime; | ||
|
||
CancellationTokenSource stopSource; | ||
|
||
static readonly ILog logger = LogManager.GetLogger<AuditIngestion>(); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be good to have a comment here indicating that the disposal happens as part of the unit of work.