-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Implement otel retry metrics #12064
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Implement otel retry metrics #12064
Changes from all commits
c3b473a
6c6a0f5
f9c5a68
bd69ed5
fab9a26
43a746b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,7 @@ | |
*/ | ||
final class OpenTelemetryMetricsModule { | ||
private static final Logger logger = Logger.getLogger(OpenTelemetryMetricsModule.class.getName()); | ||
private static final double NANOS_PER_SEC = 1_000_000_000.0; | ||
public static final ImmutableSet<String> DEFAULT_PER_CALL_METRICS_SET = | ||
ImmutableSet.of( | ||
"grpc.client.attempt.started", | ||
|
@@ -292,9 +293,12 @@ static final class CallAttemptsTracerFactory extends ClientStreamTracer.Factory | |
private final String fullMethodName; | ||
private final List<OpenTelemetryPlugin.ClientCallPlugin> callPlugins; | ||
private Status status; | ||
private long retryDelayNanos; | ||
private long callLatencyNanos; | ||
private final Object lock = new Object(); | ||
private final AtomicLong attemptsPerCall = new AtomicLong(); | ||
private final AtomicLong hedgedAttemptsPerCall = new AtomicLong(); | ||
private final AtomicLong transparentRetriesPerCall = new AtomicLong(); | ||
@GuardedBy("lock") | ||
private int activeStreams; | ||
@GuardedBy("lock") | ||
|
@@ -331,6 +335,7 @@ public ClientStreamTracer newClientStreamTracer(StreamInfo info, Metadata metada | |
} | ||
if (++activeStreams == 1 && attemptStopwatch.isRunning()) { | ||
attemptStopwatch.stop(); | ||
retryDelayNanos = attemptStopwatch.elapsed(TimeUnit.NANOSECONDS); | ||
} | ||
} | ||
// Skip recording for the first time, since it is already recorded in | ||
|
@@ -344,7 +349,11 @@ public ClientStreamTracer newClientStreamTracer(StreamInfo info, Metadata metada | |
module.resource.clientAttemptCountCounter().add(1, attribute); | ||
} | ||
} | ||
if (!info.isTransparentRetry()) { | ||
if (info.isTransparentRetry()) { | ||
transparentRetriesPerCall.incrementAndGet(); | ||
} else if (info.isHedging()) { | ||
hedgedAttemptsPerCall.incrementAndGet(); | ||
} else { | ||
attemptsPerCall.incrementAndGet(); | ||
Comment on lines
+352
to
357
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this not assume them to be mutually exclusive? |
||
} | ||
return newClientTracer(info); | ||
|
@@ -407,14 +416,53 @@ void recordFinishedCall() { | |
tracer.recordFinishedAttempt(); | ||
} | ||
callLatencyNanos = callStopWatch.elapsed(TimeUnit.NANOSECONDS); | ||
io.opentelemetry.api.common.Attributes attribute = | ||
io.opentelemetry.api.common.Attributes.of(METHOD_KEY, fullMethodName, | ||
TARGET_KEY, target, | ||
STATUS_KEY, status.getCode().toString()); | ||
|
||
// Base attributes | ||
io.opentelemetry.api.common.Attributes baseAttributes = | ||
io.opentelemetry.api.common.Attributes.of( | ||
METHOD_KEY, fullMethodName, | ||
TARGET_KEY, target | ||
); | ||
|
||
// Duration | ||
if (module.resource.clientCallDurationCounter() != null) { | ||
module.resource.clientCallDurationCounter() | ||
.record(callLatencyNanos * SECONDS_PER_NANO, attribute); | ||
module.resource.clientCallDurationCounter().record( | ||
callLatencyNanos * SECONDS_PER_NANO, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. callLatencyNanos is already in nanos. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep: nanos * (seconds / nanos) = seconds (the duration unit is "s"). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This comment is not addressed. |
||
baseAttributes.toBuilder() | ||
.put(STATUS_KEY, status.getCode().toString()) | ||
.build() | ||
); | ||
} | ||
|
||
// Retry counts | ||
if (module.resource.clientCallRetriesCounter() != null) { | ||
long retriesPerCall = attemptsPerCall.get() - 1 >= 0 ? attemptsPerCall.get() - 1 : 0; | ||
if (retriesPerCall > 0) { | ||
module.resource.clientCallRetriesCounter().record(retriesPerCall, baseAttributes); | ||
kannanjgithub marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
|
||
// Hedge counts | ||
if (module.resource.clientCallHedgesCounter() != null) { | ||
if (hedgedAttemptsPerCall.get() > 0) { | ||
module.resource.clientCallHedgesCounter() | ||
.record(hedgedAttemptsPerCall.get(), baseAttributes); | ||
} | ||
} | ||
|
||
// Transparent Retry counts | ||
if (module.resource.clientCallTransparentRetriesCounter() != null | ||
&& transparentRetriesPerCall.get() > 0) { | ||
module.resource.clientCallTransparentRetriesCounter().record( | ||
transparentRetriesPerCall.get(), baseAttributes); | ||
} | ||
|
||
// Retry delay | ||
if (module.resource.clientCallRetryDelayCounter() != null) { | ||
module.resource.clientCallRetryDelayCounter().record( | ||
retryDelayNanos / NANOS_PER_SEC, | ||
baseAttributes | ||
); | ||
} | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add unit tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The gRFC has had recent updates; I am waiting for it to get merged before making further changes...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, there were discussions yesterday about the recent changes to the retry metrics and whether they helped. The implementation should be mostly the same, but it probably isn't worth chasing the gRFC for the moment.