From b2b5645e16859073ba5df135be213e8f6bee60fb Mon Sep 17 00:00:00 2001 From: Samantha Frank Date: Fri, 15 Nov 2024 15:25:06 -0500 Subject: [PATCH 1/3] VA: Cleanup performRemoteValidation (#7814) Bring this code more in line with `VA.remoteDoDCV` in #7794. This should make these two easier to diff in review. --- va/va.go | 74 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/va/va.go b/va/va.go index 994110a08c2..d130f4d6a92 100644 --- a/va/va.go +++ b/va/va.go @@ -458,7 +458,8 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( ctx context.Context, req *vapb.PerformValidationRequest, ) *probs.ProblemDetails { - if len(va.remoteVAs) == 0 { + remoteVACount := len(va.remoteVAs) + if remoteVACount == 0 { return nil } @@ -469,65 +470,70 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( }).Observe(va.clk.Since(start).Seconds()) }() - type rvaResult struct { - hostname string - response *vapb.ValidationResult - err error + type response struct { + addr string + result *vapb.ValidationResult + err error } - results := make(chan *rvaResult, len(va.remoteVAs)) - - for _, i := range rand.Perm(len(va.remoteVAs)) { - remoteVA := va.remoteVAs[i] - go func(rva RemoteVA, out chan<- *rvaResult) { + responses := make(chan *response, remoteVACount) + for _, i := range rand.Perm(remoteVACount) { + go func(rva RemoteVA, out chan<- *response) { res, err := rva.PerformValidation(ctx, req) - out <- &rvaResult{ - hostname: rva.Address, - response: res, - err: err, + out <- &response{ + addr: rva.Address, + result: res, + err: err, } - }(remoteVA, results) + }(va.remoteVAs[i], responses) } - required := len(va.remoteVAs) - va.maxRemoteFailures - good := 0 - bad := 0 + required := remoteVACount - va.maxRemoteFailures + var passed []string + // failed contains a list of perspectives that failed to validate the domain + // or the addresses of remote VAs that failed to respond. + var failed []string var firstProb *probs.ProblemDetails - for res := range results { + for resp := range responses { var currProb *probs.ProblemDetails - if res.err != nil { - bad++ + if resp.err != nil { + // Failed to communicate with the remote VA. + failed = append(failed, resp.addr) - if canceled.Is(res.err) { + if canceled.Is(resp.err) { currProb = probs.ServerInternal("Remote PerformValidation RPC canceled") } else { - va.log.Errf("Remote VA %q.PerformValidation failed: %s", res.hostname, res.err) + va.log.Errf("Remote VA %q.PerformValidation failed: %s", resp.addr, resp.err) currProb = probs.ServerInternal("Remote PerformValidation RPC failed") } - } else if res.response.Problems != nil { - bad++ + } else if resp.result.Problems != nil { + // The remote VA returned a problem. + failed = append(failed, resp.result.Perspective) var err error - currProb, err = bgrpc.PBToProblemDetails(res.response.Problems) + currProb, err = bgrpc.PBToProblemDetails(resp.result.Problems) if err != nil { - va.log.Errf("Remote VA %q.PerformValidation returned malformed problem: %s", res.hostname, err) + va.log.Errf("Remote VA %q.PerformValidation returned malformed problem: %s", resp.addr, err) currProb = probs.ServerInternal("Remote PerformValidation RPC returned malformed result") } } else { - good++ + // The remote VA returned a successful result. + passed = append(passed, resp.result.Perspective) } if firstProb == nil && currProb != nil { + // A problem was encountered for the first time. firstProb = currProb } - // Return as soon as we have enough successes or failures for a definitive result. - if good >= required { + if len(passed) >= required { + // Enough successful responses to reach quorum. return nil } - if bad > va.maxRemoteFailures { + if len(failed) > va.maxRemoteFailures { + // Too many failed responses to reach quorum. va.metrics.remoteValidationFailures.Inc() firstProb.Detail = fmt.Sprintf("During secondary validation: %s", firstProb.Detail) return firstProb @@ -535,13 +541,13 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( // If we somehow haven't returned early, we need to break the loop once all // of the VAs have returned a result. - if good+bad >= len(va.remoteVAs) { + if len(passed)+len(failed) >= remoteVACount { break } } - // This condition should not occur - it indicates the good/bad counts neither - // met the required threshold nor the maxRemoteFailures threshold. + // This condition should not occur - it indicates the passed/failed counts + // neither met the required threshold nor the maxRemoteFailures threshold. return probs.ServerInternal("Too few remote PerformValidation RPC results") } From f9fb688427ca52236124b4f7556b331d211be2e2 Mon Sep 17 00:00:00 2001 From: Samantha Frank Date: Fri, 15 Nov 2024 15:49:04 -0500 Subject: [PATCH 2/3] build: Specify go 1.23.0 in go.mod (#7822) We're using `.Keys()` a method of the maps package added in 1.23 (https://pkg.go.dev/maps@master#Keys) but our go.mod states 1.22.0. This causes some in-IDE linting errors in VSCode. --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 839579d61fe..2795da5dedf 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/letsencrypt/boulder -go 1.22.0 +go 1.23.0 require ( github.com/aws/aws-sdk-go-v2 v1.32.2 From 3baac6f6dff3fb32f4e5384c54c9aa563ceda748 Mon Sep 17 00:00:00 2001 From: Samantha Frank Date: Fri, 15 Nov 2024 15:51:39 -0500 Subject: [PATCH 3/3] VA: Consolidate multiple metrics into one histogram (#7816) - Add a new histogram, validationLatency - Add a VA.observeLatency for observing validation latency - Refactor to ensure this metric can be observed exclusively within VA.PerformValidation and VA.IsCAAValid. - Replace validationTime, localValidationTime, remoteValidationTime, remoteValidationFailures, caaCheckTime, localCAACheckTime, remoteCAACheckTime, and remoteCAACheckFailures with validationLatency --- va/caa.go | 79 ++++++++++------------ va/caa_test.go | 55 ++++++++++++++++ va/dns_test.go | 10 +-- va/va.go | 175 ++++++++++++++++++++----------------------------- va/va_test.go | 31 +++++---- 5 files changed, 185 insertions(+), 165 deletions(-) diff --git a/va/caa.go b/va/caa.go index a3f4d6c2980..da35eb2f4bd 100644 --- a/va/caa.go +++ b/va/caa.go @@ -11,7 +11,6 @@ import ( "time" "github.com/miekg/dns" - "github.com/prometheus/client_golang/prometheus" "github.com/letsencrypt/boulder/bdns" "github.com/letsencrypt/boulder/canceled" @@ -42,19 +41,46 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC Requester: req.AccountURIID, Hostname: req.Domain, } - checkStartTime := va.clk.Now() - validationMethod := core.AcmeChallenge(req.ValidationMethod) - if !validationMethod.IsValid() { + challType := core.AcmeChallenge(req.ValidationMethod) + if !challType.IsValid() { return nil, berrors.InternalServerError("unrecognized validation method %q", req.ValidationMethod) } acmeID := identifier.NewDNS(req.Domain) params := &caaParams{ accountURIID: req.AccountURIID, - validationMethod: validationMethod, + validationMethod: challType, } + var prob *probs.ProblemDetails + var internalErr error + var localLatency time.Duration + start := va.clk.Now() + + defer func() { + probType := "" + outcome := fail + if prob != nil { + // CAA check failed. + probType = string(prob.Type) + logEvent.Error = prob.Error() + } else { + // CAA check passed. + outcome = pass + } + // Observe local check latency (primary|remote). + va.observeLatency(opCAA, va.perspective, string(challType), probType, outcome, localLatency) + if va.isPrimaryVA() { + // Observe total check latency (primary+remote). + va.observeLatency(opCAA, allPerspectives, string(challType), probType, outcome, va.clk.Since(start)) + } + // Log the total check latency. + logEvent.ValidationLatency = va.clk.Since(start).Round(time.Millisecond).Seconds() + + va.log.AuditObject("CAA check result", logEvent) + }() + var remoteCAAResults chan *remoteVAResult if features.Get().EnforceMultiCAA { if remoteVACount := len(va.remoteVAs); remoteVACount > 0 { @@ -63,16 +89,10 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC } } - checkResult := "success" - err := va.checkCAA(ctx, acmeID, params) - localCheckLatency := time.Since(checkStartTime) - var prob *probs.ProblemDetails - if err != nil { - prob = detailedError(err) - logEvent.Error = prob.Error() - logEvent.InternalError = err.Error() + internalErr = va.checkCAA(ctx, acmeID, params) + if internalErr != nil { + prob = detailedError(internalErr) prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", req.Domain, prob.Detail) - checkResult = "failure" } else if remoteCAAResults != nil { if !features.Get().EnforceMultiCAA && features.Get().MultiCAAFullResults { // If we're not going to enforce multi CAA but we are logging the @@ -82,40 +102,24 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC _ = va.processRemoteCAAResults( req.Domain, req.AccountURIID, - string(validationMethod), + string(challType), remoteCAAResults) }() } else if features.Get().EnforceMultiCAA { remoteProb := va.processRemoteCAAResults( req.Domain, req.AccountURIID, - string(validationMethod), + string(challType), remoteCAAResults) // If the remote result was a non-nil problem then fail the CAA check if remoteProb != nil { prob = remoteProb - // We only set .Error here, not InternalError, because the remote VA doesn't send - // us the internal error. But that's okay, because it got logged at the remote VA. - logEvent.Error = remoteProb.Error() - checkResult = "failure" va.log.Infof("CAA check failed due to remote failures: identifier=%v err=%s", req.Domain, remoteProb) - va.metrics.remoteCAACheckFailures.Inc() } } } - checkLatency := time.Since(checkStartTime) - logEvent.ValidationLatency = checkLatency.Round(time.Millisecond).Seconds() - - va.metrics.localCAACheckTime.With(prometheus.Labels{ - "result": checkResult, - }).Observe(localCheckLatency.Seconds()) - va.metrics.caaCheckTime.With(prometheus.Labels{ - "result": checkResult, - }).Observe(checkLatency.Seconds()) - - va.log.AuditObject("CAA check result", logEvent) if prob != nil { // The ProblemDetails will be serialized through gRPC, which requires UTF-8. @@ -154,15 +158,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults( challengeType string, remoteResultsChan <-chan *remoteVAResult) *probs.ProblemDetails { - state := "failure" - start := va.clk.Now() - - defer func() { - va.metrics.remoteCAACheckTime.With(prometheus.Labels{ - "result": state, - }).Observe(va.clk.Since(start).Seconds()) - }() - required := len(va.remoteVAs) - va.maxRemoteFailures good := 0 bad := 0 @@ -190,7 +185,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults( // success or failure threshold is met. if !features.Get().MultiCAAFullResults { if good >= required { - state = "success" return nil } else if bad > va.maxRemoteFailures { modifiedProblem := *result.Problem @@ -217,7 +211,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults( // Based on the threshold of good/bad return nil or a problem. if good >= required { - state = "success" return nil } else if bad > va.maxRemoteFailures { modifiedProblem := *firstProb diff --git a/va/caa_test.go b/va/caa_test.go index 9b9789d57b8..802dd01e004 100644 --- a/va/caa_test.go +++ b/va/caa_test.go @@ -9,6 +9,7 @@ import ( "testing" "github.com/miekg/dns" + "github.com/prometheus/client_golang/prometheus" "github.com/letsencrypt/boulder/bdns" "github.com/letsencrypt/boulder/core" @@ -676,6 +677,7 @@ func TestMultiCAARechecking(t *testing.T) { expectedProbSubstring string expectedProbType probs.ProblemType expectedDiffLogSubstring string + expectedLabels prometheus.Labels localDNSClient bdns.Client }{ { @@ -687,6 +689,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, + }, }, { name: "broken localVA, RVAs functional, no CAA records", @@ -699,6 +708,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "functional localVA, 1 broken RVA, no CAA records", @@ -712,6 +728,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "functional localVA, all broken RVAs, no CAA records", @@ -725,6 +748,13 @@ func TestMultiCAARechecking(t *testing.T) { {brokenVA, brokenUA}, {brokenVA, brokenUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "all VAs functional, CAA issue type present", @@ -735,6 +765,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, + }, }, { name: "functional localVA, 1 broken RVA, CAA issue type present", @@ -748,6 +785,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "functional localVA, all broken RVAs, CAA issue type present", @@ -761,6 +805,13 @@ func TestMultiCAARechecking(t *testing.T) { {brokenVA, brokenUA}, {brokenVA, brokenUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { // The localVA kicks off the background goroutines before doing its @@ -954,6 +1005,10 @@ func TestMultiCAARechecking(t *testing.T) { } else { test.AssertEquals(t, len(gotAnyRemoteFailures), 0) } + + if tc.expectedLabels != nil { + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, tc.expectedLabels, 1) + } }) } } diff --git a/va/dns_test.go b/va/dns_test.go index a545228a47f..d9fc776bd10 100644 --- a/va/dns_test.go +++ b/va/dns_test.go @@ -28,10 +28,12 @@ func TestDNSValidationEmpty(t *testing.T) { test.AssertEquals(t, res.Problems.ProblemType, "unauthorized") test.AssertEquals(t, res.Problems.Detail, "No TXT record found at _acme-challenge.empty-txts.com") - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "invalid", - "problem_type": "unauthorized", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.UnauthorizedProblem), + "result": fail, }, 1) } diff --git a/va/va.go b/va/va.go index d130f4d6a92..22c2a60b601 100644 --- a/va/va.go +++ b/va/va.go @@ -31,7 +31,16 @@ import ( vapb "github.com/letsencrypt/boulder/va/proto" ) -const PrimaryPerspective = "Primary" +const ( + PrimaryPerspective = "Primary" + allPerspectives = "all" + + opChallAndCAA = "challenge+caa" + opCAA = "caa" + + pass = "pass" + fail = "fail" +) var ( // badTLSHeader contains the string 'HTTP /' which is returned when @@ -83,14 +92,14 @@ type RemoteVA struct { } type vaMetrics struct { - validationTime *prometheus.HistogramVec - localValidationTime *prometheus.HistogramVec - remoteValidationTime *prometheus.HistogramVec - remoteValidationFailures prometheus.Counter - caaCheckTime *prometheus.HistogramVec - localCAACheckTime *prometheus.HistogramVec - remoteCAACheckTime *prometheus.HistogramVec - remoteCAACheckFailures prometheus.Counter + // validationLatency is a histogram of the latency to perform validations + // from the primary and remote VA perspectives. It's labelled by: + // - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa|challenge+caa] + // - perspective: ValidationAuthorityImpl.perspective + // - challenge_type: core.Challenge.Type + // - problem_type: probs.ProblemType + // - result: the result of the validation as [pass|fail] + validationLatency *prometheus.HistogramVec prospectiveRemoteCAACheckFailures prometheus.Counter tlsALPNOIDCounter *prometheus.CounterVec http01Fallbacks prometheus.Counter @@ -100,66 +109,15 @@ type vaMetrics struct { } func initMetrics(stats prometheus.Registerer) *vaMetrics { - validationTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "validation_time", - Help: "Total time taken to validate a challenge and aggregate results", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"type", "result", "problem_type"}) - stats.MustRegister(validationTime) - localValidationTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "local_validation_time", - Help: "Time taken to locally validate a challenge", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"type", "result"}) - stats.MustRegister(localValidationTime) - remoteValidationTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "remote_validation_time", - Help: "Time taken to remotely validate a challenge", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"type"}) - stats.MustRegister(remoteValidationTime) - remoteValidationFailures := prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "remote_validation_failures", - Help: "Number of validations failed due to remote VAs returning failure when consensus is enforced", - }) - stats.MustRegister(remoteValidationFailures) - caaCheckTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "caa_check_time", - Help: "Total time taken to check CAA records and aggregate results", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"result"}) - stats.MustRegister(caaCheckTime) - localCAACheckTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "caa_check_time_local", - Help: "Time taken to locally check CAA records", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"result"}) - stats.MustRegister(localCAACheckTime) - remoteCAACheckTime := prometheus.NewHistogramVec( + validationLatency := prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: "caa_check_time_remote", - Help: "Time taken to remotely check CAA records", + Name: "validation_latency", + Help: "Histogram of the latency to perform validations from the primary and remote VA perspectives", Buckets: metrics.InternetFacingBuckets, }, - []string{"result"}) - stats.MustRegister(remoteCAACheckTime) - remoteCAACheckFailures := prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "remote_caa_check_failures", - Help: "Number of CAA checks failed due to remote VAs returning failure when consensus is enforced", - }) - stats.MustRegister(remoteCAACheckFailures) + []string{"operation", "perspective", "challenge_type", "problem_type", "result"}, + ) + stats.MustRegister(validationLatency) prospectiveRemoteCAACheckFailures := prometheus.NewCounter( prometheus.CounterOpts{ Name: "prospective_remote_caa_check_failures", @@ -198,14 +156,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics { stats.MustRegister(ipv4FallbackCounter) return &vaMetrics{ - validationTime: validationTime, - remoteValidationTime: remoteValidationTime, - localValidationTime: localValidationTime, - remoteValidationFailures: remoteValidationFailures, - caaCheckTime: caaCheckTime, - localCAACheckTime: localCAACheckTime, - remoteCAACheckTime: remoteCAACheckTime, - remoteCAACheckFailures: remoteCAACheckFailures, + validationLatency: validationLatency, prospectiveRemoteCAACheckFailures: prospectiveRemoteCAACheckFailures, tlsALPNOIDCounter: tlsALPNOIDCounter, http01Fallbacks: http01Fallbacks, @@ -426,6 +377,11 @@ func detailedError(err error) *probs.ProblemDetails { return probs.Connection("Error getting validation data") } +// isPrimaryVA returns true if the VA is the primary validation perspective. +func (va *ValidationAuthorityImpl) isPrimaryVA() bool { + return va.perspective == PrimaryPerspective +} + // validateChallenge simply passes through to the appropriate validation method // depending on the challenge type. func (va *ValidationAuthorityImpl) validateChallenge( @@ -449,6 +405,25 @@ func (va *ValidationAuthorityImpl) validateChallenge( return nil, berrors.MalformedError("invalid challenge type %s", kind) } +// observeLatency records entries in the validationLatency histogram of the +// latency to perform validations from the primary and remote VA perspectives. +// The labels are: +// - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa] +// - perspective: [ValidationAuthorityImpl.perspective|all] +// - challenge_type: core.Challenge.Type +// - problem_type: probs.ProblemType +// - result: the result of the validation as [pass|fail] +func (va *ValidationAuthorityImpl) observeLatency(op, perspective, challType, probType, result string, latency time.Duration) { + labels := prometheus.Labels{ + "operation": op, + "perspective": perspective, + "challenge_type": challType, + "problem_type": probType, + "result": result, + } + va.metrics.validationLatency.With(labels).Observe(latency.Seconds()) +} + // performRemoteValidation coordinates the whole process of kicking off and // collecting results from calls to remote VAs' PerformValidation function. It // returns a problem if too many remote perspectives failed to corroborate @@ -463,13 +438,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( return nil } - start := va.clk.Now() - defer func() { - va.metrics.remoteValidationTime.With(prometheus.Labels{ - "type": req.Challenge.Type, - }).Observe(va.clk.Since(start).Seconds()) - }() - type response struct { addr string result *vapb.ValidationResult @@ -490,8 +458,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( required := remoteVACount - va.maxRemoteFailures var passed []string - // failed contains a list of perspectives that failed to validate the domain - // or the addresses of remote VAs that failed to respond. var failed []string var firstProb *probs.ProblemDetails @@ -534,7 +500,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( } if len(failed) > va.maxRemoteFailures { // Too many failed responses to reach quorum. - va.metrics.remoteValidationFailures.Inc() firstProb.Detail = fmt.Sprintf("During secondary validation: %s", firstProb.Detail) return firstProb } @@ -649,12 +614,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v return nil, berrors.InternalServerError("Incomplete validation request") } - challenge, err := bgrpc.PBToChallenge(req.Challenge) + chall, err := bgrpc.PBToChallenge(req.Challenge) if err != nil { return nil, errors.New("challenge failed to deserialize") } - err = challenge.CheckPending() + err = chall.CheckPending() if err != nil { return nil, berrors.MalformedError("challenge failed consistency check: %s", err) } @@ -664,36 +629,34 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v // `prob`, or this will fail. var prob *probs.ProblemDetails var localLatency time.Duration - vStart := va.clk.Now() + start := va.clk.Now() logEvent := verificationRequestEvent{ ID: req.Authz.Id, Requester: req.Authz.RegID, Hostname: req.DnsName, - Challenge: challenge, + Challenge: chall, } defer func() { - problemType := "" + probType := "" + outcome := fail if prob != nil { - problemType = string(prob.Type) + probType = string(prob.Type) logEvent.Error = prob.Error() logEvent.Challenge.Error = prob logEvent.Challenge.Status = core.StatusInvalid } else { logEvent.Challenge.Status = core.StatusValid + outcome = pass + } + // Observe local validation latency (primary|remote). + va.observeLatency(opChallAndCAA, va.perspective, string(chall.Type), probType, outcome, localLatency) + if va.isPrimaryVA() { + // Observe total validation latency (primary+remote). + va.observeLatency(opChallAndCAA, allPerspectives, string(chall.Type), probType, outcome, va.clk.Since(start)) } - va.metrics.localValidationTime.With(prometheus.Labels{ - "type": string(logEvent.Challenge.Type), - "result": string(logEvent.Challenge.Status), - }).Observe(localLatency.Seconds()) - - va.metrics.validationTime.With(prometheus.Labels{ - "type": string(logEvent.Challenge.Type), - "result": string(logEvent.Challenge.Status), - "problem_type": problemType, - }).Observe(time.Since(vStart).Seconds()) - - logEvent.ValidationLatency = time.Since(vStart).Round(time.Millisecond).Seconds() + // Log the total validation latency. + logEvent.ValidationLatency = time.Since(start).Round(time.Millisecond).Seconds() va.log.AuditObject("Validation result", logEvent) }() @@ -705,10 +668,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v ctx, identifier.NewDNS(req.DnsName), req.Authz.RegID, - challenge.Type, - challenge.Token, + chall.Type, + chall.Token, req.ExpectedKeyAuthorization) - localLatency = time.Since(vStart) + + // Stop the clock for local validation latency. + localLatency = va.clk.Since(start) // Check for malformed ValidationRecords logEvent.Challenge.ValidationRecord = records diff --git a/va/va_test.go b/va/va_test.go index 90a949766af..f65d31f0342 100644 --- a/va/va_test.go +++ b/va/va_test.go @@ -257,11 +257,12 @@ func TestPerformValidationInvalid(t *testing.T) { req := createValidationRequest("foo.com", core.ChallengeTypeDNS01) res, _ := va.PerformValidation(context.Background(), req) test.Assert(t, res.Problems != nil, "validation succeeded") - - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "invalid", - "problem_type": "unauthorized", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.UnauthorizedProblem), + "result": fail, }, 1) } @@ -286,10 +287,12 @@ func TestPerformValidationValid(t *testing.T) { res, _ := va.PerformValidation(context.Background(), req) test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems)) - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "valid", - "problem_type": "", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, }, 1) resultLog := mockLog.GetAllMatching(`Validation result`) if len(resultLog) != 1 { @@ -311,10 +314,12 @@ func TestPerformValidationWildcard(t *testing.T) { res, _ := va.PerformValidation(context.Background(), req) test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems)) - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "valid", - "problem_type": "", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, }, 1) resultLog := mockLog.GetAllMatching(`Validation result`) if len(resultLog) != 1 {