From 3baac6f6dff3fb32f4e5384c54c9aa563ceda748 Mon Sep 17 00:00:00 2001 From: Samantha Frank Date: Fri, 15 Nov 2024 15:51:39 -0500 Subject: [PATCH 1/3] VA: Consolidate multiple metrics into one histogram (#7816) - Add a new histogram, validationLatency - Add a VA.observeLatency for observing validation latency - Refactor to ensure this metric can be observed exclusively within VA.PerformValidation and VA.IsCAAValid. - Replace validationTime, localValidationTime, remoteValidationTime, remoteValidationFailures, caaCheckTime, localCAACheckTime, remoteCAACheckTime, and remoteCAACheckFailures with validationLatency --- va/caa.go | 79 ++++++++++------------ va/caa_test.go | 55 ++++++++++++++++ va/dns_test.go | 10 +-- va/va.go | 175 ++++++++++++++++++++----------------------------- va/va_test.go | 31 +++++---- 5 files changed, 185 insertions(+), 165 deletions(-) diff --git a/va/caa.go b/va/caa.go index a3f4d6c2980..da35eb2f4bd 100644 --- a/va/caa.go +++ b/va/caa.go @@ -11,7 +11,6 @@ import ( "time" "github.com/miekg/dns" - "github.com/prometheus/client_golang/prometheus" "github.com/letsencrypt/boulder/bdns" "github.com/letsencrypt/boulder/canceled" @@ -42,19 +41,46 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC Requester: req.AccountURIID, Hostname: req.Domain, } - checkStartTime := va.clk.Now() - validationMethod := core.AcmeChallenge(req.ValidationMethod) - if !validationMethod.IsValid() { + challType := core.AcmeChallenge(req.ValidationMethod) + if !challType.IsValid() { return nil, berrors.InternalServerError("unrecognized validation method %q", req.ValidationMethod) } acmeID := identifier.NewDNS(req.Domain) params := &caaParams{ accountURIID: req.AccountURIID, - validationMethod: validationMethod, + validationMethod: challType, } + var prob *probs.ProblemDetails + var internalErr error + var localLatency time.Duration + start := va.clk.Now() + + defer func() { + probType := "" + outcome := fail + if prob != nil { + // CAA check failed. + probType = string(prob.Type) + logEvent.Error = prob.Error() + } else { + // CAA check passed. + outcome = pass + } + // Observe local check latency (primary|remote). + va.observeLatency(opCAA, va.perspective, string(challType), probType, outcome, localLatency) + if va.isPrimaryVA() { + // Observe total check latency (primary+remote). + va.observeLatency(opCAA, allPerspectives, string(challType), probType, outcome, va.clk.Since(start)) + } + // Log the total check latency. + logEvent.ValidationLatency = va.clk.Since(start).Round(time.Millisecond).Seconds() + + va.log.AuditObject("CAA check result", logEvent) + }() + var remoteCAAResults chan *remoteVAResult if features.Get().EnforceMultiCAA { if remoteVACount := len(va.remoteVAs); remoteVACount > 0 { @@ -63,16 +89,10 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC } } - checkResult := "success" - err := va.checkCAA(ctx, acmeID, params) - localCheckLatency := time.Since(checkStartTime) - var prob *probs.ProblemDetails - if err != nil { - prob = detailedError(err) - logEvent.Error = prob.Error() - logEvent.InternalError = err.Error() + internalErr = va.checkCAA(ctx, acmeID, params) + if internalErr != nil { + prob = detailedError(internalErr) prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", req.Domain, prob.Detail) - checkResult = "failure" } else if remoteCAAResults != nil { if !features.Get().EnforceMultiCAA && features.Get().MultiCAAFullResults { // If we're not going to enforce multi CAA but we are logging the @@ -82,40 +102,24 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC _ = va.processRemoteCAAResults( req.Domain, req.AccountURIID, - string(validationMethod), + string(challType), remoteCAAResults) }() } else if features.Get().EnforceMultiCAA { remoteProb := va.processRemoteCAAResults( req.Domain, req.AccountURIID, - string(validationMethod), + string(challType), remoteCAAResults) // If the remote result was a non-nil problem then fail the CAA check if remoteProb != nil { prob = remoteProb - // We only set .Error here, not InternalError, because the remote VA doesn't send - // us the internal error. But that's okay, because it got logged at the remote VA. - logEvent.Error = remoteProb.Error() - checkResult = "failure" va.log.Infof("CAA check failed due to remote failures: identifier=%v err=%s", req.Domain, remoteProb) - va.metrics.remoteCAACheckFailures.Inc() } } } - checkLatency := time.Since(checkStartTime) - logEvent.ValidationLatency = checkLatency.Round(time.Millisecond).Seconds() - - va.metrics.localCAACheckTime.With(prometheus.Labels{ - "result": checkResult, - }).Observe(localCheckLatency.Seconds()) - va.metrics.caaCheckTime.With(prometheus.Labels{ - "result": checkResult, - }).Observe(checkLatency.Seconds()) - - va.log.AuditObject("CAA check result", logEvent) if prob != nil { // The ProblemDetails will be serialized through gRPC, which requires UTF-8. @@ -154,15 +158,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults( challengeType string, remoteResultsChan <-chan *remoteVAResult) *probs.ProblemDetails { - state := "failure" - start := va.clk.Now() - - defer func() { - va.metrics.remoteCAACheckTime.With(prometheus.Labels{ - "result": state, - }).Observe(va.clk.Since(start).Seconds()) - }() - required := len(va.remoteVAs) - va.maxRemoteFailures good := 0 bad := 0 @@ -190,7 +185,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults( // success or failure threshold is met. if !features.Get().MultiCAAFullResults { if good >= required { - state = "success" return nil } else if bad > va.maxRemoteFailures { modifiedProblem := *result.Problem @@ -217,7 +211,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults( // Based on the threshold of good/bad return nil or a problem. if good >= required { - state = "success" return nil } else if bad > va.maxRemoteFailures { modifiedProblem := *firstProb diff --git a/va/caa_test.go b/va/caa_test.go index 9b9789d57b8..802dd01e004 100644 --- a/va/caa_test.go +++ b/va/caa_test.go @@ -9,6 +9,7 @@ import ( "testing" "github.com/miekg/dns" + "github.com/prometheus/client_golang/prometheus" "github.com/letsencrypt/boulder/bdns" "github.com/letsencrypt/boulder/core" @@ -676,6 +677,7 @@ func TestMultiCAARechecking(t *testing.T) { expectedProbSubstring string expectedProbType probs.ProblemType expectedDiffLogSubstring string + expectedLabels prometheus.Labels localDNSClient bdns.Client }{ { @@ -687,6 +689,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, + }, }, { name: "broken localVA, RVAs functional, no CAA records", @@ -699,6 +708,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "functional localVA, 1 broken RVA, no CAA records", @@ -712,6 +728,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "functional localVA, all broken RVAs, no CAA records", @@ -725,6 +748,13 @@ func TestMultiCAARechecking(t *testing.T) { {brokenVA, brokenUA}, {brokenVA, brokenUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "all VAs functional, CAA issue type present", @@ -735,6 +765,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, + }, }, { name: "functional localVA, 1 broken RVA, CAA issue type present", @@ -748,6 +785,13 @@ func TestMultiCAARechecking(t *testing.T) { {remoteVA, remoteUA}, {remoteVA, remoteUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { name: "functional localVA, all broken RVAs, CAA issue type present", @@ -761,6 +805,13 @@ func TestMultiCAARechecking(t *testing.T) { {brokenVA, brokenUA}, {brokenVA, brokenUA}, }, + expectedLabels: prometheus.Labels{ + "operation": opCAA, + "perspective": allPerspectives, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.DNSProblem), + "result": fail, + }, }, { // The localVA kicks off the background goroutines before doing its @@ -954,6 +1005,10 @@ func TestMultiCAARechecking(t *testing.T) { } else { test.AssertEquals(t, len(gotAnyRemoteFailures), 0) } + + if tc.expectedLabels != nil { + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, tc.expectedLabels, 1) + } }) } } diff --git a/va/dns_test.go b/va/dns_test.go index a545228a47f..d9fc776bd10 100644 --- a/va/dns_test.go +++ b/va/dns_test.go @@ -28,10 +28,12 @@ func TestDNSValidationEmpty(t *testing.T) { test.AssertEquals(t, res.Problems.ProblemType, "unauthorized") test.AssertEquals(t, res.Problems.Detail, "No TXT record found at _acme-challenge.empty-txts.com") - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "invalid", - "problem_type": "unauthorized", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.UnauthorizedProblem), + "result": fail, }, 1) } diff --git a/va/va.go b/va/va.go index d130f4d6a92..22c2a60b601 100644 --- a/va/va.go +++ b/va/va.go @@ -31,7 +31,16 @@ import ( vapb "github.com/letsencrypt/boulder/va/proto" ) -const PrimaryPerspective = "Primary" +const ( + PrimaryPerspective = "Primary" + allPerspectives = "all" + + opChallAndCAA = "challenge+caa" + opCAA = "caa" + + pass = "pass" + fail = "fail" +) var ( // badTLSHeader contains the string 'HTTP /' which is returned when @@ -83,14 +92,14 @@ type RemoteVA struct { } type vaMetrics struct { - validationTime *prometheus.HistogramVec - localValidationTime *prometheus.HistogramVec - remoteValidationTime *prometheus.HistogramVec - remoteValidationFailures prometheus.Counter - caaCheckTime *prometheus.HistogramVec - localCAACheckTime *prometheus.HistogramVec - remoteCAACheckTime *prometheus.HistogramVec - remoteCAACheckFailures prometheus.Counter + // validationLatency is a histogram of the latency to perform validations + // from the primary and remote VA perspectives. It's labelled by: + // - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa|challenge+caa] + // - perspective: ValidationAuthorityImpl.perspective + // - challenge_type: core.Challenge.Type + // - problem_type: probs.ProblemType + // - result: the result of the validation as [pass|fail] + validationLatency *prometheus.HistogramVec prospectiveRemoteCAACheckFailures prometheus.Counter tlsALPNOIDCounter *prometheus.CounterVec http01Fallbacks prometheus.Counter @@ -100,66 +109,15 @@ type vaMetrics struct { } func initMetrics(stats prometheus.Registerer) *vaMetrics { - validationTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "validation_time", - Help: "Total time taken to validate a challenge and aggregate results", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"type", "result", "problem_type"}) - stats.MustRegister(validationTime) - localValidationTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "local_validation_time", - Help: "Time taken to locally validate a challenge", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"type", "result"}) - stats.MustRegister(localValidationTime) - remoteValidationTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "remote_validation_time", - Help: "Time taken to remotely validate a challenge", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"type"}) - stats.MustRegister(remoteValidationTime) - remoteValidationFailures := prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "remote_validation_failures", - Help: "Number of validations failed due to remote VAs returning failure when consensus is enforced", - }) - stats.MustRegister(remoteValidationFailures) - caaCheckTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "caa_check_time", - Help: "Total time taken to check CAA records and aggregate results", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"result"}) - stats.MustRegister(caaCheckTime) - localCAACheckTime := prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Name: "caa_check_time_local", - Help: "Time taken to locally check CAA records", - Buckets: metrics.InternetFacingBuckets, - }, - []string{"result"}) - stats.MustRegister(localCAACheckTime) - remoteCAACheckTime := prometheus.NewHistogramVec( + validationLatency := prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Name: "caa_check_time_remote", - Help: "Time taken to remotely check CAA records", + Name: "validation_latency", + Help: "Histogram of the latency to perform validations from the primary and remote VA perspectives", Buckets: metrics.InternetFacingBuckets, }, - []string{"result"}) - stats.MustRegister(remoteCAACheckTime) - remoteCAACheckFailures := prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "remote_caa_check_failures", - Help: "Number of CAA checks failed due to remote VAs returning failure when consensus is enforced", - }) - stats.MustRegister(remoteCAACheckFailures) + []string{"operation", "perspective", "challenge_type", "problem_type", "result"}, + ) + stats.MustRegister(validationLatency) prospectiveRemoteCAACheckFailures := prometheus.NewCounter( prometheus.CounterOpts{ Name: "prospective_remote_caa_check_failures", @@ -198,14 +156,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics { stats.MustRegister(ipv4FallbackCounter) return &vaMetrics{ - validationTime: validationTime, - remoteValidationTime: remoteValidationTime, - localValidationTime: localValidationTime, - remoteValidationFailures: remoteValidationFailures, - caaCheckTime: caaCheckTime, - localCAACheckTime: localCAACheckTime, - remoteCAACheckTime: remoteCAACheckTime, - remoteCAACheckFailures: remoteCAACheckFailures, + validationLatency: validationLatency, prospectiveRemoteCAACheckFailures: prospectiveRemoteCAACheckFailures, tlsALPNOIDCounter: tlsALPNOIDCounter, http01Fallbacks: http01Fallbacks, @@ -426,6 +377,11 @@ func detailedError(err error) *probs.ProblemDetails { return probs.Connection("Error getting validation data") } +// isPrimaryVA returns true if the VA is the primary validation perspective. +func (va *ValidationAuthorityImpl) isPrimaryVA() bool { + return va.perspective == PrimaryPerspective +} + // validateChallenge simply passes through to the appropriate validation method // depending on the challenge type. func (va *ValidationAuthorityImpl) validateChallenge( @@ -449,6 +405,25 @@ func (va *ValidationAuthorityImpl) validateChallenge( return nil, berrors.MalformedError("invalid challenge type %s", kind) } +// observeLatency records entries in the validationLatency histogram of the +// latency to perform validations from the primary and remote VA perspectives. +// The labels are: +// - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa] +// - perspective: [ValidationAuthorityImpl.perspective|all] +// - challenge_type: core.Challenge.Type +// - problem_type: probs.ProblemType +// - result: the result of the validation as [pass|fail] +func (va *ValidationAuthorityImpl) observeLatency(op, perspective, challType, probType, result string, latency time.Duration) { + labels := prometheus.Labels{ + "operation": op, + "perspective": perspective, + "challenge_type": challType, + "problem_type": probType, + "result": result, + } + va.metrics.validationLatency.With(labels).Observe(latency.Seconds()) +} + // performRemoteValidation coordinates the whole process of kicking off and // collecting results from calls to remote VAs' PerformValidation function. It // returns a problem if too many remote perspectives failed to corroborate @@ -463,13 +438,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( return nil } - start := va.clk.Now() - defer func() { - va.metrics.remoteValidationTime.With(prometheus.Labels{ - "type": req.Challenge.Type, - }).Observe(va.clk.Since(start).Seconds()) - }() - type response struct { addr string result *vapb.ValidationResult @@ -490,8 +458,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( required := remoteVACount - va.maxRemoteFailures var passed []string - // failed contains a list of perspectives that failed to validate the domain - // or the addresses of remote VAs that failed to respond. var failed []string var firstProb *probs.ProblemDetails @@ -534,7 +500,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation( } if len(failed) > va.maxRemoteFailures { // Too many failed responses to reach quorum. - va.metrics.remoteValidationFailures.Inc() firstProb.Detail = fmt.Sprintf("During secondary validation: %s", firstProb.Detail) return firstProb } @@ -649,12 +614,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v return nil, berrors.InternalServerError("Incomplete validation request") } - challenge, err := bgrpc.PBToChallenge(req.Challenge) + chall, err := bgrpc.PBToChallenge(req.Challenge) if err != nil { return nil, errors.New("challenge failed to deserialize") } - err = challenge.CheckPending() + err = chall.CheckPending() if err != nil { return nil, berrors.MalformedError("challenge failed consistency check: %s", err) } @@ -664,36 +629,34 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v // `prob`, or this will fail. var prob *probs.ProblemDetails var localLatency time.Duration - vStart := va.clk.Now() + start := va.clk.Now() logEvent := verificationRequestEvent{ ID: req.Authz.Id, Requester: req.Authz.RegID, Hostname: req.DnsName, - Challenge: challenge, + Challenge: chall, } defer func() { - problemType := "" + probType := "" + outcome := fail if prob != nil { - problemType = string(prob.Type) + probType = string(prob.Type) logEvent.Error = prob.Error() logEvent.Challenge.Error = prob logEvent.Challenge.Status = core.StatusInvalid } else { logEvent.Challenge.Status = core.StatusValid + outcome = pass + } + // Observe local validation latency (primary|remote). + va.observeLatency(opChallAndCAA, va.perspective, string(chall.Type), probType, outcome, localLatency) + if va.isPrimaryVA() { + // Observe total validation latency (primary+remote). + va.observeLatency(opChallAndCAA, allPerspectives, string(chall.Type), probType, outcome, va.clk.Since(start)) } - va.metrics.localValidationTime.With(prometheus.Labels{ - "type": string(logEvent.Challenge.Type), - "result": string(logEvent.Challenge.Status), - }).Observe(localLatency.Seconds()) - - va.metrics.validationTime.With(prometheus.Labels{ - "type": string(logEvent.Challenge.Type), - "result": string(logEvent.Challenge.Status), - "problem_type": problemType, - }).Observe(time.Since(vStart).Seconds()) - - logEvent.ValidationLatency = time.Since(vStart).Round(time.Millisecond).Seconds() + // Log the total validation latency. + logEvent.ValidationLatency = time.Since(start).Round(time.Millisecond).Seconds() va.log.AuditObject("Validation result", logEvent) }() @@ -705,10 +668,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v ctx, identifier.NewDNS(req.DnsName), req.Authz.RegID, - challenge.Type, - challenge.Token, + chall.Type, + chall.Token, req.ExpectedKeyAuthorization) - localLatency = time.Since(vStart) + + // Stop the clock for local validation latency. + localLatency = va.clk.Since(start) // Check for malformed ValidationRecords logEvent.Challenge.ValidationRecord = records diff --git a/va/va_test.go b/va/va_test.go index 90a949766af..f65d31f0342 100644 --- a/va/va_test.go +++ b/va/va_test.go @@ -257,11 +257,12 @@ func TestPerformValidationInvalid(t *testing.T) { req := createValidationRequest("foo.com", core.ChallengeTypeDNS01) res, _ := va.PerformValidation(context.Background(), req) test.Assert(t, res.Problems != nil, "validation succeeded") - - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "invalid", - "problem_type": "unauthorized", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": string(probs.UnauthorizedProblem), + "result": fail, }, 1) } @@ -286,10 +287,12 @@ func TestPerformValidationValid(t *testing.T) { res, _ := va.PerformValidation(context.Background(), req) test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems)) - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "valid", - "problem_type": "", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, }, 1) resultLog := mockLog.GetAllMatching(`Validation result`) if len(resultLog) != 1 { @@ -311,10 +314,12 @@ func TestPerformValidationWildcard(t *testing.T) { res, _ := va.PerformValidation(context.Background(), req) test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems)) - test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{ - "type": "dns-01", - "result": "valid", - "problem_type": "", + test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{ + "operation": opChallAndCAA, + "perspective": PrimaryPerspective, + "challenge_type": string(core.ChallengeTypeDNS01), + "problem_type": "", + "result": pass, }, 1) resultLog := mockLog.GetAllMatching(`Validation result`) if len(resultLog) != 1 { From 3506f0928564a23c16cbc7cb52d7a29ab3a9d1d5 Mon Sep 17 00:00:00 2001 From: Samantha Frank Date: Fri, 15 Nov 2024 16:33:51 -0500 Subject: [PATCH 2/3] RA: Make calls to countCertificateIssued and countFailedValidations synchronous (#7824) Solves CI flakes in TestCertificatesPerDomain and TestIdentifiersPausedForAccount that are the result of a race on the Redis database. This has the downside of making failed validations and successful finalizations take slightly longer. --- ra/ra.go | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ra/ra.go b/ra/ra.go index b6ce09a5589..765456e8838 100644 --- a/ra/ra.go +++ b/ra/ra.go @@ -1358,7 +1358,7 @@ func (ra *RegistrationAuthorityImpl) issueCertificateInner( return nil, nil, wrapError(err, "parsing final certificate") } - go ra.countCertificateIssued(ctx, int64(acctID), slices.Clone(parsedCertificate.DNSNames), isRenewal) + ra.countCertificateIssued(ctx, int64(acctID), slices.Clone(parsedCertificate.DNSNames), isRenewal) // Asynchronously submit the final certificate to any configured logs go ra.ctpolicy.SubmitFinalCert(cert.Der, parsedCertificate.NotAfter) @@ -1998,12 +1998,10 @@ func (ra *RegistrationAuthorityImpl) PerformValidation( if prob != nil { challenge.Status = core.StatusInvalid challenge.Error = prob - go func() { - err := ra.countFailedValidations(vaCtx, authz.RegistrationID, authz.Identifier) - if err != nil { - ra.log.Warningf("incrementing failed validations: %s", err) - } - }() + err := ra.countFailedValidations(vaCtx, authz.RegistrationID, authz.Identifier) + if err != nil { + ra.log.Warningf("incrementing failed validations: %s", err) + } } else { challenge.Status = core.StatusValid if features.Get().AutomaticallyPauseZombieClients { From 20fdcbcfe0f8d02b7866d3718cd0b72fdc2da953 Mon Sep 17 00:00:00 2001 From: Jacob Hoffman-Andrews Date: Fri, 15 Nov 2024 13:45:07 -0800 Subject: [PATCH 3/3] ratelimits: always use a pointer for limit (#7804) The zero value for `limit` is invalid, so returning `nil` in error cases avoids silently returning invalid limits (and means that if code makes a mistake and references an invalid limit it will be an obvious clear stack trace). This is more consistent, since the methods on `limit` use a pointer receiver. Also, since `limit` is a fairly large object, this saves some copying. Related to #7803 and #7797. --- ratelimits/gcra_test.go | 4 ++-- ratelimits/limit.go | 14 +++++++------- ratelimits/limit_test.go | 4 ++-- ratelimits/limiter_test.go | 14 +++++++------- ratelimits/transaction.go | 8 ++++---- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/ratelimits/gcra_test.go b/ratelimits/gcra_test.go index 08c31255d4f..26b2ca0d082 100644 --- a/ratelimits/gcra_test.go +++ b/ratelimits/gcra_test.go @@ -11,7 +11,7 @@ import ( func TestDecide(t *testing.T) { clk := clock.NewFake() - limit := limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}} + limit := &limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}} limit.precompute() // Begin by using 1 of our 10 requests. @@ -138,7 +138,7 @@ func TestDecide(t *testing.T) { func TestMaybeRefund(t *testing.T) { clk := clock.NewFake() - limit := limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}} + limit := &limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}} limit.precompute() // Begin by using 1 of our 10 requests. diff --git a/ratelimits/limit.go b/ratelimits/limit.go index 32969a8d3af..c6999f1aa83 100644 --- a/ratelimits/limit.go +++ b/ratelimits/limit.go @@ -63,7 +63,7 @@ func (l *limit) precompute() { l.burstOffset = l.emissionInterval * l.Burst } -func validateLimit(l limit) error { +func validateLimit(l *limit) error { if l.Burst <= 0 { return fmt.Errorf("invalid burst '%d', must be > 0", l.Burst) } @@ -76,7 +76,7 @@ func validateLimit(l limit) error { return nil } -type limits map[string]limit +type limits map[string]*limit // loadDefaults marshals the defaults YAML file at path into a map of limits. func loadDefaults(path string) (limits, error) { @@ -156,7 +156,8 @@ func loadAndParseOverrideLimits(path string) (limits, error) { for _, ov := range fromFile { for k, v := range ov { - err = validateLimit(v.limit) + limit := &v.limit + err = validateLimit(limit) if err != nil { return nil, fmt.Errorf("validating override limit %q: %w", k, err) } @@ -167,7 +168,6 @@ func loadAndParseOverrideLimits(path string) (limits, error) { v.limit.name = name for _, entry := range v.Ids { - limit := v.limit id := entry.Id err = validateIdForName(name, id) if err != nil { @@ -248,11 +248,11 @@ func newLimitRegistry(defaults, overrides string) (*limitRegistry, error) { // required, bucketKey is optional. If bucketkey is empty, the default for the // limit specified by name is returned. If no default limit exists for the // specified name, errLimitDisabled is returned. -func (l *limitRegistry) getLimit(name Name, bucketKey string) (limit, error) { +func (l *limitRegistry) getLimit(name Name, bucketKey string) (*limit, error) { if !name.isValid() { // This should never happen. Callers should only be specifying the limit // Name enums defined in this package. - return limit{}, fmt.Errorf("specified name enum %q, is invalid", name) + return nil, fmt.Errorf("specified name enum %q, is invalid", name) } if bucketKey != "" { // Check for override. @@ -265,5 +265,5 @@ func (l *limitRegistry) getLimit(name Name, bucketKey string) (limit, error) { if ok { return dl, nil } - return limit{}, errLimitDisabled + return nil, errLimitDisabled } diff --git a/ratelimits/limit_test.go b/ratelimits/limit_test.go index dc9dc87f7ad..805a97e4cca 100644 --- a/ratelimits/limit_test.go +++ b/ratelimits/limit_test.go @@ -42,11 +42,11 @@ func TestParseOverrideNameId(t *testing.T) { } func TestValidateLimit(t *testing.T) { - err := validateLimit(limit{Burst: 1, Count: 1, Period: config.Duration{Duration: time.Second}}) + err := validateLimit(&limit{Burst: 1, Count: 1, Period: config.Duration{Duration: time.Second}}) test.AssertNotError(t, err, "valid limit") // All of the following are invalid. - for _, l := range []limit{ + for _, l := range []*limit{ {Burst: 0, Count: 1, Period: config.Duration{Duration: time.Second}}, {Burst: 1, Count: 0, Period: config.Duration{Duration: time.Second}}, {Burst: 1, Count: 1, Period: config.Duration{Duration: 0}}, diff --git a/ratelimits/limiter_test.go b/ratelimits/limiter_test.go index 41e89c36ad7..82627cfec25 100644 --- a/ratelimits/limiter_test.go +++ b/ratelimits/limiter_test.go @@ -497,7 +497,7 @@ func TestRateLimitError(t *testing.T) { allowed: false, retryIn: 5 * time.Second, transaction: Transaction{ - limit: limit{ + limit: &limit{ name: NewRegistrationsPerIPAddress, Burst: 10, Period: config.Duration{Duration: time.Hour}, @@ -513,7 +513,7 @@ func TestRateLimitError(t *testing.T) { allowed: false, retryIn: 10 * time.Second, transaction: Transaction{ - limit: limit{ + limit: &limit{ name: NewRegistrationsPerIPv6Range, Burst: 5, Period: config.Duration{Duration: time.Hour}, @@ -529,7 +529,7 @@ func TestRateLimitError(t *testing.T) { allowed: false, retryIn: 10 * time.Second, transaction: Transaction{ - limit: limit{ + limit: &limit{ name: NewOrdersPerAccount, Burst: 2, Period: config.Duration{Duration: time.Hour}, @@ -545,7 +545,7 @@ func TestRateLimitError(t *testing.T) { allowed: false, retryIn: 15 * time.Second, transaction: Transaction{ - limit: limit{ + limit: &limit{ name: FailedAuthorizationsPerDomainPerAccount, Burst: 7, Period: config.Duration{Duration: time.Hour}, @@ -562,7 +562,7 @@ func TestRateLimitError(t *testing.T) { allowed: false, retryIn: 20 * time.Second, transaction: Transaction{ - limit: limit{ + limit: &limit{ name: CertificatesPerDomain, Burst: 3, Period: config.Duration{Duration: time.Hour}, @@ -579,7 +579,7 @@ func TestRateLimitError(t *testing.T) { allowed: false, retryIn: 20 * time.Second, transaction: Transaction{ - limit: limit{ + limit: &limit{ name: CertificatesPerDomainPerAccount, Burst: 3, Period: config.Duration{Duration: time.Hour}, @@ -596,7 +596,7 @@ func TestRateLimitError(t *testing.T) { allowed: false, retryIn: 30 * time.Second, transaction: Transaction{ - limit: limit{ + limit: &limit{ name: 9999999, }, }, diff --git a/ratelimits/transaction.go b/ratelimits/transaction.go index fe3afb0a466..fc5df72c547 100644 --- a/ratelimits/transaction.go +++ b/ratelimits/transaction.go @@ -107,7 +107,7 @@ func newFQDNSetBucketKey(name Name, orderNames []string) (string, error) { //nol // it would fail validateTransaction (for instance because cost and burst are zero). type Transaction struct { bucketKey string - limit limit + limit *limit cost int64 check bool spend bool @@ -143,7 +143,7 @@ func validateTransaction(txn Transaction) (Transaction, error) { return txn, nil } -func newTransaction(limit limit, bucketKey string, cost int64) (Transaction, error) { +func newTransaction(limit *limit, bucketKey string, cost int64) (Transaction, error) { return validateTransaction(Transaction{ bucketKey: bucketKey, limit: limit, @@ -153,7 +153,7 @@ func newTransaction(limit limit, bucketKey string, cost int64) (Transaction, err }) } -func newCheckOnlyTransaction(limit limit, bucketKey string, cost int64) (Transaction, error) { +func newCheckOnlyTransaction(limit *limit, bucketKey string, cost int64) (Transaction, error) { return validateTransaction(Transaction{ bucketKey: bucketKey, limit: limit, @@ -162,7 +162,7 @@ func newCheckOnlyTransaction(limit limit, bucketKey string, cost int64) (Transac }) } -func newSpendOnlyTransaction(limit limit, bucketKey string, cost int64) (Transaction, error) { +func newSpendOnlyTransaction(limit *limit, bucketKey string, cost int64) (Transaction, error) { return validateTransaction(Transaction{ bucketKey: bucketKey, limit: limit,