From 3baac6f6dff3fb32f4e5384c54c9aa563ceda748 Mon Sep 17 00:00:00 2001
From: Samantha Frank <hello@entropy.cat>
Date: Fri, 15 Nov 2024 15:51:39 -0500
Subject: [PATCH 1/3] VA: Consolidate multiple metrics into one histogram
 (#7816)

- Add a new histogram, validationLatency
- Add a VA.observeLatency for observing validation latency
- Refactor to ensure this metric can be observed exclusively within
VA.PerformValidation and VA.IsCAAValid.
- Replace validationTime, localValidationTime, remoteValidationTime,
remoteValidationFailures, caaCheckTime, localCAACheckTime,
remoteCAACheckTime, and remoteCAACheckFailures with validationLatency
---
 va/caa.go      |  79 ++++++++++------------
 va/caa_test.go |  55 ++++++++++++++++
 va/dns_test.go |  10 +--
 va/va.go       | 175 ++++++++++++++++++++-----------------------------
 va/va_test.go  |  31 +++++----
 5 files changed, 185 insertions(+), 165 deletions(-)

diff --git a/va/caa.go b/va/caa.go
index a3f4d6c2980..da35eb2f4bd 100644
--- a/va/caa.go
+++ b/va/caa.go
@@ -11,7 +11,6 @@ import (
 	"time"
 
 	"github.com/miekg/dns"
-	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/letsencrypt/boulder/bdns"
 	"github.com/letsencrypt/boulder/canceled"
@@ -42,19 +41,46 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
 		Requester: req.AccountURIID,
 		Hostname:  req.Domain,
 	}
-	checkStartTime := va.clk.Now()
 
-	validationMethod := core.AcmeChallenge(req.ValidationMethod)
-	if !validationMethod.IsValid() {
+	challType := core.AcmeChallenge(req.ValidationMethod)
+	if !challType.IsValid() {
 		return nil, berrors.InternalServerError("unrecognized validation method %q", req.ValidationMethod)
 	}
 
 	acmeID := identifier.NewDNS(req.Domain)
 	params := &caaParams{
 		accountURIID:     req.AccountURIID,
-		validationMethod: validationMethod,
+		validationMethod: challType,
 	}
 
+	var prob *probs.ProblemDetails
+	var internalErr error
+	var localLatency time.Duration
+	start := va.clk.Now()
+
+	defer func() {
+		probType := ""
+		outcome := fail
+		if prob != nil {
+			// CAA check failed.
+			probType = string(prob.Type)
+			logEvent.Error = prob.Error()
+		} else {
+			// CAA check passed.
+			outcome = pass
+		}
+		// Observe local check latency (primary|remote).
+		va.observeLatency(opCAA, va.perspective, string(challType), probType, outcome, localLatency)
+		if va.isPrimaryVA() {
+			// Observe total check latency (primary+remote).
+			va.observeLatency(opCAA, allPerspectives, string(challType), probType, outcome, va.clk.Since(start))
+		}
+		// Log the total check latency.
+		logEvent.ValidationLatency = va.clk.Since(start).Round(time.Millisecond).Seconds()
+
+		va.log.AuditObject("CAA check result", logEvent)
+	}()
+
 	var remoteCAAResults chan *remoteVAResult
 	if features.Get().EnforceMultiCAA {
 		if remoteVACount := len(va.remoteVAs); remoteVACount > 0 {
@@ -63,16 +89,10 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
 		}
 	}
 
-	checkResult := "success"
-	err := va.checkCAA(ctx, acmeID, params)
-	localCheckLatency := time.Since(checkStartTime)
-	var prob *probs.ProblemDetails
-	if err != nil {
-		prob = detailedError(err)
-		logEvent.Error = prob.Error()
-		logEvent.InternalError = err.Error()
+	internalErr = va.checkCAA(ctx, acmeID, params)
+	if internalErr != nil {
+		prob = detailedError(internalErr)
 		prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", req.Domain, prob.Detail)
-		checkResult = "failure"
 	} else if remoteCAAResults != nil {
 		if !features.Get().EnforceMultiCAA && features.Get().MultiCAAFullResults {
 			// If we're not going to enforce multi CAA but we are logging the
@@ -82,40 +102,24 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
 				_ = va.processRemoteCAAResults(
 					req.Domain,
 					req.AccountURIID,
-					string(validationMethod),
+					string(challType),
 					remoteCAAResults)
 			}()
 		} else if features.Get().EnforceMultiCAA {
 			remoteProb := va.processRemoteCAAResults(
 				req.Domain,
 				req.AccountURIID,
-				string(validationMethod),
+				string(challType),
 				remoteCAAResults)
 
 			// If the remote result was a non-nil problem then fail the CAA check
 			if remoteProb != nil {
 				prob = remoteProb
-				// We only set .Error here, not InternalError, because the remote VA doesn't send
-				// us the internal error. But that's okay, because it got logged at the remote VA.
-				logEvent.Error = remoteProb.Error()
-				checkResult = "failure"
 				va.log.Infof("CAA check failed due to remote failures: identifier=%v err=%s",
 					req.Domain, remoteProb)
-				va.metrics.remoteCAACheckFailures.Inc()
 			}
 		}
 	}
-	checkLatency := time.Since(checkStartTime)
-	logEvent.ValidationLatency = checkLatency.Round(time.Millisecond).Seconds()
-
-	va.metrics.localCAACheckTime.With(prometheus.Labels{
-		"result": checkResult,
-	}).Observe(localCheckLatency.Seconds())
-	va.metrics.caaCheckTime.With(prometheus.Labels{
-		"result": checkResult,
-	}).Observe(checkLatency.Seconds())
-
-	va.log.AuditObject("CAA check result", logEvent)
 
 	if prob != nil {
 		// The ProblemDetails will be serialized through gRPC, which requires UTF-8.
@@ -154,15 +158,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
 	challengeType string,
 	remoteResultsChan <-chan *remoteVAResult) *probs.ProblemDetails {
 
-	state := "failure"
-	start := va.clk.Now()
-
-	defer func() {
-		va.metrics.remoteCAACheckTime.With(prometheus.Labels{
-			"result": state,
-		}).Observe(va.clk.Since(start).Seconds())
-	}()
-
 	required := len(va.remoteVAs) - va.maxRemoteFailures
 	good := 0
 	bad := 0
@@ -190,7 +185,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
 		// success or failure threshold is met.
 		if !features.Get().MultiCAAFullResults {
 			if good >= required {
-				state = "success"
 				return nil
 			} else if bad > va.maxRemoteFailures {
 				modifiedProblem := *result.Problem
@@ -217,7 +211,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
 
 	// Based on the threshold of good/bad return nil or a problem.
 	if good >= required {
-		state = "success"
 		return nil
 	} else if bad > va.maxRemoteFailures {
 		modifiedProblem := *firstProb
diff --git a/va/caa_test.go b/va/caa_test.go
index 9b9789d57b8..802dd01e004 100644
--- a/va/caa_test.go
+++ b/va/caa_test.go
@@ -9,6 +9,7 @@ import (
 	"testing"
 
 	"github.com/miekg/dns"
+	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/letsencrypt/boulder/bdns"
 	"github.com/letsencrypt/boulder/core"
@@ -676,6 +677,7 @@ func TestMultiCAARechecking(t *testing.T) {
 		expectedProbSubstring    string
 		expectedProbType         probs.ProblemType
 		expectedDiffLogSubstring string
+		expectedLabels           prometheus.Labels
 		localDNSClient           bdns.Client
 	}{
 		{
@@ -687,6 +689,13 @@ func TestMultiCAARechecking(t *testing.T) {
 				{remoteVA, remoteUA},
 				{remoteVA, remoteUA},
 			},
+			expectedLabels: prometheus.Labels{
+				"operation":      opCAA,
+				"perspective":    allPerspectives,
+				"challenge_type": string(core.ChallengeTypeDNS01),
+				"problem_type":   "",
+				"result":         pass,
+			},
 		},
 		{
 			name:                  "broken localVA, RVAs functional, no CAA records",
@@ -699,6 +708,13 @@ func TestMultiCAARechecking(t *testing.T) {
 				{remoteVA, remoteUA},
 				{remoteVA, remoteUA},
 			},
+			expectedLabels: prometheus.Labels{
+				"operation":      opCAA,
+				"perspective":    allPerspectives,
+				"challenge_type": string(core.ChallengeTypeDNS01),
+				"problem_type":   string(probs.DNSProblem),
+				"result":         fail,
+			},
 		},
 		{
 			name:                     "functional localVA, 1 broken RVA, no CAA records",
@@ -712,6 +728,13 @@ func TestMultiCAARechecking(t *testing.T) {
 				{remoteVA, remoteUA},
 				{remoteVA, remoteUA},
 			},
+			expectedLabels: prometheus.Labels{
+				"operation":      opCAA,
+				"perspective":    allPerspectives,
+				"challenge_type": string(core.ChallengeTypeDNS01),
+				"problem_type":   string(probs.DNSProblem),
+				"result":         fail,
+			},
 		},
 		{
 			name:                     "functional localVA, all broken RVAs, no CAA records",
@@ -725,6 +748,13 @@ func TestMultiCAARechecking(t *testing.T) {
 				{brokenVA, brokenUA},
 				{brokenVA, brokenUA},
 			},
+			expectedLabels: prometheus.Labels{
+				"operation":      opCAA,
+				"perspective":    allPerspectives,
+				"challenge_type": string(core.ChallengeTypeDNS01),
+				"problem_type":   string(probs.DNSProblem),
+				"result":         fail,
+			},
 		},
 		{
 			name:           "all VAs functional, CAA issue type present",
@@ -735,6 +765,13 @@ func TestMultiCAARechecking(t *testing.T) {
 				{remoteVA, remoteUA},
 				{remoteVA, remoteUA},
 			},
+			expectedLabels: prometheus.Labels{
+				"operation":      opCAA,
+				"perspective":    allPerspectives,
+				"challenge_type": string(core.ChallengeTypeDNS01),
+				"problem_type":   "",
+				"result":         pass,
+			},
 		},
 		{
 			name:                     "functional localVA, 1 broken RVA, CAA issue type present",
@@ -748,6 +785,13 @@ func TestMultiCAARechecking(t *testing.T) {
 				{remoteVA, remoteUA},
 				{remoteVA, remoteUA},
 			},
+			expectedLabels: prometheus.Labels{
+				"operation":      opCAA,
+				"perspective":    allPerspectives,
+				"challenge_type": string(core.ChallengeTypeDNS01),
+				"problem_type":   string(probs.DNSProblem),
+				"result":         fail,
+			},
 		},
 		{
 			name:                     "functional localVA, all broken RVAs, CAA issue type present",
@@ -761,6 +805,13 @@ func TestMultiCAARechecking(t *testing.T) {
 				{brokenVA, brokenUA},
 				{brokenVA, brokenUA},
 			},
+			expectedLabels: prometheus.Labels{
+				"operation":      opCAA,
+				"perspective":    allPerspectives,
+				"challenge_type": string(core.ChallengeTypeDNS01),
+				"problem_type":   string(probs.DNSProblem),
+				"result":         fail,
+			},
 		},
 		{
 			// The localVA kicks off the background goroutines before doing its
@@ -954,6 +1005,10 @@ func TestMultiCAARechecking(t *testing.T) {
 			} else {
 				test.AssertEquals(t, len(gotAnyRemoteFailures), 0)
 			}
+
+			if tc.expectedLabels != nil {
+				test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, tc.expectedLabels, 1)
+			}
 		})
 	}
 }
diff --git a/va/dns_test.go b/va/dns_test.go
index a545228a47f..d9fc776bd10 100644
--- a/va/dns_test.go
+++ b/va/dns_test.go
@@ -28,10 +28,12 @@ func TestDNSValidationEmpty(t *testing.T) {
 	test.AssertEquals(t, res.Problems.ProblemType, "unauthorized")
 	test.AssertEquals(t, res.Problems.Detail, "No TXT record found at _acme-challenge.empty-txts.com")
 
-	test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
-		"type":         "dns-01",
-		"result":       "invalid",
-		"problem_type": "unauthorized",
+	test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
+		"operation":      opChallAndCAA,
+		"perspective":    PrimaryPerspective,
+		"challenge_type": string(core.ChallengeTypeDNS01),
+		"problem_type":   string(probs.UnauthorizedProblem),
+		"result":         fail,
 	}, 1)
 }
 
diff --git a/va/va.go b/va/va.go
index d130f4d6a92..22c2a60b601 100644
--- a/va/va.go
+++ b/va/va.go
@@ -31,7 +31,16 @@ import (
 	vapb "github.com/letsencrypt/boulder/va/proto"
 )
 
-const PrimaryPerspective = "Primary"
+const (
+	PrimaryPerspective = "Primary"
+	allPerspectives    = "all"
+
+	opChallAndCAA = "challenge+caa"
+	opCAA         = "caa"
+
+	pass = "pass"
+	fail = "fail"
+)
 
 var (
 	// badTLSHeader contains the string 'HTTP /' which is returned when
@@ -83,14 +92,14 @@ type RemoteVA struct {
 }
 
 type vaMetrics struct {
-	validationTime                    *prometheus.HistogramVec
-	localValidationTime               *prometheus.HistogramVec
-	remoteValidationTime              *prometheus.HistogramVec
-	remoteValidationFailures          prometheus.Counter
-	caaCheckTime                      *prometheus.HistogramVec
-	localCAACheckTime                 *prometheus.HistogramVec
-	remoteCAACheckTime                *prometheus.HistogramVec
-	remoteCAACheckFailures            prometheus.Counter
+	// validationLatency is a histogram of the latency to perform validations
+	// from the primary and remote VA perspectives. It's labelled by:
+	//   - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa|challenge+caa]
+	//   - perspective: ValidationAuthorityImpl.perspective
+	//   - challenge_type: core.Challenge.Type
+	//   - problem_type: probs.ProblemType
+	//   - result: the result of the validation as [pass|fail]
+	validationLatency                 *prometheus.HistogramVec
 	prospectiveRemoteCAACheckFailures prometheus.Counter
 	tlsALPNOIDCounter                 *prometheus.CounterVec
 	http01Fallbacks                   prometheus.Counter
@@ -100,66 +109,15 @@ type vaMetrics struct {
 }
 
 func initMetrics(stats prometheus.Registerer) *vaMetrics {
-	validationTime := prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Name:    "validation_time",
-			Help:    "Total time taken to validate a challenge and aggregate results",
-			Buckets: metrics.InternetFacingBuckets,
-		},
-		[]string{"type", "result", "problem_type"})
-	stats.MustRegister(validationTime)
-	localValidationTime := prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Name:    "local_validation_time",
-			Help:    "Time taken to locally validate a challenge",
-			Buckets: metrics.InternetFacingBuckets,
-		},
-		[]string{"type", "result"})
-	stats.MustRegister(localValidationTime)
-	remoteValidationTime := prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Name:    "remote_validation_time",
-			Help:    "Time taken to remotely validate a challenge",
-			Buckets: metrics.InternetFacingBuckets,
-		},
-		[]string{"type"})
-	stats.MustRegister(remoteValidationTime)
-	remoteValidationFailures := prometheus.NewCounter(
-		prometheus.CounterOpts{
-			Name: "remote_validation_failures",
-			Help: "Number of validations failed due to remote VAs returning failure when consensus is enforced",
-		})
-	stats.MustRegister(remoteValidationFailures)
-	caaCheckTime := prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Name:    "caa_check_time",
-			Help:    "Total time taken to check CAA records and aggregate results",
-			Buckets: metrics.InternetFacingBuckets,
-		},
-		[]string{"result"})
-	stats.MustRegister(caaCheckTime)
-	localCAACheckTime := prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Name:    "caa_check_time_local",
-			Help:    "Time taken to locally check CAA records",
-			Buckets: metrics.InternetFacingBuckets,
-		},
-		[]string{"result"})
-	stats.MustRegister(localCAACheckTime)
-	remoteCAACheckTime := prometheus.NewHistogramVec(
+	validationLatency := prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
-			Name:    "caa_check_time_remote",
-			Help:    "Time taken to remotely check CAA records",
+			Name:    "validation_latency",
+			Help:    "Histogram of the latency to perform validations from the primary and remote VA perspectives",
 			Buckets: metrics.InternetFacingBuckets,
 		},
-		[]string{"result"})
-	stats.MustRegister(remoteCAACheckTime)
-	remoteCAACheckFailures := prometheus.NewCounter(
-		prometheus.CounterOpts{
-			Name: "remote_caa_check_failures",
-			Help: "Number of CAA checks failed due to remote VAs returning failure when consensus is enforced",
-		})
-	stats.MustRegister(remoteCAACheckFailures)
+		[]string{"operation", "perspective", "challenge_type", "problem_type", "result"},
+	)
+	stats.MustRegister(validationLatency)
 	prospectiveRemoteCAACheckFailures := prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Name: "prospective_remote_caa_check_failures",
@@ -198,14 +156,7 @@ func initMetrics(stats prometheus.Registerer) *vaMetrics {
 	stats.MustRegister(ipv4FallbackCounter)
 
 	return &vaMetrics{
-		validationTime:                    validationTime,
-		remoteValidationTime:              remoteValidationTime,
-		localValidationTime:               localValidationTime,
-		remoteValidationFailures:          remoteValidationFailures,
-		caaCheckTime:                      caaCheckTime,
-		localCAACheckTime:                 localCAACheckTime,
-		remoteCAACheckTime:                remoteCAACheckTime,
-		remoteCAACheckFailures:            remoteCAACheckFailures,
+		validationLatency:                 validationLatency,
 		prospectiveRemoteCAACheckFailures: prospectiveRemoteCAACheckFailures,
 		tlsALPNOIDCounter:                 tlsALPNOIDCounter,
 		http01Fallbacks:                   http01Fallbacks,
@@ -426,6 +377,11 @@ func detailedError(err error) *probs.ProblemDetails {
 	return probs.Connection("Error getting validation data")
 }
 
+// isPrimaryVA returns true if the VA is the primary validation perspective.
+func (va *ValidationAuthorityImpl) isPrimaryVA() bool {
+	return va.perspective == PrimaryPerspective
+}
+
 // validateChallenge simply passes through to the appropriate validation method
 // depending on the challenge type.
 func (va *ValidationAuthorityImpl) validateChallenge(
@@ -449,6 +405,25 @@ func (va *ValidationAuthorityImpl) validateChallenge(
 	return nil, berrors.MalformedError("invalid challenge type %s", kind)
 }
 
+// observeLatency records entries in the validationLatency histogram of the
+// latency to perform validations from the primary and remote VA perspectives.
+// The labels are:
+//   - operation: VA.ValidateChallenge or VA.CheckCAA as [challenge|caa]
+//   - perspective: [ValidationAuthorityImpl.perspective|all]
+//   - challenge_type: core.Challenge.Type
+//   - problem_type: probs.ProblemType
+//   - result: the result of the validation as [pass|fail]
+func (va *ValidationAuthorityImpl) observeLatency(op, perspective, challType, probType, result string, latency time.Duration) {
+	labels := prometheus.Labels{
+		"operation":      op,
+		"perspective":    perspective,
+		"challenge_type": challType,
+		"problem_type":   probType,
+		"result":         result,
+	}
+	va.metrics.validationLatency.With(labels).Observe(latency.Seconds())
+}
+
 // performRemoteValidation coordinates the whole process of kicking off and
 // collecting results from calls to remote VAs' PerformValidation function. It
 // returns a problem if too many remote perspectives failed to corroborate
@@ -463,13 +438,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(
 		return nil
 	}
 
-	start := va.clk.Now()
-	defer func() {
-		va.metrics.remoteValidationTime.With(prometheus.Labels{
-			"type": req.Challenge.Type,
-		}).Observe(va.clk.Since(start).Seconds())
-	}()
-
 	type response struct {
 		addr   string
 		result *vapb.ValidationResult
@@ -490,8 +458,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(
 
 	required := remoteVACount - va.maxRemoteFailures
 	var passed []string
-	// failed contains a list of perspectives that failed to validate the domain
-	// or the addresses of remote VAs that failed to respond.
 	var failed []string
 	var firstProb *probs.ProblemDetails
 
@@ -534,7 +500,6 @@ func (va *ValidationAuthorityImpl) performRemoteValidation(
 		}
 		if len(failed) > va.maxRemoteFailures {
 			// Too many failed responses to reach quorum.
-			va.metrics.remoteValidationFailures.Inc()
 			firstProb.Detail = fmt.Sprintf("During secondary validation: %s", firstProb.Detail)
 			return firstProb
 		}
@@ -649,12 +614,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v
 		return nil, berrors.InternalServerError("Incomplete validation request")
 	}
 
-	challenge, err := bgrpc.PBToChallenge(req.Challenge)
+	chall, err := bgrpc.PBToChallenge(req.Challenge)
 	if err != nil {
 		return nil, errors.New("challenge failed to deserialize")
 	}
 
-	err = challenge.CheckPending()
+	err = chall.CheckPending()
 	if err != nil {
 		return nil, berrors.MalformedError("challenge failed consistency check: %s", err)
 	}
@@ -664,36 +629,34 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v
 	// `prob`, or this will fail.
 	var prob *probs.ProblemDetails
 	var localLatency time.Duration
-	vStart := va.clk.Now()
+	start := va.clk.Now()
 	logEvent := verificationRequestEvent{
 		ID:        req.Authz.Id,
 		Requester: req.Authz.RegID,
 		Hostname:  req.DnsName,
-		Challenge: challenge,
+		Challenge: chall,
 	}
 	defer func() {
-		problemType := ""
+		probType := ""
+		outcome := fail
 		if prob != nil {
-			problemType = string(prob.Type)
+			probType = string(prob.Type)
 			logEvent.Error = prob.Error()
 			logEvent.Challenge.Error = prob
 			logEvent.Challenge.Status = core.StatusInvalid
 		} else {
 			logEvent.Challenge.Status = core.StatusValid
+			outcome = pass
+		}
+		// Observe local validation latency (primary|remote).
+		va.observeLatency(opChallAndCAA, va.perspective, string(chall.Type), probType, outcome, localLatency)
+		if va.isPrimaryVA() {
+			// Observe total validation latency (primary+remote).
+			va.observeLatency(opChallAndCAA, allPerspectives, string(chall.Type), probType, outcome, va.clk.Since(start))
 		}
 
-		va.metrics.localValidationTime.With(prometheus.Labels{
-			"type":   string(logEvent.Challenge.Type),
-			"result": string(logEvent.Challenge.Status),
-		}).Observe(localLatency.Seconds())
-
-		va.metrics.validationTime.With(prometheus.Labels{
-			"type":         string(logEvent.Challenge.Type),
-			"result":       string(logEvent.Challenge.Status),
-			"problem_type": problemType,
-		}).Observe(time.Since(vStart).Seconds())
-
-		logEvent.ValidationLatency = time.Since(vStart).Round(time.Millisecond).Seconds()
+		// Log the total validation latency.
+		logEvent.ValidationLatency = time.Since(start).Round(time.Millisecond).Seconds()
 		va.log.AuditObject("Validation result", logEvent)
 	}()
 
@@ -705,10 +668,12 @@ func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *v
 		ctx,
 		identifier.NewDNS(req.DnsName),
 		req.Authz.RegID,
-		challenge.Type,
-		challenge.Token,
+		chall.Type,
+		chall.Token,
 		req.ExpectedKeyAuthorization)
-	localLatency = time.Since(vStart)
+
+	// Stop the clock for local validation latency.
+	localLatency = va.clk.Since(start)
 
 	// Check for malformed ValidationRecords
 	logEvent.Challenge.ValidationRecord = records
diff --git a/va/va_test.go b/va/va_test.go
index 90a949766af..f65d31f0342 100644
--- a/va/va_test.go
+++ b/va/va_test.go
@@ -257,11 +257,12 @@ func TestPerformValidationInvalid(t *testing.T) {
 	req := createValidationRequest("foo.com", core.ChallengeTypeDNS01)
 	res, _ := va.PerformValidation(context.Background(), req)
 	test.Assert(t, res.Problems != nil, "validation succeeded")
-
-	test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
-		"type":         "dns-01",
-		"result":       "invalid",
-		"problem_type": "unauthorized",
+	test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
+		"operation":      opChallAndCAA,
+		"perspective":    PrimaryPerspective,
+		"challenge_type": string(core.ChallengeTypeDNS01),
+		"problem_type":   string(probs.UnauthorizedProblem),
+		"result":         fail,
 	}, 1)
 }
 
@@ -286,10 +287,12 @@ func TestPerformValidationValid(t *testing.T) {
 	res, _ := va.PerformValidation(context.Background(), req)
 	test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems))
 
-	test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
-		"type":         "dns-01",
-		"result":       "valid",
-		"problem_type": "",
+	test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
+		"operation":      opChallAndCAA,
+		"perspective":    PrimaryPerspective,
+		"challenge_type": string(core.ChallengeTypeDNS01),
+		"problem_type":   "",
+		"result":         pass,
 	}, 1)
 	resultLog := mockLog.GetAllMatching(`Validation result`)
 	if len(resultLog) != 1 {
@@ -311,10 +314,12 @@ func TestPerformValidationWildcard(t *testing.T) {
 	res, _ := va.PerformValidation(context.Background(), req)
 	test.Assert(t, res.Problems == nil, fmt.Sprintf("validation failed: %#v", res.Problems))
 
-	test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
-		"type":         "dns-01",
-		"result":       "valid",
-		"problem_type": "",
+	test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
+		"operation":      opChallAndCAA,
+		"perspective":    PrimaryPerspective,
+		"challenge_type": string(core.ChallengeTypeDNS01),
+		"problem_type":   "",
+		"result":         pass,
 	}, 1)
 	resultLog := mockLog.GetAllMatching(`Validation result`)
 	if len(resultLog) != 1 {

From 3506f0928564a23c16cbc7cb52d7a29ab3a9d1d5 Mon Sep 17 00:00:00 2001
From: Samantha Frank <hello@entropy.cat>
Date: Fri, 15 Nov 2024 16:33:51 -0500
Subject: [PATCH 2/3] RA: Make calls to countCertificateIssued and
 countFailedValidations synchronous (#7824)

Solves CI flakes in TestCertificatesPerDomain and
TestIdentifiersPausedForAccount that are the result of a race on the
Redis database. This has the downside of making failed validations and
successful finalizations take slightly longer.
---
 ra/ra.go | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/ra/ra.go b/ra/ra.go
index b6ce09a5589..765456e8838 100644
--- a/ra/ra.go
+++ b/ra/ra.go
@@ -1358,7 +1358,7 @@ func (ra *RegistrationAuthorityImpl) issueCertificateInner(
 		return nil, nil, wrapError(err, "parsing final certificate")
 	}
 
-	go ra.countCertificateIssued(ctx, int64(acctID), slices.Clone(parsedCertificate.DNSNames), isRenewal)
+	ra.countCertificateIssued(ctx, int64(acctID), slices.Clone(parsedCertificate.DNSNames), isRenewal)
 
 	// Asynchronously submit the final certificate to any configured logs
 	go ra.ctpolicy.SubmitFinalCert(cert.Der, parsedCertificate.NotAfter)
@@ -1998,12 +1998,10 @@ func (ra *RegistrationAuthorityImpl) PerformValidation(
 		if prob != nil {
 			challenge.Status = core.StatusInvalid
 			challenge.Error = prob
-			go func() {
-				err := ra.countFailedValidations(vaCtx, authz.RegistrationID, authz.Identifier)
-				if err != nil {
-					ra.log.Warningf("incrementing failed validations: %s", err)
-				}
-			}()
+			err := ra.countFailedValidations(vaCtx, authz.RegistrationID, authz.Identifier)
+			if err != nil {
+				ra.log.Warningf("incrementing failed validations: %s", err)
+			}
 		} else {
 			challenge.Status = core.StatusValid
 			if features.Get().AutomaticallyPauseZombieClients {

From 20fdcbcfe0f8d02b7866d3718cd0b72fdc2da953 Mon Sep 17 00:00:00 2001
From: Jacob Hoffman-Andrews <jsha+github@letsencrypt.org>
Date: Fri, 15 Nov 2024 13:45:07 -0800
Subject: [PATCH 3/3] ratelimits: always use a pointer for limit (#7804)

The zero value for `limit` is invalid, so returning `nil` in error cases
avoids silently returning invalid limits (and means that if code makes a
mistake and references an invalid limit it will be an obvious clear
stack trace).

This is more consistent, since the methods on `limit` use a pointer
receiver. Also, since `limit` is a fairly large object, this saves some
copying.

Related to #7803 and #7797.
---
 ratelimits/gcra_test.go    |  4 ++--
 ratelimits/limit.go        | 14 +++++++-------
 ratelimits/limit_test.go   |  4 ++--
 ratelimits/limiter_test.go | 14 +++++++-------
 ratelimits/transaction.go  |  8 ++++----
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/ratelimits/gcra_test.go b/ratelimits/gcra_test.go
index 08c31255d4f..26b2ca0d082 100644
--- a/ratelimits/gcra_test.go
+++ b/ratelimits/gcra_test.go
@@ -11,7 +11,7 @@ import (
 
 func TestDecide(t *testing.T) {
 	clk := clock.NewFake()
-	limit := limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}}
+	limit := &limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}}
 	limit.precompute()
 
 	// Begin by using 1 of our 10 requests.
@@ -138,7 +138,7 @@ func TestDecide(t *testing.T) {
 
 func TestMaybeRefund(t *testing.T) {
 	clk := clock.NewFake()
-	limit := limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}}
+	limit := &limit{Burst: 10, Count: 1, Period: config.Duration{Duration: time.Second}}
 	limit.precompute()
 
 	// Begin by using 1 of our 10 requests.
diff --git a/ratelimits/limit.go b/ratelimits/limit.go
index 32969a8d3af..c6999f1aa83 100644
--- a/ratelimits/limit.go
+++ b/ratelimits/limit.go
@@ -63,7 +63,7 @@ func (l *limit) precompute() {
 	l.burstOffset = l.emissionInterval * l.Burst
 }
 
-func validateLimit(l limit) error {
+func validateLimit(l *limit) error {
 	if l.Burst <= 0 {
 		return fmt.Errorf("invalid burst '%d', must be > 0", l.Burst)
 	}
@@ -76,7 +76,7 @@ func validateLimit(l limit) error {
 	return nil
 }
 
-type limits map[string]limit
+type limits map[string]*limit
 
 // loadDefaults marshals the defaults YAML file at path into a map of limits.
 func loadDefaults(path string) (limits, error) {
@@ -156,7 +156,8 @@ func loadAndParseOverrideLimits(path string) (limits, error) {
 
 	for _, ov := range fromFile {
 		for k, v := range ov {
-			err = validateLimit(v.limit)
+			limit := &v.limit
+			err = validateLimit(limit)
 			if err != nil {
 				return nil, fmt.Errorf("validating override limit %q: %w", k, err)
 			}
@@ -167,7 +168,6 @@ func loadAndParseOverrideLimits(path string) (limits, error) {
 			v.limit.name = name
 
 			for _, entry := range v.Ids {
-				limit := v.limit
 				id := entry.Id
 				err = validateIdForName(name, id)
 				if err != nil {
@@ -248,11 +248,11 @@ func newLimitRegistry(defaults, overrides string) (*limitRegistry, error) {
 // required, bucketKey is optional. If bucketkey is empty, the default for the
 // limit specified by name is returned. If no default limit exists for the
 // specified name, errLimitDisabled is returned.
-func (l *limitRegistry) getLimit(name Name, bucketKey string) (limit, error) {
+func (l *limitRegistry) getLimit(name Name, bucketKey string) (*limit, error) {
 	if !name.isValid() {
 		// This should never happen. Callers should only be specifying the limit
 		// Name enums defined in this package.
-		return limit{}, fmt.Errorf("specified name enum %q, is invalid", name)
+		return nil, fmt.Errorf("specified name enum %q, is invalid", name)
 	}
 	if bucketKey != "" {
 		// Check for override.
@@ -265,5 +265,5 @@ func (l *limitRegistry) getLimit(name Name, bucketKey string) (limit, error) {
 	if ok {
 		return dl, nil
 	}
-	return limit{}, errLimitDisabled
+	return nil, errLimitDisabled
 }
diff --git a/ratelimits/limit_test.go b/ratelimits/limit_test.go
index dc9dc87f7ad..805a97e4cca 100644
--- a/ratelimits/limit_test.go
+++ b/ratelimits/limit_test.go
@@ -42,11 +42,11 @@ func TestParseOverrideNameId(t *testing.T) {
 }
 
 func TestValidateLimit(t *testing.T) {
-	err := validateLimit(limit{Burst: 1, Count: 1, Period: config.Duration{Duration: time.Second}})
+	err := validateLimit(&limit{Burst: 1, Count: 1, Period: config.Duration{Duration: time.Second}})
 	test.AssertNotError(t, err, "valid limit")
 
 	// All of the following are invalid.
-	for _, l := range []limit{
+	for _, l := range []*limit{
 		{Burst: 0, Count: 1, Period: config.Duration{Duration: time.Second}},
 		{Burst: 1, Count: 0, Period: config.Duration{Duration: time.Second}},
 		{Burst: 1, Count: 1, Period: config.Duration{Duration: 0}},
diff --git a/ratelimits/limiter_test.go b/ratelimits/limiter_test.go
index 41e89c36ad7..82627cfec25 100644
--- a/ratelimits/limiter_test.go
+++ b/ratelimits/limiter_test.go
@@ -497,7 +497,7 @@ func TestRateLimitError(t *testing.T) {
 				allowed: false,
 				retryIn: 5 * time.Second,
 				transaction: Transaction{
-					limit: limit{
+					limit: &limit{
 						name:   NewRegistrationsPerIPAddress,
 						Burst:  10,
 						Period: config.Duration{Duration: time.Hour},
@@ -513,7 +513,7 @@ func TestRateLimitError(t *testing.T) {
 				allowed: false,
 				retryIn: 10 * time.Second,
 				transaction: Transaction{
-					limit: limit{
+					limit: &limit{
 						name:   NewRegistrationsPerIPv6Range,
 						Burst:  5,
 						Period: config.Duration{Duration: time.Hour},
@@ -529,7 +529,7 @@ func TestRateLimitError(t *testing.T) {
 				allowed: false,
 				retryIn: 10 * time.Second,
 				transaction: Transaction{
-					limit: limit{
+					limit: &limit{
 						name:   NewOrdersPerAccount,
 						Burst:  2,
 						Period: config.Duration{Duration: time.Hour},
@@ -545,7 +545,7 @@ func TestRateLimitError(t *testing.T) {
 				allowed: false,
 				retryIn: 15 * time.Second,
 				transaction: Transaction{
-					limit: limit{
+					limit: &limit{
 						name:   FailedAuthorizationsPerDomainPerAccount,
 						Burst:  7,
 						Period: config.Duration{Duration: time.Hour},
@@ -562,7 +562,7 @@ func TestRateLimitError(t *testing.T) {
 				allowed: false,
 				retryIn: 20 * time.Second,
 				transaction: Transaction{
-					limit: limit{
+					limit: &limit{
 						name:   CertificatesPerDomain,
 						Burst:  3,
 						Period: config.Duration{Duration: time.Hour},
@@ -579,7 +579,7 @@ func TestRateLimitError(t *testing.T) {
 				allowed: false,
 				retryIn: 20 * time.Second,
 				transaction: Transaction{
-					limit: limit{
+					limit: &limit{
 						name:   CertificatesPerDomainPerAccount,
 						Burst:  3,
 						Period: config.Duration{Duration: time.Hour},
@@ -596,7 +596,7 @@ func TestRateLimitError(t *testing.T) {
 				allowed: false,
 				retryIn: 30 * time.Second,
 				transaction: Transaction{
-					limit: limit{
+					limit: &limit{
 						name: 9999999,
 					},
 				},
diff --git a/ratelimits/transaction.go b/ratelimits/transaction.go
index fe3afb0a466..fc5df72c547 100644
--- a/ratelimits/transaction.go
+++ b/ratelimits/transaction.go
@@ -107,7 +107,7 @@ func newFQDNSetBucketKey(name Name, orderNames []string) (string, error) { //nol
 // it would fail validateTransaction (for instance because cost and burst are zero).
 type Transaction struct {
 	bucketKey string
-	limit     limit
+	limit     *limit
 	cost      int64
 	check     bool
 	spend     bool
@@ -143,7 +143,7 @@ func validateTransaction(txn Transaction) (Transaction, error) {
 	return txn, nil
 }
 
-func newTransaction(limit limit, bucketKey string, cost int64) (Transaction, error) {
+func newTransaction(limit *limit, bucketKey string, cost int64) (Transaction, error) {
 	return validateTransaction(Transaction{
 		bucketKey: bucketKey,
 		limit:     limit,
@@ -153,7 +153,7 @@ func newTransaction(limit limit, bucketKey string, cost int64) (Transaction, err
 	})
 }
 
-func newCheckOnlyTransaction(limit limit, bucketKey string, cost int64) (Transaction, error) {
+func newCheckOnlyTransaction(limit *limit, bucketKey string, cost int64) (Transaction, error) {
 	return validateTransaction(Transaction{
 		bucketKey: bucketKey,
 		limit:     limit,
@@ -162,7 +162,7 @@ func newCheckOnlyTransaction(limit limit, bucketKey string, cost int64) (Transac
 	})
 }
 
-func newSpendOnlyTransaction(limit limit, bucketKey string, cost int64) (Transaction, error) {
+func newSpendOnlyTransaction(limit *limit, bucketKey string, cost int64) (Transaction, error) {
 	return validateTransaction(Transaction{
 		bucketKey: bucketKey,
 		limit:     limit,