Skip to content

Commit

Permalink
VA: Consolidate multiple metrics into one histogram (#7816)
Browse files Browse the repository at this point in the history
- Add a new histogram, validationLatency
- Add a VA.observeLatency for observing validation latency
- Refactor to ensure this metric can be observed exclusively within
VA.PerformValidation and VA.IsCAAValid.
- Replace validationTime, localValidationTime, remoteValidationTime,
remoteValidationFailures, caaCheckTime, localCAACheckTime,
remoteCAACheckTime, and remoteCAACheckFailures with validationLatency
  • Loading branch information
beautifulentropy authored Nov 15, 2024
1 parent f9fb688 commit 3baac6f
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 165 deletions.
79 changes: 36 additions & 43 deletions va/caa.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"time"

"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"

"github.com/letsencrypt/boulder/bdns"
"github.com/letsencrypt/boulder/canceled"
Expand Down Expand Up @@ -42,19 +41,46 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
Requester: req.AccountURIID,
Hostname: req.Domain,
}
checkStartTime := va.clk.Now()

validationMethod := core.AcmeChallenge(req.ValidationMethod)
if !validationMethod.IsValid() {
challType := core.AcmeChallenge(req.ValidationMethod)
if !challType.IsValid() {
return nil, berrors.InternalServerError("unrecognized validation method %q", req.ValidationMethod)
}

acmeID := identifier.NewDNS(req.Domain)
params := &caaParams{
accountURIID: req.AccountURIID,
validationMethod: validationMethod,
validationMethod: challType,
}

var prob *probs.ProblemDetails
var internalErr error
var localLatency time.Duration
start := va.clk.Now()

defer func() {
probType := ""
outcome := fail
if prob != nil {
// CAA check failed.
probType = string(prob.Type)
logEvent.Error = prob.Error()
} else {
// CAA check passed.
outcome = pass
}
// Observe local check latency (primary|remote).
va.observeLatency(opCAA, va.perspective, string(challType), probType, outcome, localLatency)
if va.isPrimaryVA() {
// Observe total check latency (primary+remote).
va.observeLatency(opCAA, allPerspectives, string(challType), probType, outcome, va.clk.Since(start))
}
// Log the total check latency.
logEvent.ValidationLatency = va.clk.Since(start).Round(time.Millisecond).Seconds()

va.log.AuditObject("CAA check result", logEvent)
}()

var remoteCAAResults chan *remoteVAResult
if features.Get().EnforceMultiCAA {
if remoteVACount := len(va.remoteVAs); remoteVACount > 0 {
Expand All @@ -63,16 +89,10 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
}
}

checkResult := "success"
err := va.checkCAA(ctx, acmeID, params)
localCheckLatency := time.Since(checkStartTime)
var prob *probs.ProblemDetails
if err != nil {
prob = detailedError(err)
logEvent.Error = prob.Error()
logEvent.InternalError = err.Error()
internalErr = va.checkCAA(ctx, acmeID, params)
if internalErr != nil {
prob = detailedError(internalErr)
prob.Detail = fmt.Sprintf("While processing CAA for %s: %s", req.Domain, prob.Detail)
checkResult = "failure"
} else if remoteCAAResults != nil {
if !features.Get().EnforceMultiCAA && features.Get().MultiCAAFullResults {
// If we're not going to enforce multi CAA but we are logging the
Expand All @@ -82,40 +102,24 @@ func (va *ValidationAuthorityImpl) IsCAAValid(ctx context.Context, req *vapb.IsC
_ = va.processRemoteCAAResults(
req.Domain,
req.AccountURIID,
string(validationMethod),
string(challType),
remoteCAAResults)
}()
} else if features.Get().EnforceMultiCAA {
remoteProb := va.processRemoteCAAResults(
req.Domain,
req.AccountURIID,
string(validationMethod),
string(challType),
remoteCAAResults)

// If the remote result was a non-nil problem then fail the CAA check
if remoteProb != nil {
prob = remoteProb
// We only set .Error here, not InternalError, because the remote VA doesn't send
// us the internal error. But that's okay, because it got logged at the remote VA.
logEvent.Error = remoteProb.Error()
checkResult = "failure"
va.log.Infof("CAA check failed due to remote failures: identifier=%v err=%s",
req.Domain, remoteProb)
va.metrics.remoteCAACheckFailures.Inc()
}
}
}
checkLatency := time.Since(checkStartTime)
logEvent.ValidationLatency = checkLatency.Round(time.Millisecond).Seconds()

va.metrics.localCAACheckTime.With(prometheus.Labels{
"result": checkResult,
}).Observe(localCheckLatency.Seconds())
va.metrics.caaCheckTime.With(prometheus.Labels{
"result": checkResult,
}).Observe(checkLatency.Seconds())

va.log.AuditObject("CAA check result", logEvent)

if prob != nil {
// The ProblemDetails will be serialized through gRPC, which requires UTF-8.
Expand Down Expand Up @@ -154,15 +158,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
challengeType string,
remoteResultsChan <-chan *remoteVAResult) *probs.ProblemDetails {

state := "failure"
start := va.clk.Now()

defer func() {
va.metrics.remoteCAACheckTime.With(prometheus.Labels{
"result": state,
}).Observe(va.clk.Since(start).Seconds())
}()

required := len(va.remoteVAs) - va.maxRemoteFailures
good := 0
bad := 0
Expand Down Expand Up @@ -190,7 +185,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(
// success or failure threshold is met.
if !features.Get().MultiCAAFullResults {
if good >= required {
state = "success"
return nil
} else if bad > va.maxRemoteFailures {
modifiedProblem := *result.Problem
Expand All @@ -217,7 +211,6 @@ func (va *ValidationAuthorityImpl) processRemoteCAAResults(

// Based on the threshold of good/bad return nil or a problem.
if good >= required {
state = "success"
return nil
} else if bad > va.maxRemoteFailures {
modifiedProblem := *firstProb
Expand Down
55 changes: 55 additions & 0 deletions va/caa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"testing"

"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"

"github.com/letsencrypt/boulder/bdns"
"github.com/letsencrypt/boulder/core"
Expand Down Expand Up @@ -676,6 +677,7 @@ func TestMultiCAARechecking(t *testing.T) {
expectedProbSubstring string
expectedProbType probs.ProblemType
expectedDiffLogSubstring string
expectedLabels prometheus.Labels
localDNSClient bdns.Client
}{
{
Expand All @@ -687,6 +689,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": "",
"result": pass,
},
},
{
name: "broken localVA, RVAs functional, no CAA records",
Expand All @@ -699,6 +708,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "functional localVA, 1 broken RVA, no CAA records",
Expand All @@ -712,6 +728,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "functional localVA, all broken RVAs, no CAA records",
Expand All @@ -725,6 +748,13 @@ func TestMultiCAARechecking(t *testing.T) {
{brokenVA, brokenUA},
{brokenVA, brokenUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "all VAs functional, CAA issue type present",
Expand All @@ -735,6 +765,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": "",
"result": pass,
},
},
{
name: "functional localVA, 1 broken RVA, CAA issue type present",
Expand All @@ -748,6 +785,13 @@ func TestMultiCAARechecking(t *testing.T) {
{remoteVA, remoteUA},
{remoteVA, remoteUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
name: "functional localVA, all broken RVAs, CAA issue type present",
Expand All @@ -761,6 +805,13 @@ func TestMultiCAARechecking(t *testing.T) {
{brokenVA, brokenUA},
{brokenVA, brokenUA},
},
expectedLabels: prometheus.Labels{
"operation": opCAA,
"perspective": allPerspectives,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.DNSProblem),
"result": fail,
},
},
{
// The localVA kicks off the background goroutines before doing its
Expand Down Expand Up @@ -954,6 +1005,10 @@ func TestMultiCAARechecking(t *testing.T) {
} else {
test.AssertEquals(t, len(gotAnyRemoteFailures), 0)
}

if tc.expectedLabels != nil {
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, tc.expectedLabels, 1)
}
})
}
}
Expand Down
10 changes: 6 additions & 4 deletions va/dns_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ func TestDNSValidationEmpty(t *testing.T) {
test.AssertEquals(t, res.Problems.ProblemType, "unauthorized")
test.AssertEquals(t, res.Problems.Detail, "No TXT record found at _acme-challenge.empty-txts.com")

test.AssertMetricWithLabelsEquals(t, va.metrics.validationTime, prometheus.Labels{
"type": "dns-01",
"result": "invalid",
"problem_type": "unauthorized",
test.AssertMetricWithLabelsEquals(t, va.metrics.validationLatency, prometheus.Labels{
"operation": opChallAndCAA,
"perspective": PrimaryPerspective,
"challenge_type": string(core.ChallengeTypeDNS01),
"problem_type": string(probs.UnauthorizedProblem),
"result": fail,
}, 1)
}

Expand Down
Loading

0 comments on commit 3baac6f

Please sign in to comment.