Skip to content

Commit

Permalink
ratelimits: Auto pause zombie clients (#7763)
Browse files Browse the repository at this point in the history
- Added a new key-value ratelimit
`FailedAuthorizationsForPausingPerDomainPerAccount` which is incremented
each time a client fails a validation.
- As long as capacity exists in the bucket, a successful validation
attempt will reset the bucket back to full capacity.
- Upon exhausting bucket capacity, the RA will send a gRPC request to the SA
to pause the `account:identifier` pair. Further validation attempts will be
rejected by the [WFE](#7599).
- Added a new feature flag, `AutomaticallyPauseZombieClients`, which
enables automatic pausing of zombie clients in the RA.
- Added a new RA metric `paused_pairs{"paused":[bool],
"repaused":[bool], "grace":[bool]}` to monitor use of this new
functionality.
- Updated `ra_test.go` `initAuthorities` to allow accessing the
`*ratelimits.RedisSource` for checking that the new ratelimit functions
as intended.

Co-authored-by: @pgporada 

Fixes #7738

---------

Co-authored-by: Phil Porada <[email protected]>
Co-authored-by: Phil Porada <[email protected]>
  • Loading branch information
3 people authored Nov 8, 2024
1 parent 2058d98 commit a79a830
Show file tree
Hide file tree
Showing 14 changed files with 598 additions and 84 deletions.
7 changes: 7 additions & 0 deletions features/features.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,13 @@ type Config struct {
// unique "INSERT ... RETURNING" functionality.
InsertAuthzsIndividually bool

// AutomaticallyPauseZombieClients configures the RA to use the rate limiter
// to automatically track and pause clients who systematically fail every
// validation attempt. When disabled, only manually paused
// accountID:identifier pairs will be rejected.
AutomaticallyPauseZombieClients bool

// IncrementRateLimits uses Redis' IncrBy, instead of Set, for rate limit
// accounting. This catches and denies spikes of requests much more
// reliably.
Expand Down
71 changes: 66 additions & 5 deletions ra/ra.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ type RegistrationAuthorityImpl struct {
orderAges *prometheus.HistogramVec
inflightFinalizes prometheus.Gauge
certCSRMismatch prometheus.Counter
pauseCounter *prometheus.CounterVec
}

var _ rapb.RegistrationAuthorityServer = (*RegistrationAuthorityImpl)(nil)
Expand Down Expand Up @@ -241,6 +242,12 @@ func NewRegistrationAuthorityImpl(
})
stats.MustRegister(certCSRMismatch)

pauseCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "paused_pairs",
Help: "Number of times a pause operation is performed, labeled by paused=[bool], repaused=[bool], grace=[bool]",
}, []string{"paused", "repaused", "grace"})
stats.MustRegister(pauseCounter)

issuersByNameID := make(map[issuance.NameID]*issuance.Certificate)
for _, issuer := range issuers {
issuersByNameID[issuer.NameID()] = issuer
Expand Down Expand Up @@ -276,6 +283,7 @@ func NewRegistrationAuthorityImpl(
orderAges: orderAges,
inflightFinalizes: inflightFinalizes,
certCSRMismatch: certCSRMismatch,
pauseCounter: pauseCounter,
}
return ra
}
Expand Down Expand Up @@ -1810,15 +1818,17 @@ func (ra *RegistrationAuthorityImpl) recordValidation(ctx context.Context, authI
}

// countFailedValidation increments the failed authorizations per domain per
// account rate limit. There is no reason to surface errors from this function
// to the Subscriber, spends against this limit are best effort.
func (ra *RegistrationAuthorityImpl) countFailedValidation(ctx context.Context, regId int64, name string) {
// account rate limit. If the AutomaticallyPauseZombieClients feature has been
// enabled, it also increments the failed authorizations for pausing per domain
// per account rate limit. There is no reason to surface errors from this
// function to the Subscriber, spends against this limit are best effort.
func (ra *RegistrationAuthorityImpl) countFailedValidation(ctx context.Context, regId int64, ident identifier.ACMEIdentifier) {
if ra.limiter == nil || ra.txnBuilder == nil {
// Limiter is disabled.
return
}

txn, err := ra.txnBuilder.FailedAuthorizationsPerDomainPerAccountSpendOnlyTransaction(regId, name)
txn, err := ra.txnBuilder.FailedAuthorizationsPerDomainPerAccountSpendOnlyTransaction(regId, ident.Value)
if err != nil {
ra.log.Warningf("building rate limit transaction for the %s rate limit: %s", ratelimits.FailedAuthorizationsPerDomainPerAccount, err)
}
Expand All @@ -1830,6 +1840,54 @@ func (ra *RegistrationAuthorityImpl) countFailedValidation(ctx context.Context,
}
ra.log.Warningf("spending against the %s rate limit: %s", ratelimits.FailedAuthorizationsPerDomainPerAccount, err)
}

if features.Get().AutomaticallyPauseZombieClients {
txn, err = ra.txnBuilder.FailedAuthorizationsForPausingPerDomainPerAccountTransaction(regId, ident.Value)
if err != nil {
ra.log.Warningf("building rate limit transaction for the %s rate limit: %s", ratelimits.FailedAuthorizationsForPausingPerDomainPerAccount, err)
}

decision, err := ra.limiter.Spend(ctx, txn)
if err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return
}
ra.log.Warningf("spending against the %s rate limit: %s", ratelimits.FailedAuthorizationsForPausingPerDomainPerAccount, err)
}

if decision.Result(ra.clk.Now()) != nil {
resp, err := ra.SA.PauseIdentifiers(ctx, &sapb.PauseRequest{
RegistrationID: regId,
Identifiers: []*corepb.Identifier{
{
Type: string(ident.Type),
Value: ident.Value,
},
},
})
if err != nil {
ra.log.Warningf("failed to pause %d/%q: %s", regId, ident.Value, err)
}
ra.pauseCounter.With(prometheus.Labels{
"paused": strconv.FormatBool(resp.Paused > 0),
"repaused": strconv.FormatBool(resp.Repaused > 0),
"grace": strconv.FormatBool(resp.Paused <= 0 && resp.Repaused <= 0),
}).Inc()
}
}
}

// resetAccountPausingLimit resets the
// FailedAuthorizationsForPausingPerDomainPerAccount bucket for the given
// account and identifier. It is best effort: failures are logged as warnings
// and there is no reason to surface them to the Subscriber.
func (ra *RegistrationAuthorityImpl) resetAccountPausingLimit(ctx context.Context, regId int64, ident identifier.ACMEIdentifier) {
	if ra.limiter == nil {
		// Limiter is disabled; there is no bucket to reset.
		return
	}

	bucketKey, err := ratelimits.NewRegIdDomainBucketKey(ratelimits.FailedAuthorizationsForPausingPerDomainPerAccount, regId, ident.Value)
	if err != nil {
		ra.log.Warningf("creating bucket key for regID=[%d] identifier=[%s]: %s", regId, ident.Value, err)
		// Without a valid bucket key a reset would target the wrong (or an
		// empty) key, so stop here instead of calling Reset.
		return
	}

	err = ra.limiter.Reset(ctx, bucketKey)
	if err != nil {
		ra.log.Warningf("resetting bucket for regID=[%d] identifier=[%s]: %s", regId, ident.Value, err)
	}
}

// PerformValidation initiates validation for a specific challenge associated
Expand Down Expand Up @@ -1953,9 +2011,12 @@ func (ra *RegistrationAuthorityImpl) PerformValidation(
if prob != nil {
challenge.Status = core.StatusInvalid
challenge.Error = prob
go ra.countFailedValidation(vaCtx, authz.RegistrationID, authz.Identifier.Value)
go ra.countFailedValidation(vaCtx, authz.RegistrationID, authz.Identifier)
} else {
challenge.Status = core.StatusValid
if features.Get().AutomaticallyPauseZombieClients {
ra.resetAccountPausingLimit(vaCtx, authz.RegistrationID, authz.Identifier)
}
}
challenge.Validated = &vStart
authz.Challenges[challIndex] = *challenge
Expand Down
Loading

0 comments on commit a79a830

Please sign in to comment.