Skip to content

Commit

Permalink
ratelimit: Overhaul the way we observe existing rate limits
Browse files Browse the repository at this point in the history
  • Loading branch information
beautifulentropy committed Aug 28, 2023
1 parent 8deb67f commit d2ef470
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 81 deletions.
202 changes: 124 additions & 78 deletions ra/ra.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ import (
"github.com/letsencrypt/boulder/web"
)

// Values of the 'decision' label attached to rate limit metrics.
const (
	// allowed marks a rate limit check in which the request was permitted.
	allowed = "allowed"

	// denied marks a rate limit check in which the request was refused.
	denied = "denied"
)

var (
errIncompleteGRPCRequest = errors.New("incomplete gRPC request message")
errIncompleteGRPCResponse = errors.New("incomplete gRPC response message")
Expand Down Expand Up @@ -108,9 +118,9 @@ type RegistrationAuthorityImpl struct {
ctpolicy *ctpolicy.CTPolicy

ctpolicyResults *prometheus.HistogramVec
rateLimitCounter *prometheus.CounterVec
revocationReasonCounter *prometheus.CounterVec
namesPerCert *prometheus.HistogramVec
rlCheckLatency *prometheus.HistogramVec
newRegCounter prometheus.Counter
recheckCAACounter prometheus.Counter
newCertCounter prometheus.Counter
Expand Down Expand Up @@ -161,11 +171,11 @@ func NewRegistrationAuthorityImpl(
)
stats.MustRegister(namesPerCert)

rateLimitCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ra_ratelimits",
Help: "A counter of RA ratelimit checks labelled by type and pass/exceed",
}, []string{"limit", "result"})
stats.MustRegister(rateLimitCounter)
rlCheckLatency := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "ratelimitsv1_check_latency_seconds",
Help: fmt.Sprintf("Latency of ratelimit checks labeled by limit=[name] and decision=[%s|%s], in seconds", allowed, denied),
}, []string{"limit", "decision"})
stats.MustRegister(rlCheckLatency)

newRegCounter := prometheus.NewCounter(prometheus.CounterOpts{
Name: "new_registrations",
Expand Down Expand Up @@ -253,7 +263,7 @@ func NewRegistrationAuthorityImpl(
issuersByNameID: issuersByNameID,
issuersByID: issuersByID,
namesPerCert: namesPerCert,
rateLimitCounter: rateLimitCounter,
rlCheckLatency: rlCheckLatency,
newRegCounter: newRegCounter,
recheckCAACounter: recheckCAACounter,
newCertCounter: newCertCounter,
Expand Down Expand Up @@ -366,10 +376,6 @@ type registrationCounter func(context.Context, *sapb.CountRegistrationsByIPReque
// provided registrationCounter function to determine if the limit has been
// exceeded for a given IP or IP range
func (ra *RegistrationAuthorityImpl) checkRegistrationIPLimit(ctx context.Context, limit ratelimit.RateLimitPolicy, ip net.IP, counter registrationCounter) error {
if !limit.Enabled() {
return nil
}

now := ra.clk.Now()
count, err := counter(ctx, &sapb.CountRegistrationsByIPRequest{
Ip: ip,
Expand All @@ -395,13 +401,19 @@ func (ra *RegistrationAuthorityImpl) checkRegistrationLimits(ctx context.Context
// Check the registrations per IP limit using the CountRegistrationsByIP SA
// function that matches IP addresses exactly
exactRegLimit := ra.rlPolicies.RegistrationsPerIP()
err := ra.checkRegistrationIPLimit(ctx, exactRegLimit, ip, ra.SA.CountRegistrationsByIP)
if err != nil {
ra.rateLimitCounter.WithLabelValues("registrations_by_ip", "exceeded").Inc()
ra.log.Infof("Rate limit exceeded, RegistrationsByIP, IP: %s", ip)
return err
if exactRegLimit.Enabled() {
started := ra.clk.Now()
err := ra.checkRegistrationIPLimit(ctx, exactRegLimit, ip, ra.SA.CountRegistrationsByIP)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIP, denied).Observe(elapsed.Seconds())
ra.log.Infof("Rate limit exceeded, RegistrationsPerIP, by IP: %q", ip)
}
return err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIP, allowed).Observe(elapsed.Seconds())
}
ra.rateLimitCounter.WithLabelValues("registrations_by_ip", "pass").Inc()

// We only apply the fuzzy reg limit to IPv6 addresses.
// Per https://golang.org/pkg/net/#IP.To4 "If ip is not an IPv4 address, To4
Expand All @@ -414,15 +426,23 @@ func (ra *RegistrationAuthorityImpl) checkRegistrationLimits(ctx context.Context
// CountRegistrationsByIPRange SA function that fuzzy-matches IPv6 addresses
// within a larger address range
fuzzyRegLimit := ra.rlPolicies.RegistrationsPerIPRange()
err = ra.checkRegistrationIPLimit(ctx, fuzzyRegLimit, ip, ra.SA.CountRegistrationsByIPRange)
if err != nil {
ra.rateLimitCounter.WithLabelValues("registrations_by_ip_range", "exceeded").Inc()
ra.log.Infof("Rate limit exceeded, RegistrationsByIPRange, IP: %s", ip)
// For the fuzzyRegLimit we use a new error message that specifically
// mentions that the limit being exceeded is applied to a *range* of IPs
return berrors.RateLimitError(0, "too many registrations for this IP range")
if fuzzyRegLimit.Enabled() {
started := ra.clk.Now()
err := ra.checkRegistrationIPLimit(ctx, fuzzyRegLimit, ip, ra.SA.CountRegistrationsByIPRange)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIPRange, denied).Observe(elapsed.Seconds())
ra.log.Infof("Rate limit exceeded, RegistrationsByIPRange, IP: %q", ip)

// For the fuzzyRegLimit we use a new error message that specifically
// mentions that the limit being exceeded is applied to a *range* of IPs
return berrors.RateLimitError(0, "too many registrations for this IP range")
}
return err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIPRange, allowed).Observe(elapsed.Seconds())
}
ra.rateLimitCounter.WithLabelValues("registrations_by_ip_range", "pass").Inc()

return nil
}
Expand Down Expand Up @@ -555,38 +575,33 @@ func (ra *RegistrationAuthorityImpl) validateContacts(contacts []string) error {
return nil
}

func (ra *RegistrationAuthorityImpl) checkPendingAuthorizationLimit(ctx context.Context, regID int64) error {
limit := ra.rlPolicies.PendingAuthorizationsPerAccount()
if limit.Enabled() {
// This rate limit's threshold can only be overridden on a per-regID basis,
// not based on any other key.
threshold := limit.GetThreshold("", regID)
if threshold == -1 {
return nil
}
countPB, err := ra.SA.CountPendingAuthorizations2(ctx, &sapb.RegistrationID{
Id: regID,
})
if err != nil {
return err
}
if countPB.Count >= threshold {
ra.rateLimitCounter.WithLabelValues("pending_authorizations_by_registration_id", "exceeded").Inc()
ra.log.Infof("Rate limit exceeded, PendingAuthorizationsByRegID, regID: %d", regID)
return berrors.RateLimitError(0, "too many currently pending authorizations: %d", countPB.Count)
}
ra.rateLimitCounter.WithLabelValues("pending_authorizations_by_registration_id", "pass").Inc()
func (ra *RegistrationAuthorityImpl) checkPendingAuthorizationLimit(ctx context.Context, regID int64, limit ratelimit.RateLimitPolicy) error {
// This rate limit's threshold can only be overridden on a per-regID basis,
// not based on any other key.
threshold := limit.GetThreshold("", regID)
if threshold == -1 {
return nil
}
countPB, err := ra.SA.CountPendingAuthorizations2(ctx, &sapb.RegistrationID{
Id: regID,
})
if err != nil {
return err
}
if countPB.Count >= threshold {
ra.log.Infof("Rate limit exceeded, PendingAuthorizationsByRegID, regID: %d", regID)
return berrors.RateLimitError(0, "too many currently pending authorizations: %d", countPB.Count)
}
return nil
}

// checkInvalidAuthorizationLimits checks the failed validation limit for each
// of the provided hostnames. It returns the first error.
func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimits(ctx context.Context, regID int64, hostnames []string) error {
func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimits(ctx context.Context, regID int64, hostnames []string, limits ratelimit.RateLimitPolicy) error {
results := make(chan error, len(hostnames))
for _, hostname := range hostnames {
go func(hostname string) {
results <- ra.checkInvalidAuthorizationLimit(ctx, regID, hostname)
results <- ra.checkInvalidAuthorizationLimit(ctx, regID, hostname, limits)
}(hostname)
}
// We don't have to wait for all of the goroutines to finish because there's
Expand All @@ -601,11 +616,7 @@ func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimits(ctx context
return nil
}

func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context.Context, regID int64, hostname string) error {
limit := ra.rlPolicies.InvalidAuthorizationsPerAccount()
if !limit.Enabled() {
return nil
}
func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context.Context, regID int64, hostname string, limit ratelimit.RateLimitPolicy) error {
latest := ra.clk.Now().Add(ra.pendingAuthorizationLifetime)
earliest := latest.Add(-limit.Window.Duration)
req := &sapb.CountInvalidAuthorizationsRequest{
Expand Down Expand Up @@ -633,11 +644,7 @@ func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context.
// checkNewOrdersPerAccountLimit enforces the rlPolicies `NewOrdersPerAccount`
// rate limit. This rate limit ensures a client cannot create more than the
// specified threshold of new orders within the specified time window.
func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.Context, acctID int64) error {
limit := ra.rlPolicies.NewOrdersPerAccount()
if !limit.Enabled() {
return nil
}
func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.Context, acctID int64, limit ratelimit.RateLimitPolicy) error {
now := ra.clk.Now()
count, err := ra.SA.CountOrders(ctx, &sapb.CountOrdersRequest{
AccountID: acctID,
Expand All @@ -652,10 +659,8 @@ func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.C
// There is no meaningful override key to use for this rate limit
noKey := ""
if count.Count >= limit.GetThreshold(noKey, acctID) {
ra.rateLimitCounter.WithLabelValues("new_order_by_registration_id", "exceeded").Inc()
return berrors.RateLimitError(0, "too many new orders recently")
}
ra.rateLimitCounter.WithLabelValues("new_order_by_registration_id", "pass").Inc()
return nil
}

Expand Down Expand Up @@ -1410,7 +1415,6 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerNameLimit(ctx context.C
return fmt.Errorf("checking renewal exemption for %q: %s", names, err)
}
if exists.Exists {
ra.rateLimitCounter.WithLabelValues("certificates_for_domain", "FQDN set bypass").Inc()
return nil
}

Expand All @@ -1428,7 +1432,6 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerNameLimit(ctx context.C
retryString := earliest.Add(limit.Window.Duration).Format(time.RFC3339)

ra.log.Infof("Rate limit exceeded, CertificatesForDomain, regID: %d, domains: %s", regID, strings.Join(namesOutOfLimit, ", "))
ra.rateLimitCounter.WithLabelValues("certificates_for_domain", "exceeded").Inc()
if len(namesOutOfLimit) > 1 {
var subErrors []berrors.SubBoulderError
for _, name := range namesOutOfLimit {
Expand All @@ -1441,7 +1444,6 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerNameLimit(ctx context.C
}
return berrors.RateLimitError(retryAfter, "too many certificates already issued for %q. Retry after %s", namesOutOfLimit[0], retryString)
}
ra.rateLimitCounter.WithLabelValues("certificates_for_domain", "pass").Inc()

return nil
}
Expand Down Expand Up @@ -1490,40 +1492,75 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerFQDNSetLimit(ctx contex
}
}

func (ra *RegistrationAuthorityImpl) checkLimits(ctx context.Context, names []string, regID int64) error {
// Check if there is rate limit space for a new order within the current window.
err := ra.checkNewOrdersPerAccountLimit(ctx, regID)
if err != nil {
return err
func (ra *RegistrationAuthorityImpl) checkNewOrderLimits(ctx context.Context, names []string, regID int64) error {
newOrdersPerAccountLimits := ra.rlPolicies.NewOrdersPerAccount()
if newOrdersPerAccountLimits.Enabled() {
started := ra.clk.Now()
err := ra.checkNewOrdersPerAccountLimit(ctx, regID, newOrdersPerAccountLimits)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.NewOrdersPerAccount, denied).Observe(elapsed.Seconds())
}
return err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.NewOrdersPerAccount, allowed).Observe(elapsed.Seconds())
}

certNameLimits := ra.rlPolicies.CertificatesPerName()
if certNameLimits.Enabled() {
started := ra.clk.Now()
err := ra.checkCertificatesPerNameLimit(ctx, names, certNameLimits, regID)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerName, denied).Observe(elapsed.Seconds())
}
return err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerName, allowed).Observe(elapsed.Seconds())
}

fqdnFastLimits := ra.rlPolicies.CertificatesPerFQDNSetFast()
if fqdnFastLimits.Enabled() {
err := ra.checkCertificatesPerFQDNSetLimit(ctx, names, fqdnFastLimits, regID)
fqdnLimitsFast := ra.rlPolicies.CertificatesPerFQDNSetFast()
if fqdnLimitsFast.Enabled() {
started := ra.clk.Now()
err := ra.checkCertificatesPerFQDNSetLimit(ctx, names, fqdnLimitsFast, regID)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSetFast, denied).Observe(elapsed.Seconds())
}
return err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSetFast, allowed).Observe(elapsed.Seconds())
}

fqdnLimits := ra.rlPolicies.CertificatesPerFQDNSet()
if fqdnLimits.Enabled() {
started := ra.clk.Now()
err := ra.checkCertificatesPerFQDNSetLimit(ctx, names, fqdnLimits, regID)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSet, denied).Observe(elapsed.Seconds())
}
return err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSet, allowed).Observe(elapsed.Seconds())
}

err = ra.checkInvalidAuthorizationLimits(ctx, regID, names)
if err != nil {
return err
invalidAuthzPerAccountLimits := ra.rlPolicies.InvalidAuthorizationsPerAccount()
if invalidAuthzPerAccountLimits.Enabled() {
started := ra.clk.Now()
err := ra.checkInvalidAuthorizationLimits(ctx, regID, names, invalidAuthzPerAccountLimits)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, denied).Observe(elapsed.Seconds())
}
return err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, allowed).Observe(elapsed.Seconds())
}

return nil
Expand Down Expand Up @@ -2373,7 +2410,7 @@ func (ra *RegistrationAuthorityImpl) NewOrder(ctx context.Context, req *rapb.New
}

// Check if there is rate limit space for issuing a certificate.
err = ra.checkLimits(ctx, newOrder.Names, newOrder.RegistrationID)
err = ra.checkNewOrderLimits(ctx, newOrder.Names, newOrder.RegistrationID)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -2451,9 +2488,18 @@ func (ra *RegistrationAuthorityImpl) NewOrder(ctx context.Context, req *rapb.New
// If the order isn't fully authorized we need to check that the client has
// rate limit room for more pending authorizations
if len(missingAuthzNames) > 0 {
err := ra.checkPendingAuthorizationLimit(ctx, newOrder.RegistrationID)
if err != nil {
return nil, err
pendingAuthzLimits := ra.rlPolicies.PendingAuthorizationsPerAccount()
if pendingAuthzLimits.Enabled() {
started := ra.clk.Now()
err := ra.checkPendingAuthorizationLimit(ctx, newOrder.RegistrationID, pendingAuthzLimits)
elapsed := ra.clk.Since(started)
if err != nil {
if errors.Is(err, berrors.RateLimit) {
ra.rlCheckLatency.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, denied).Observe(elapsed.Seconds())
}
return nil, err
}
ra.rlCheckLatency.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, allowed).Observe(elapsed.Seconds())
}
}

Expand Down
Loading

0 comments on commit d2ef470

Please sign in to comment.