-
-
Notifications
You must be signed in to change notification settings - Fork 610
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ratelimit: Overhaul metrics for the our existing rate limits #7054
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,6 +48,7 @@ import ( | |
pubpb "github.com/letsencrypt/boulder/publisher/proto" | ||
rapb "github.com/letsencrypt/boulder/ra/proto" | ||
"github.com/letsencrypt/boulder/ratelimit" | ||
"github.com/letsencrypt/boulder/ratelimits" | ||
"github.com/letsencrypt/boulder/reloader" | ||
"github.com/letsencrypt/boulder/revocation" | ||
sapb "github.com/letsencrypt/boulder/sa/proto" | ||
|
@@ -108,9 +109,9 @@ type RegistrationAuthorityImpl struct { | |
ctpolicy *ctpolicy.CTPolicy | ||
|
||
ctpolicyResults *prometheus.HistogramVec | ||
rateLimitCounter *prometheus.CounterVec | ||
revocationReasonCounter *prometheus.CounterVec | ||
namesPerCert *prometheus.HistogramVec | ||
rlCheckLatency *prometheus.HistogramVec | ||
newRegCounter prometheus.Counter | ||
recheckCAACounter prometheus.Counter | ||
newCertCounter prometheus.Counter | ||
|
@@ -161,11 +162,11 @@ func NewRegistrationAuthorityImpl( | |
) | ||
stats.MustRegister(namesPerCert) | ||
|
||
rateLimitCounter := prometheus.NewCounterVec(prometheus.CounterOpts{ | ||
Name: "ra_ratelimits", | ||
Help: "A counter of RA ratelimit checks labelled by type and pass/exceed", | ||
}, []string{"limit", "result"}) | ||
stats.MustRegister(rateLimitCounter) | ||
rlCheckLatency := prometheus.NewHistogramVec(prometheus.HistogramOpts{ | ||
Name: "ratelimitsv1_check_latency_seconds", | ||
Help: fmt.Sprintf("Latency of ratelimit checks labeled by limit=[name] and decision=[%s|%s], in seconds", ratelimits.Allowed, ratelimits.Denied), | ||
}, []string{"limit", "decision"}) | ||
stats.MustRegister(rlCheckLatency) | ||
|
||
newRegCounter := prometheus.NewCounter(prometheus.CounterOpts{ | ||
Name: "new_registrations", | ||
|
@@ -253,7 +254,7 @@ func NewRegistrationAuthorityImpl( | |
issuersByNameID: issuersByNameID, | ||
issuersByID: issuersByID, | ||
namesPerCert: namesPerCert, | ||
rateLimitCounter: rateLimitCounter, | ||
rlCheckLatency: rlCheckLatency, | ||
newRegCounter: newRegCounter, | ||
recheckCAACounter: recheckCAACounter, | ||
newCertCounter: newCertCounter, | ||
|
@@ -366,10 +367,6 @@ type registrationCounter func(context.Context, *sapb.CountRegistrationsByIPReque | |
// provided registrationCounter function to determine if the limit has been | ||
// exceeded for a given IP or IP range | ||
func (ra *RegistrationAuthorityImpl) checkRegistrationIPLimit(ctx context.Context, limit ratelimit.RateLimitPolicy, ip net.IP, counter registrationCounter) error { | ||
if !limit.Enabled() { | ||
return nil | ||
} | ||
|
||
now := ra.clk.Now() | ||
count, err := counter(ctx, &sapb.CountRegistrationsByIPRequest{ | ||
Ip: ip, | ||
|
@@ -395,13 +392,19 @@ func (ra *RegistrationAuthorityImpl) checkRegistrationLimits(ctx context.Context | |
// Check the registrations per IP limit using the CountRegistrationsByIP SA | ||
// function that matches IP addresses exactly | ||
exactRegLimit := ra.rlPolicies.RegistrationsPerIP() | ||
err := ra.checkRegistrationIPLimit(ctx, exactRegLimit, ip, ra.SA.CountRegistrationsByIP) | ||
if err != nil { | ||
ra.rateLimitCounter.WithLabelValues("registrations_by_ip", "exceeded").Inc() | ||
ra.log.Infof("Rate limit exceeded, RegistrationsByIP, IP: %s", ip) | ||
return err | ||
if exactRegLimit.Enabled() { | ||
started := ra.clk.Now() | ||
err := ra.checkRegistrationIPLimit(ctx, exactRegLimit, ip, ra.SA.CountRegistrationsByIP) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIP, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
ra.log.Infof("Rate limit exceeded, RegistrationsPerIP, by IP: %q", ip) | ||
} | ||
return err | ||
pgporada marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIP, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
ra.rateLimitCounter.WithLabelValues("registrations_by_ip", "pass").Inc() | ||
|
||
// We only apply the fuzzy reg limit to IPv6 addresses. | ||
// Per https://golang.org/pkg/net/#IP.To4 "If ip is not an IPv4 address, To4 | ||
|
@@ -414,15 +417,23 @@ func (ra *RegistrationAuthorityImpl) checkRegistrationLimits(ctx context.Context | |
// CountRegistrationsByIPRange SA function that fuzzy-matches IPv6 addresses | ||
// within a larger address range | ||
fuzzyRegLimit := ra.rlPolicies.RegistrationsPerIPRange() | ||
err = ra.checkRegistrationIPLimit(ctx, fuzzyRegLimit, ip, ra.SA.CountRegistrationsByIPRange) | ||
if err != nil { | ||
ra.rateLimitCounter.WithLabelValues("registrations_by_ip_range", "exceeded").Inc() | ||
ra.log.Infof("Rate limit exceeded, RegistrationsByIPRange, IP: %s", ip) | ||
// For the fuzzyRegLimit we use a new error message that specifically | ||
// mentions that the limit being exceeded is applied to a *range* of IPs | ||
return berrors.RateLimitError(0, "too many registrations for this IP range") | ||
if fuzzyRegLimit.Enabled() { | ||
started := ra.clk.Now() | ||
err := ra.checkRegistrationIPLimit(ctx, fuzzyRegLimit, ip, ra.SA.CountRegistrationsByIPRange) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIPRange, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
ra.log.Infof("Rate limit exceeded, RegistrationsByIPRange, IP: %q", ip) | ||
|
||
// For the fuzzyRegLimit we use a new error message that specifically | ||
// mentions that the limit being exceeded is applied to a *range* of IPs | ||
return berrors.RateLimitError(0, "too many registrations for this IP range") | ||
} | ||
return err | ||
beautifulentropy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.RegistrationsPerIPRange, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
ra.rateLimitCounter.WithLabelValues("registrations_by_ip_range", "pass").Inc() | ||
|
||
return nil | ||
} | ||
|
@@ -555,38 +566,33 @@ func (ra *RegistrationAuthorityImpl) validateContacts(contacts []string) error { | |
return nil | ||
} | ||
|
||
func (ra *RegistrationAuthorityImpl) checkPendingAuthorizationLimit(ctx context.Context, regID int64) error { | ||
limit := ra.rlPolicies.PendingAuthorizationsPerAccount() | ||
if limit.Enabled() { | ||
// This rate limit's threshold can only be overridden on a per-regID basis, | ||
// not based on any other key. | ||
threshold := limit.GetThreshold("", regID) | ||
if threshold == -1 { | ||
return nil | ||
} | ||
countPB, err := ra.SA.CountPendingAuthorizations2(ctx, &sapb.RegistrationID{ | ||
Id: regID, | ||
}) | ||
if err != nil { | ||
return err | ||
} | ||
if countPB.Count >= threshold { | ||
ra.rateLimitCounter.WithLabelValues("pending_authorizations_by_registration_id", "exceeded").Inc() | ||
ra.log.Infof("Rate limit exceeded, PendingAuthorizationsByRegID, regID: %d", regID) | ||
return berrors.RateLimitError(0, "too many currently pending authorizations: %d", countPB.Count) | ||
} | ||
ra.rateLimitCounter.WithLabelValues("pending_authorizations_by_registration_id", "pass").Inc() | ||
func (ra *RegistrationAuthorityImpl) checkPendingAuthorizationLimit(ctx context.Context, regID int64, limit ratelimit.RateLimitPolicy) error { | ||
// This rate limit's threshold can only be overridden on a per-regID basis, | ||
// not based on any other key. | ||
threshold := limit.GetThreshold("", regID) | ||
if threshold == -1 { | ||
return nil | ||
} | ||
countPB, err := ra.SA.CountPendingAuthorizations2(ctx, &sapb.RegistrationID{ | ||
Id: regID, | ||
}) | ||
if err != nil { | ||
return err | ||
} | ||
if countPB.Count >= threshold { | ||
ra.log.Infof("Rate limit exceeded, PendingAuthorizationsByRegID, regID: %d", regID) | ||
return berrors.RateLimitError(0, "too many currently pending authorizations: %d", countPB.Count) | ||
} | ||
return nil | ||
} | ||
|
||
// checkInvalidAuthorizationLimits checks the failed validation limit for each | ||
// of the provided hostnames. It returns the first error. | ||
func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimits(ctx context.Context, regID int64, hostnames []string) error { | ||
func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimits(ctx context.Context, regID int64, hostnames []string, limits ratelimit.RateLimitPolicy) error { | ||
results := make(chan error, len(hostnames)) | ||
for _, hostname := range hostnames { | ||
go func(hostname string) { | ||
results <- ra.checkInvalidAuthorizationLimit(ctx, regID, hostname) | ||
results <- ra.checkInvalidAuthorizationLimit(ctx, regID, hostname, limits) | ||
}(hostname) | ||
} | ||
// We don't have to wait for all of the goroutines to finish because there's | ||
|
@@ -601,11 +607,7 @@ func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimits(ctx context | |
return nil | ||
} | ||
|
||
func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context.Context, regID int64, hostname string) error { | ||
limit := ra.rlPolicies.InvalidAuthorizationsPerAccount() | ||
if !limit.Enabled() { | ||
return nil | ||
} | ||
func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context.Context, regID int64, hostname string, limit ratelimit.RateLimitPolicy) error { | ||
latest := ra.clk.Now().Add(ra.pendingAuthorizationLifetime) | ||
earliest := latest.Add(-limit.Window.Duration) | ||
req := &sapb.CountInvalidAuthorizationsRequest{ | ||
|
@@ -633,11 +635,7 @@ func (ra *RegistrationAuthorityImpl) checkInvalidAuthorizationLimit(ctx context. | |
// checkNewOrdersPerAccountLimit enforces the rlPolicies `NewOrdersPerAccount` | ||
// rate limit. This rate limit ensures a client can not create more than the | ||
// specified threshold of new orders within the specified time window. | ||
func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.Context, acctID int64) error { | ||
limit := ra.rlPolicies.NewOrdersPerAccount() | ||
if !limit.Enabled() { | ||
return nil | ||
} | ||
func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.Context, acctID int64, limit ratelimit.RateLimitPolicy) error { | ||
now := ra.clk.Now() | ||
count, err := ra.SA.CountOrders(ctx, &sapb.CountOrdersRequest{ | ||
AccountID: acctID, | ||
|
@@ -652,10 +650,8 @@ func (ra *RegistrationAuthorityImpl) checkNewOrdersPerAccountLimit(ctx context.C | |
// There is no meaningful override key to use for this rate limit | ||
noKey := "" | ||
if count.Count >= limit.GetThreshold(noKey, acctID) { | ||
ra.rateLimitCounter.WithLabelValues("new_order_by_registration_id", "exceeded").Inc() | ||
return berrors.RateLimitError(0, "too many new orders recently") | ||
} | ||
ra.rateLimitCounter.WithLabelValues("new_order_by_registration_id", "pass").Inc() | ||
return nil | ||
} | ||
|
||
|
@@ -1410,7 +1406,6 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerNameLimit(ctx context.C | |
return fmt.Errorf("checking renewal exemption for %q: %s", names, err) | ||
} | ||
if exists.Exists { | ||
ra.rateLimitCounter.WithLabelValues("certificates_for_domain", "FQDN set bypass").Inc() | ||
return nil | ||
} | ||
|
||
|
@@ -1428,7 +1423,6 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerNameLimit(ctx context.C | |
retryString := earliest.Add(limit.Window.Duration).Format(time.RFC3339) | ||
|
||
ra.log.Infof("Rate limit exceeded, CertificatesForDomain, regID: %d, domains: %s", regID, strings.Join(namesOutOfLimit, ", ")) | ||
ra.rateLimitCounter.WithLabelValues("certificates_for_domain", "exceeded").Inc() | ||
if len(namesOutOfLimit) > 1 { | ||
var subErrors []berrors.SubBoulderError | ||
for _, name := range namesOutOfLimit { | ||
|
@@ -1441,7 +1435,6 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerNameLimit(ctx context.C | |
} | ||
return berrors.RateLimitError(retryAfter, "too many certificates already issued for %q. Retry after %s", namesOutOfLimit[0], retryString) | ||
} | ||
ra.rateLimitCounter.WithLabelValues("certificates_for_domain", "pass").Inc() | ||
|
||
return nil | ||
} | ||
|
@@ -1490,40 +1483,75 @@ func (ra *RegistrationAuthorityImpl) checkCertificatesPerFQDNSetLimit(ctx contex | |
} | ||
} | ||
|
||
func (ra *RegistrationAuthorityImpl) checkLimits(ctx context.Context, names []string, regID int64) error { | ||
// Check if there is rate limit space for a new order within the current window. | ||
err := ra.checkNewOrdersPerAccountLimit(ctx, regID) | ||
if err != nil { | ||
return err | ||
func (ra *RegistrationAuthorityImpl) checkNewOrderLimits(ctx context.Context, names []string, regID int64) error { | ||
newOrdersPerAccountLimits := ra.rlPolicies.NewOrdersPerAccount() | ||
if newOrdersPerAccountLimits.Enabled() { | ||
started := ra.clk.Now() | ||
err := ra.checkNewOrdersPerAccountLimit(ctx, regID, newOrdersPerAccountLimits) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.NewOrdersPerAccount, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
} | ||
return err | ||
beautifulentropy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.NewOrdersPerAccount, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
|
||
certNameLimits := ra.rlPolicies.CertificatesPerName() | ||
if certNameLimits.Enabled() { | ||
started := ra.clk.Now() | ||
err := ra.checkCertificatesPerNameLimit(ctx, names, certNameLimits, regID) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerName, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
} | ||
return err | ||
beautifulentropy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerName, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
|
||
fqdnFastLimits := ra.rlPolicies.CertificatesPerFQDNSetFast() | ||
if fqdnFastLimits.Enabled() { | ||
err := ra.checkCertificatesPerFQDNSetLimit(ctx, names, fqdnFastLimits, regID) | ||
fqdnLimitsFast := ra.rlPolicies.CertificatesPerFQDNSetFast() | ||
if fqdnLimitsFast.Enabled() { | ||
started := ra.clk.Now() | ||
err := ra.checkCertificatesPerFQDNSetLimit(ctx, names, fqdnLimitsFast, regID) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSetFast, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
} | ||
return err | ||
beautifulentropy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSetFast, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
|
||
fqdnLimits := ra.rlPolicies.CertificatesPerFQDNSet() | ||
if fqdnLimits.Enabled() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not the change for it, but can't we get rid of the not-fast version of the CertsPerFQDNSet limit by now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're still setting a limit and an override for this in our integration and unit tests, so I'm not sure that statement is true. I could certainly dig into it more though. |
||
started := ra.clk.Now() | ||
err := ra.checkCertificatesPerFQDNSetLimit(ctx, names, fqdnLimits, regID) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSet, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
} | ||
return err | ||
beautifulentropy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.CertificatesPerFQDNSet, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
|
||
err = ra.checkInvalidAuthorizationLimits(ctx, regID, names) | ||
if err != nil { | ||
return err | ||
invalidAuthzPerAccountLimits := ra.rlPolicies.InvalidAuthorizationsPerAccount() | ||
if invalidAuthzPerAccountLimits.Enabled() { | ||
started := ra.clk.Now() | ||
err := ra.checkInvalidAuthorizationLimits(ctx, regID, names, invalidAuthzPerAccountLimits) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
} | ||
return err | ||
beautifulentropy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.InvalidAuthorizationsPerAccount, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
|
||
return nil | ||
|
@@ -2373,7 +2401,7 @@ func (ra *RegistrationAuthorityImpl) NewOrder(ctx context.Context, req *rapb.New | |
} | ||
|
||
// Check if there is rate limit space for issuing a certificate. | ||
err = ra.checkLimits(ctx, newOrder.Names, newOrder.RegistrationID) | ||
err = ra.checkNewOrderLimits(ctx, newOrder.Names, newOrder.RegistrationID) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
@@ -2451,9 +2479,18 @@ func (ra *RegistrationAuthorityImpl) NewOrder(ctx context.Context, req *rapb.New | |
// If the order isn't fully authorized we need to check that the client has | ||
// rate limit room for more pending authorizations | ||
if len(missingAuthzNames) > 0 { | ||
err := ra.checkPendingAuthorizationLimit(ctx, newOrder.RegistrationID) | ||
if err != nil { | ||
return nil, err | ||
pendingAuthzLimits := ra.rlPolicies.PendingAuthorizationsPerAccount() | ||
if pendingAuthzLimits.Enabled() { | ||
started := ra.clk.Now() | ||
err := ra.checkPendingAuthorizationLimit(ctx, newOrder.RegistrationID, pendingAuthzLimits) | ||
elapsed := ra.clk.Since(started) | ||
if err != nil { | ||
if errors.Is(err, berrors.RateLimit) { | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, ratelimits.Denied).Observe(elapsed.Seconds()) | ||
} | ||
return nil, err | ||
} | ||
ra.rlCheckLatency.WithLabelValues(ratelimit.PendingAuthorizationsPerAccount, ratelimits.Allowed).Observe(elapsed.Seconds()) | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's intended that rate limits names between v1
ratelimit
and v2ratelimits
packages will be identical. You could add a labelversion="v1"
orversion="v2"
rather than have two separate metrics.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had precisely this idea myself, however:
If I'm wrong on either of these points, please let me know. Generally though, this choice was very-much intentional.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, reading the prometheus docs states that to change the old metrics we'd need to do relabeling and at that point just make a new metric.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Something about this conclusion strikes me as unlikely or surprising. IIRC it's totally possible to set metric datapoints without specifying values for every label they have. And old data points which have the "version" label set will age out of the grafana database at the same speed as old datapoints from this whole metric would.
Most of what grafana/prometheus are talking about when they talk about "relabeling" is transforming the labels on one metric to match the labels on another metric so that you can query both of them at the same time. That's definitely a pain, but I don't think we'd need to do that here. We'd simply remove the "version" label from this code, stop exporting it, and the timeseries database will catch up when the old datapoints eventually fall out.
I think? Maybe I'm totally wrong.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd prefer to leave this alone rather than juggle histogram buckets and labels for v1 and v2 in the same time series. I could also omit v2 when I add the corresponding time-series to key-value rate limits.