From db020fcc0da491d349c09e21693b3c932114772a Mon Sep 17 00:00:00 2001 From: Nico D'Cotta <45274424+cottand@users.noreply.github.com> Date: Sun, 10 Nov 2024 16:26:43 +0000 Subject: [PATCH] metrics: histogram for upstream resolve duration (#71) adds metric `leng_upstream_request_duration_*` metrics to keep track of duration of upstream resolving --- config.go | 5 +++++ grimd_test.go | 2 +- internal/metric/metric.go | 26 +++++++++++++++++++++++++- main.go | 6 +++++- resolver.go | 10 +++++++--- 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/config.go b/config.go index 14e2d82..59fbfae 100644 --- a/config.go +++ b/config.go @@ -62,6 +62,7 @@ type Metrics struct { Path string HighCardinalityEnabled bool ResetPeriodMinutes int64 + HistogramsEnabled bool } type DnsOverHttpServer struct { @@ -112,6 +113,9 @@ interval = 200 # question cache capacity, 0 for infinite but not recommended (this is used for storing logs) questioncachecap = 5000 +# timeout for upstream DNS queries, in ms +timeout = 5000 + # manual whitelist entries - comments for reference whitelist = [ # "getsentry.com", @@ -170,6 +174,7 @@ followCnameDepth = 12 path = "/metrics" # see https://cottand.github.io/leng/Prometheus-Metrics.html highCardinalityEnabled = false + histogramsEnabled = false resetPeriodMinutes = 60 [DnsOverHttpServer] diff --git a/grimd_test.go b/grimd_test.go index 5eed9cb..3b04902 100644 --- a/grimd_test.go +++ b/grimd_test.go @@ -37,7 +37,7 @@ func integrationTest(changeConfig func(c *Config), test func(client *dns.Client, changeConfig(&config) - cancelMetrics := metric.Start(config.Metrics.ResetPeriodMinutes, config.Metrics.HighCardinalityEnabled) + cancelMetrics := metric.Start(config.Metrics.ResetPeriodMinutes, config.Metrics.HighCardinalityEnabled, false) defer cancelMetrics() quitActivation := make(chan bool) actChannel := make(chan *ActivationHandler) diff --git a/internal/metric/metric.go b/internal/metric/metric.go index 60f5fb3..51148f9 100644 --- a/internal/metric/metric.go +++ b/internal/metric/metric.go @@ -57,6 +57,16 @@ var ( Name: "upstream_request_doh", Help: "Upstream DoH requests - only works when DoH configured", }, []string{"success"}) + + RequestUpstreamResolveDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: Namespace, + Name: "upstream_request_duration", + Help: "Upstream requests duration in seconds, by request type", + Buckets: []float64{0.0001, 0.0025, .005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}, + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 32, + }, []string{"upstream_type"}) + CustomDNSConfigReload = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: Namespace, @@ -79,6 +89,7 @@ var ( } configHighCardinality = false + configHistograms = false ) func init() { @@ -92,8 +103,12 @@ func init() { ) } -func Start(resetPeriodMinutes int64, highCardinality bool) (closeChan context.CancelFunc) { +func Start(resetPeriodMinutes int64, highCardinality bool, histogramsEnabled bool) (closeChan context.CancelFunc) { configHighCardinality = highCardinality + if histogramsEnabled { + prometheus.MustRegister(RequestUpstreamResolveDuration) + configHistograms = true + } ctx, cancel := context.WithCancel(context.Background()) mark := time.Now() @@ -161,3 +176,12 @@ func ReportDNSRespond(remote net.IP, message *dns.Msg, blocked bool, cached bool cachedResponseCounter.Inc() } } + +func ReportUpstreamResolve(upstreamType string, duration time.Duration) { + if !configHistograms { + return + } + RequestUpstreamResolveDuration. + With(prometheus.Labels{"upstream_type": upstreamType}). + Observe(duration.Seconds()) +} diff --git a/main.go b/main.go index e3c0047..2d80838 100644 --- a/main.go +++ b/main.go @@ -59,7 +59,11 @@ func main() { loggingState.cleanUp() }() - cancelMetrics := metric.Start(config.Metrics.ResetPeriodMinutes, config.Metrics.HighCardinalityEnabled) + cancelMetrics := metric.Start( + config.Metrics.ResetPeriodMinutes, + config.Metrics.HighCardinalityEnabled, + config.Metrics.HistogramsEnabled, + ) lengActive = true quitActivation := make(chan bool) diff --git a/resolver.go b/resolver.go index 6e8cbf1..a9d326c 100644 --- a/resolver.go +++ b/resolver.go @@ -40,7 +40,7 @@ type Resolver struct { // Lookup will ask each nameserver in top-to-bottom fashion, starting a new request // in every second, and return as early as possible (have an answer). // It returns an error if no request has succeeded. -func (r *Resolver) Lookup(net string, req *dns.Msg, timeout int, interval int, nameServers []string, DoH string) (message *dns.Msg, err error) { +func (resolver *Resolver) Lookup(net string, req *dns.Msg, timeout int, interval int, nameServers []string, DoH string) (message *dns.Msg, err error) { logger.Debugf("Lookup %s, timeout: %d, interval: %d, nameservers: %v, Using DoH: %v", net, timeout, interval, nameServers, DoH != "") question := req.Question[0] @@ -50,10 +50,13 @@ func (r *Resolver) Lookup(net string, req *dns.Msg, timeout int, interval int, n "q_name": question.Name, }) + mark := time.Now() + //Is DoH enabled if DoH != "" { //First try and use DOH. Privacy First - ans, err := r.DoHLookup(DoH, timeout, req) + ans, err := resolver.DoHLookup(DoH, timeout, req) + metric.ReportUpstreamResolve("doh", time.Since(mark)) if err == nil { // No error so result is ok metricUpstreamResolveCounter.With( @@ -129,6 +132,7 @@ func (r *Resolver) Lookup(net string, req *dns.Msg, timeout int, interval int, n "rcode": dns.RcodeToString[r.answer.Rcode], "upstream": r.nameserver, }).Inc() + metric.ReportUpstreamResolve(net, time.Since(mark)) return r.answer, nil default: return nil, ResolvError{qname, net, nameServers} @@ -136,7 +140,7 @@ func (r *Resolver) Lookup(net string, req *dns.Msg, timeout int, interval int, n } // DoHLookup performs a DNS lookup over https -func (r *Resolver) DoHLookup(url string, timeout int, req *dns.Msg) (msg *dns.Msg, err error) { +func (resolver *Resolver) DoHLookup(url string, timeout int, req *dns.Msg) (msg *dns.Msg, err error) { qname := req.Question[0].Name defer func() {