diff --git a/director/director.go b/director/director.go index ddf39e967..5e30eac29 100644 --- a/director/director.go +++ b/director/director.go @@ -105,6 +105,10 @@ var ( statUtilsMutex = sync.RWMutex{} ) +func init() { + hookServerAdsCache() +} + func getRedirectURL(reqPath string, ad server_structs.ServerAd, requiresAuth bool) (redirectURL url.URL) { var serverURL url.URL if requiresAuth && ad.AuthURL.String() != "" { diff --git a/director/director_api.go b/director/director_api.go index e01cd3999..f91c16ec8 100644 --- a/director/director_api.go +++ b/director/director_api.go @@ -198,30 +198,28 @@ func LaunchMapMetrics(ctx context.Context, egrp *errgroup.Group) { }) } -func LaunchServerCountMetric(ctx context.Context, egrp *errgroup.Group) { - egrp.Go(func() error { - ticker := time.NewTicker(15 * time.Second) - defer ticker.Stop() +func hookServerAdsCache() { + // Hook into server ads cache + // By hooking into the insertion and eviction events, we can keep track of the number of servers in the director + // The metric is updated based on the server type, server name, and whether the server is from the topology + // At any given moment, the metric represents the number of servers in the director - for { - select { - case <-ctx.Done(): - return nil - case <-ticker.C: - for _, ad := range serverAds.Items() { - serverAd := ad.Value() - metrics.PelicanDirectorServerCount.With(prometheus.Labels{ - "server_name": serverAd.Name, - "server_type": string(serverAd.Type), - "server_url": serverAd.URL.String(), - "server_web_url": serverAd.WebURL.String(), - "server_lat": fmt.Sprintf("%.4f", serverAd.Latitude), - "server_long": fmt.Sprintf("%.4f", serverAd.Longitude), - "from_topology": strconv.FormatBool(serverAd.FromTopology), - }).Inc() - } - } - } + serverAds.OnInsertion(func(ctx context.Context, ad *ttlcache.Item[string, *server_structs.Advertisement]) { + serverAd := ad.Value() + metrics.PelicanDirectorServerCount.With(prometheus.Labels{ + "server_name": serverAd.Name, + "server_type": string(serverAd.Type), + "from_topology": strconv.FormatBool(serverAd.FromTopology), + }).Inc() + }) + + serverAds.OnEviction(func(ctx context.Context, er ttlcache.EvictionReason, ad *ttlcache.Item[string, *server_structs.Advertisement]) { + serverAd := ad.Value() + metrics.PelicanDirectorServerCount.With(prometheus.Labels{ + "server_name": serverAd.Name, + "server_type": string(serverAd.Type), + "from_topology": strconv.FormatBool(serverAd.FromTopology), + }).Dec() }) } diff --git a/launchers/director_serve.go b/launchers/director_serve.go index df7e080cc..44c72b462 100644 --- a/launchers/director_serve.go +++ b/launchers/director_serve.go @@ -45,8 +45,6 @@ func DirectorServe(ctx context.Context, engine *gin.Engine, egrp *errgroup.Group director.LaunchMapMetrics(ctx, egrp) - director.LaunchServerCountMetric(ctx, egrp) - director.ConfigFilterdServers() director.LaunchServerIOQuery(ctx, egrp) diff --git a/metrics/director.go b/metrics/director.go index a1e9ca2df..7be3927a8 100644 --- a/metrics/director.go +++ b/metrics/director.go @@ -82,10 +82,10 @@ var ( Help: "The total stat queries the director issues. The status can be Succeeded, Cancelled, Timeout, Forbidden, or UnknownErr", }, []string{"server_name", "server_url", "server_type", "result", "cached_result"}) // result: see enums for DirectorStatResult - PelicanDirectorServerCount = promauto.NewCounterVec(prometheus.CounterOpts{ + PelicanDirectorServerCount = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "pelican_director_server_count", - Help: "Total number of servers, delineated by pelican/non-pelican and origin/cache", - }, []string{"server_name", "server_type", "server_url", "server_web_url", "server_lat", "server_long", "from_topology"}) + Help: "The number of servers currently recognized by the Director, delineated by pelican/non-pelican and origin/cache", + }, []string{"server_name", "server_type", "from_topology"}) PelicanDirectorClientVersionTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "pelican_director_client_version_total",