From 2944dc9246ccfc4128c43c7207f1b07370ae9a0f Mon Sep 17 00:00:00 2001
From: Surya Seetharaman
Date: Tue, 16 Apr 2024 14:48:37 +0200
Subject: [PATCH] (B)ANP: Add rule count metrics

Signed-off-by: Surya Seetharaman
---
 .../pkg/metrics/ovnkube_controller.go         |  34 +++++
 .../admin_network_policy.go                   |   7 +
 .../admin_network_policy_controller.go        |   1 +
 .../baseline_admin_network_policy.go          |   2 +
 .../admin_network_policy/metrics.go           | 126 ++++++++++++++++++
 .../controller/admin_network_policy/utils.go  |  30 +++++
 6 files changed, 200 insertions(+)
 create mode 100644 go-controller/pkg/ovn/controller/admin_network_policy/metrics.go

diff --git a/go-controller/pkg/metrics/ovnkube_controller.go b/go-controller/pkg/metrics/ovnkube_controller.go
index 587031a841..c4bd3252d5 100644
--- a/go-controller/pkg/metrics/ovnkube_controller.go
+++ b/go-controller/pkg/metrics/ovnkube_controller.go
@@ -294,6 +294,28 @@ var metricBANPDBObjects = prometheus.NewGaugeVec(prometheus.GaugeOpts{
 	},
 )
 
+var metricANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+	Namespace: MetricOvnkubeNamespace,
+	Subsystem: MetricOvnkubeSubsystemController,
+	Name:      "admin_network_policy_rules", // doing a sum across all ANPs will give the absolute count in the cluster
+	Help:      "The total number of rules in a given admin network policy in the cluster"},
+	[]string{
+		"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
+		"action",    // action is either "Pass" or "Allow" or "Deny"; so cardinality is max 3 for this label
+	},
+)
+
+var metricBANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+	Namespace: MetricOvnkubeNamespace,
+	Subsystem: MetricOvnkubeSubsystemController,
+	Name:      "baseline_admin_network_policy_rules",
+	Help:      "The total number of rules in a given baseline admin network policy in the cluster"},
+	[]string{
+		"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
+		"action",    // action is either "Allow" or "Deny"; so cardinality is max 2 for this label
+	},
+)
+
 /** AdminNetworkPolicyMetrics End**/
 
 // metricFirstSeenLSPLatency is the time between a pod first seen in OVN-Kubernetes and its Logical Switch Port is created
@@ -432,6 +454,8 @@ func RegisterOVNKubeControllerFunctional() {
 	prometheus.MustRegister(metricEgressRoutingViaHost)
 	prometheus.MustRegister(metricANPCount)
 	prometheus.MustRegister(metricBANPCount)
+	prometheus.MustRegister(metricANPRuleCount)
+	prometheus.MustRegister(metricBANPRuleCount)
 	if err := prometheus.Register(MetricResourceRetryFailuresCount); err != nil {
 		if _, ok := err.(prometheus.AlreadyRegisteredError); !ok {
 			panic(err)
@@ -665,6 +689,16 @@ func DecrementBANPCount() {
 	metricBANPCount.Dec()
 }
 
+// UpdateANPRuleCount records the number of AdminNetworkPolicy rules.
+func UpdateANPRuleCount(direction, action string, count float64) {
+	metricANPRuleCount.WithLabelValues(direction, action).Set(count)
+}
+
+// UpdateBANPRuleCount records the number of BaselineAdminNetworkPolicy rules.
+func UpdateBANPRuleCount(direction, action string, count float64) {
+	metricBANPRuleCount.WithLabelValues(direction, action).Set(count)
+}
+
 type (
 	timestampType int
 	operation     int
diff --git a/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy.go b/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy.go
index db451f6c18..bd955a3e76 100644
--- a/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy.go
+++ b/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy.go
@@ -150,6 +150,7 @@ func (c *Controller) ensureAdminNetworkPolicy(anp *anpapi.AdminNetworkPolicy) er
 		// since transact was successful we can finally populate the cache
 		c.anpCache[anp.Name] = desiredANPState
 		metrics.IncrementANPCount()
+		updateANPRuleCountMetric(desiredANPState, false)
 		return nil
 	}
 	// ANP state existed in the cache, which means its either an ANP update or pod/namespace add/update/delete
@@ -405,6 +406,7 @@ func (c *Controller) clearAdminNetworkPolicy(anpName string) error {
 	if err != nil {
 		return fmt.Errorf("failed to delete address-sets for ANP %s/%d: %w", anp.name, anp.anpPriority, err)
 	}
+	updateANPRuleCountMetric(anp, false)
 	// we can delete the object from the cache now.
 	delete(c.anpPriorityMap, anp.anpPriority)
 	delete(c.anpCache, anpName)
@@ -547,6 +549,11 @@ func (c *Controller) updateExistingANP(currentANPState, desiredANPState *adminNe
 		if err != nil {
 			return fmt.Errorf("failed to create ACL-on-PG update ops for anp %s: %v", desiredANPState.name, err)
 		}
+		if fullPeerRecompute || atLeastOneRuleUpdated {
+			// Rules were inserted or deleted, or the action of an existing rule changed,
+			// so refresh the rule count metrics from the desired state.
+			updateANPRuleCountMetric(desiredANPState, isBanp)
+		}
 	}
 
 	// Did the ANP.Spec.Subject Change?
diff --git a/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy_controller.go b/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy_controller.go
index d138e171da..de668f46ae 100644
--- a/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy_controller.go
+++ b/go-controller/pkg/ovn/controller/admin_network_policy/admin_network_policy_controller.go
@@ -206,6 +206,7 @@ func NewController(
 	// TODO(tssurya): We don't use recorder now but will add events in future iterations
 	c.eventRecorder = recorder
 
+	c.initMetricsCollector()
 	return c, nil
 }
 
diff --git a/go-controller/pkg/ovn/controller/admin_network_policy/baseline_admin_network_policy.go b/go-controller/pkg/ovn/controller/admin_network_policy/baseline_admin_network_policy.go
index a31c9d2a93..16acc3b7e9 100644
--- a/go-controller/pkg/ovn/controller/admin_network_policy/baseline_admin_network_policy.go
+++ b/go-controller/pkg/ovn/controller/admin_network_policy/baseline_admin_network_policy.go
@@ -106,6 +106,7 @@ func (c *Controller) clearBaselineAdminNetworkPolicy(banpName string) error {
 	if err != nil {
 		return fmt.Errorf("failed to delete address-sets for BANP %s: %w", banp.name, err)
 	}
+	updateANPRuleCountMetric(&adminNetworkPolicyState{}, true)
 	// we can delete the object from the cache now (set the cache back to empty value).
 	c.banpCache = &adminNetworkPolicyState{}
 	metrics.DecrementBANPCount()
@@ -153,6 +154,7 @@ func (c *Controller) ensureBaselineAdminNetworkPolicy(banp *anpapi.BaselineAdmin
 		// since transact was successful we can finally populate the cache
 		c.banpCache = desiredBANPState
 		metrics.IncrementBANPCount()
+		updateANPRuleCountMetric(desiredBANPState, true)
 		return nil
 	}
 	// BANP state existed in the cache, which means its either a BANP update or pod/namespace add/update/delete
diff --git a/go-controller/pkg/ovn/controller/admin_network_policy/metrics.go b/go-controller/pkg/ovn/controller/admin_network_policy/metrics.go
new file mode 100644
index 0000000000..6d7b09bd7d
--- /dev/null
+++ b/go-controller/pkg/ovn/controller/admin_network_policy/metrics.go
@@ -0,0 +1,126 @@
+package adminnetworkpolicy
+
+import (
+	"fmt"
+
+	libovsdbutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/util"
+	"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/collectors"
+)
+
+/*var metricANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+	Namespace: metrics.MetricOvnkubeNamespace,
+	Subsystem: metrics.MetricOvnkubeSubsystemController,
+	Name:      "admin_network_policy_rules", // doing a sum across all ANPs will give the absolute count in the cluster
+	Help:      "The total number of rules in a given admin network policy in the cluster"},
+	[]string{
+		"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
+		"action",    // action is either "Pass" or "Allow" or "Deny"; so cardinality is max 3 for this label
+	},
+)
+
+var metricBANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+	Namespace: metrics.MetricOvnkubeNamespace,
+	Subsystem: metrics.MetricOvnkubeSubsystemController,
+	Name:      "baseline_admin_network_policy_rules",
+	Help:      "The total number of rules in a given baseline admin network policy in the cluster"},
+	[]string{
+		"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
+		"action",    // action is either "Allow" or "Deny"; so cardinality is max 2 for this label
+	},
+)*/
+
+// Descriptors used by the ANPControllerCollector below.
+var (
+	anpRuleCountDesc = prometheus.NewDesc(
+		"admin_network_policy_rules",
+		"The total number of rules in a given admin network policy in the cluster",
+		[]string{"direction", "action"}, nil,
+	)
+)
+
+
+// ANPControllerCollector implements the Collector interface.
+type ANPControllerCollector struct {
+	ANPController *Controller
+}
+
+func (c *Controller) initMetricsCollector() {
+	reg := prometheus.NewPedanticRegistry()
+	cc := ANPControllerCollector{ANPController: c}
+	prometheus.WrapRegistererWith(prometheus.Labels{"zone": c.zone}, reg).MustRegister(cc)
+	// Add the standard process and Go metrics to the custom registry.
+	reg.MustRegister(
+		// expose process metrics like CPU, Memory, file descriptor usage etc.
+		collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
+		// expose Go runtime metrics like GC stats, memory stats etc.
+		collectors.NewGoCollector(),
+	)
+}
+
+// Describe is implemented with DescribeByCollect. That's possible because the
+// Collect method always emits the same single metric, described by
+// anpRuleCountDesc.
+func (cc ANPControllerCollector) Describe(ch chan<- *prometheus.Desc) {
+	prometheus.DescribeByCollect(cc, ch)
+}
+
+func (cc ANPControllerCollector) updateANPRuleCountMetric(anpCache map[string]*adminNetworkPolicyState) int {
+	total := 0
+	for _, state := range anpCache {
+		total += cc.updateANPGressRuleCountMetric(string(libovsdbutil.ACLIngress), state.ingressRules, false)
+	}
+	return total
+}
+
+func (cc ANPControllerCollector) updateANPGressRuleCountMetric(direction string, rules []*gressRule, isBanp bool) int {
+	var passCount, allowCount, denyCount int
+	for _, rule := range rules {
+		switch rule.action {
+		case nbdb.ACLActionAllowRelated:
+			allowCount++
+		case nbdb.ACLActionDrop:
+			denyCount++
+		case nbdb.ACLActionPass:
+			passCount++
+		default:
+			panic(fmt.Sprintf("Failed to count rule type: unknown acl action %s", rule.action))
+		}
+	}
+	return allowCount + passCount + denyCount
+	/*if isBanp {
+		cc.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionAllow), float64(allowCount))
+		cc.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionDeny), float64(denyCount))
+	} else {
+		cc.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionAllow), float64(allowCount))
+		cc.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionDeny), float64(denyCount))
+		cc.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionPass), float64(passCount))
+	}*/
+}
+
+// UpdateANPRuleCount records the number of AdminNetworkPolicy rules.
+/*func (cc ANPControllerCollector) UpdateANPRuleCount(direction, action string, count float64) {
+	metricANPRuleCount.WithLabelValues(direction, action).Set(count)
+}
+
+// UpdateBANPRuleCount records the number of BaselineAdminNetworkPolicy rules.
+func (cc ANPControllerCollector) UpdateBANPRuleCount(direction, action string, count float64) {
+	metricBANPRuleCount.WithLabelValues(direction, action).Set(count)
+}*/
+
+// Collect is invoked on every scrape: it derives the rule count from the
+// controller's in-memory anpCache and emits it as a constant metric.
+//
+// NOTE(review): Collect may run concurrently with controller reconciliation;
+// confirm anpCache is safe to read here (it is accessed without a lock).
+func (cc ANPControllerCollector) Collect(ch chan<- prometheus.Metric) {
+	ruleCount := cc.updateANPRuleCountMetric(cc.ANPController.anpCache)
+	ch <- prometheus.MustNewConstMetric(
+		anpRuleCountDesc,
+		prometheus.GaugeValue,
+		float64(ruleCount),
+		string(libovsdbutil.ACLIngress),
+		"Allow",
+	)
+}
diff --git a/go-controller/pkg/ovn/controller/admin_network_policy/utils.go b/go-controller/pkg/ovn/controller/admin_network_policy/utils.go
index ba02042d36..4254019932 100644
--- a/go-controller/pkg/ovn/controller/admin_network_policy/utils.go
+++ b/go-controller/pkg/ovn/controller/admin_network_policy/utils.go
@@ -7,6 +7,7 @@ import (
 	"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config"
 	libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops"
 	libovsdbutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/util"
+	"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics"
 	"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"
 	addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set"
 	"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util"
@@ -168,3 +169,32 @@ func getACLLoggingLevelsForANP(annotations map[string]string) (*libovsdbutil.ACL
 	}
 	return aclLogLevels, apierrors.NewAggregate(errors)
 }
+
+func updateANPRuleCountMetric(desiredState *adminNetworkPolicyState, isBanp bool) {
+	updateANPGressRuleCountMetric(string(libovsdbutil.ACLIngress), desiredState.ingressRules, isBanp)
+	updateANPGressRuleCountMetric(string(libovsdbutil.ACLEgress), desiredState.egressRules, isBanp)
+}
+
+func updateANPGressRuleCountMetric(direction string, rules []*gressRule, isBanp bool) {
+	var passCount, allowCount, denyCount int
+	for _, rule := range rules {
+		switch rule.action {
+		case nbdb.ACLActionAllowRelated:
+			allowCount++
+		case nbdb.ACLActionDrop:
+			denyCount++
+		case nbdb.ACLActionPass:
+			passCount++
+		default:
+			panic(fmt.Sprintf("Failed to count rule type: unknown acl action %s", rule.action))
+		}
+	}
+	if isBanp {
+		metrics.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionAllow), float64(allowCount))
+		metrics.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionDeny), float64(denyCount))
+	} else {
+		metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionAllow), float64(allowCount))
+		metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionDeny), float64(denyCount))
+		metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionPass), float64(passCount))
+	}
+}