Skip to content

Commit

Permalink
(B)ANP: Add rule count metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Surya Seetharaman <[email protected]>
  • Loading branch information
tssurya committed Apr 20, 2024
1 parent 83a598a commit 2944dc9
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 0 deletions.
34 changes: 34 additions & 0 deletions go-controller/pkg/metrics/ovnkube_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,28 @@ var metricBANPDBObjects = prometheus.NewGaugeVec(prometheus.GaugeOpts{
},
)

// metricANPRuleCount is a gauge vector holding admin network policy rule
// counts, partitioned by the "direction" and "action" labels. There is no
// per-policy label on this vector.
var metricANPRuleCount = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemController,
		// doing a sum across all ANPs will give the absolute count in the cluster
		Name: "admin_network_policy_rules",
		Help: "The total number of rules in a given admin network policy in the cluster",
	},
	// direction: either "ingress" or "egress", so cardinality is at most 2.
	// action: one of "Pass", "Allow" or "Deny", so cardinality is at most 3.
	[]string{"direction", "action"},
)

// metricBANPRuleCount is a gauge vector holding baseline admin network policy
// rule counts, partitioned by the "direction" and "action" labels.
var metricBANPRuleCount = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemController,
		Name:      "baseline_admin_network_policy_rules",
		Help:      "The total number of rules in a given baseline admin network policy in the cluster",
	},
	// direction: either "ingress" or "egress", so cardinality is at most 2.
	// action: either "Allow" or "Deny", so cardinality is at most 2.
	[]string{"direction", "action"},
)

/** AdminNetworkPolicyMetrics End**/

// metricFirstSeenLSPLatency is the time between a pod first seen in OVN-Kubernetes and its Logical Switch Port is created
Expand Down Expand Up @@ -432,6 +454,8 @@ func RegisterOVNKubeControllerFunctional() {
prometheus.MustRegister(metricEgressRoutingViaHost)
prometheus.MustRegister(metricANPCount)
prometheus.MustRegister(metricBANPCount)
//prometheus.MustRegister(metricANPRuleCount)
//prometheus.MustRegister(metricBANPRuleCount)
if err := prometheus.Register(MetricResourceRetryFailuresCount); err != nil {
if _, ok := err.(prometheus.AlreadyRegisteredError); !ok {
panic(err)
Expand Down Expand Up @@ -665,6 +689,16 @@ func DecrementBANPCount() {
metricBANPCount.Dec()
}

// UpdateANPRuleCount sets the admin network policy rule gauge identified by
// the given direction and action label values to count.
func UpdateANPRuleCount(direction, action string, count float64) {
	gauge := metricANPRuleCount.WithLabelValues(direction, action)
	gauge.Set(count)
}

// UpdateBANPRuleCount sets the baseline admin network policy rule gauge
// identified by the given direction and action label values to count.
func UpdateBANPRuleCount(direction, action string, count float64) {
	gauge := metricBANPRuleCount.WithLabelValues(direction, action)
	gauge.Set(count)
}

type (
timestampType int
operation int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ func (c *Controller) ensureAdminNetworkPolicy(anp *anpapi.AdminNetworkPolicy) er
// since transact was successful we can finally populate the cache
c.anpCache[anp.Name] = desiredANPState
metrics.IncrementANPCount()
updateANPRuleCountMetric(desiredANPState, false)
return nil
}
// ANP state existed in the cache, which means its either an ANP update or pod/namespace add/update/delete
Expand Down Expand Up @@ -405,6 +406,7 @@ func (c *Controller) clearAdminNetworkPolicy(anpName string) error {
if err != nil {
return fmt.Errorf("failed to delete address-sets for ANP %s/%d: %w", anp.name, anp.anpPriority, err)
}
updateANPRuleCountMetric(anp, false)
// we can delete the object from the cache now.
delete(c.anpPriorityMap, anp.anpPriority)
delete(c.anpCache, anpName)
Expand Down Expand Up @@ -547,6 +549,11 @@ func (c *Controller) updateExistingANP(currentANPState, desiredANPState *adminNe
if err != nil {
return fmt.Errorf("failed to create ACL-on-PG update ops for anp %s: %v", desiredANPState.name, err)
}
if fullPeerRecompute || atLeastOneRuleUpdated {
// this means either rules were inserted or deleted or the actions on the rules were updated
// let's update the rule count metrics
updateANPRuleCountMetric(desiredANPState, isBanp)
}
}

// Did the ANP.Spec.Subject Change?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ func NewController(

// TODO(tssurya): We don't use recorder now but will add events in future iterations
c.eventRecorder = recorder
c.initMetricsCollector()

return c, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ func (c *Controller) clearBaselineAdminNetworkPolicy(banpName string) error {
if err != nil {
return fmt.Errorf("failed to delete address-sets for BANP %s: %w", banp.name, err)
}
updateANPRuleCountMetric(banp, true)
// we can delete the object from the cache now (set the cache back to empty value).
c.banpCache = &adminNetworkPolicyState{}
metrics.DecrementBANPCount()
Expand Down Expand Up @@ -153,6 +154,7 @@ func (c *Controller) ensureBaselineAdminNetworkPolicy(banp *anpapi.BaselineAdmin
// since transact was successful we can finally populate the cache
c.banpCache = desiredBANPState
metrics.IncrementBANPCount()
updateANPRuleCountMetric(currentBANPState, true)
return nil
}
// BANP state existed in the cache, which means its either a BANP update or pod/namespace add/update/delete
Expand Down
126 changes: 126 additions & 0 deletions go-controller/pkg/ovn/controller/admin_network_policy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package adminnetworkpolicy

import (
"fmt"

libovsdbutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/util"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
)

/*var metricANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metrics.MetricOvnkubeNamespace,
Subsystem: metrics.MetricOvnkubeSubsystemController,
Name: "admin_network_policy_rules", // doing a sum across all ANPs will give the absolute count in the cluster
Help: "The total number of rules in a given admin network policy in the cluster"},
[]string{
"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
"action", // action is either "Pass" or "Allow" or "Deny"; so cardinality is max 3 for this label
},
)
var metricBANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metrics.MetricOvnkubeNamespace,
Subsystem: metrics.MetricOvnkubeSubsystemController,
Name: "baseline_admin_network_policy_rules",
Help: "The total number of rules in a given baseline admin network policy in the cluster"},
[]string{
"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
"action", // action is either "Allow" or "Deny"; so cardinality is max 2 for this label
},
)*/

// anpRuleCountDesc is the descriptor of the rule-count metric sent by
// ANPControllerCollector.Collect. Its variable labels are "direction" and
// "action".
//
// NOTE(review): the metric name is passed as-is, without a namespace or
// subsystem prefix — confirm that this is intended.
var anpRuleCountDesc = prometheus.NewDesc(
	"admin_network_policy_rules",
	"The total number of rules in a given admin network policy in the cluster",
	[]string{"direction", "action"},
	nil, // no constant labels at descriptor level
)


// ANPControllerCollector implements the prometheus.Collector interface
// (through its Describe and Collect methods) for an ANP controller.
type ANPControllerCollector struct {
// ANPController is the controller whose anpCache is read by Collect.
ANPController *Controller
}

// initMetricsCollector builds a pedantic Prometheus registry and registers
// an ANPControllerCollector for this controller on it, with every metric of
// that collector carrying a constant "zone" label set to c.zone. The standard
// process and Go runtime collectors are registered on the same registry.
//
// Any registration failure panics (MustRegister).
//
// NOTE(review): the registry is local to this function and is neither stored
// nor returned, so nothing visible here exposes it for scraping — confirm how
// these metrics are meant to be gathered.
func (c *Controller) initMetricsCollector() {
	registry := prometheus.NewPedanticRegistry()

	// The ANP collector goes through a wrapper so that the zone label is
	// attached without the collector having to know about it.
	zoneLabels := prometheus.Labels{"zone": c.zone}
	zonedRegisterer := prometheus.WrapRegistererWith(zoneLabels, registry)
	zonedRegisterer.MustRegister(ANPControllerCollector{ANPController: c})

	// Process metrics (CPU, memory, file descriptors, ...) and Go runtime
	// metrics (GC stats, memory stats, ...) are registered without the zone label.
	processCollector := collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})
	goCollector := collectors.NewGoCollector()
	registry.MustRegister(processCollector, goCollector)
}

// Describe implements prometheus.Collector by delegating to
// prometheus.DescribeByCollect, which obtains the descriptors by running
// Collect. That is possible because Collect only ever sends metrics built
// from a single, fixed descriptor (anpRuleCountDesc).
func (cc ANPControllerCollector) Describe(ch chan<- *prometheus.Desc) {
prometheus.DescribeByCollect(cc, ch)
}

// updateANPRuleCountMetric returns the number of ingress rules summed over
// every policy state in anpCache. An empty or nil cache yields 0.
//
// Only ingress rules are included; egress rules are not counted here.
//
// The previous implementation returned from inside the range loop, so it
// reported the ingress rule count of a single, arbitrarily chosen cache entry
// (map iteration order is random) instead of the total.
func (cc ANPControllerCollector) updateANPRuleCountMetric(anpCache map[string]*adminNetworkPolicyState) int {
	total := 0
	for _, state := range anpCache {
		total += cc.updateANPGressRuleCountMetric(string(libovsdbutil.ACLIngress), state.ingressRules, false)
	}
	return total
}

// updateANPGressRuleCountMetric returns how many entries of rules carry one of
// the known ACL actions (allow-related, drop or pass). Since any other action
// panics, the result equals len(rules) whenever the function returns.
//
// The direction and isBanp parameters are currently not used by the body.
func (cc ANPControllerCollector) updateANPGressRuleCountMetric(direction string, rules []*gressRule, isBanp bool) int {
	total := 0
	for i := range rules {
		switch action := rules[i].action; action {
		case nbdb.ACLActionAllowRelated, nbdb.ACLActionDrop, nbdb.ACLActionPass:
			total++
		default:
			// An action outside the three known values is treated as a broken invariant.
			panic(fmt.Sprintf("Failed to count rule type: unknown acl action %s", action))
		}
	}
	return total
}

// UpdateANPRuleCount records the number of AdminNetworkPolicy rules.
/*func (cc ANPControllerCollector) UpdateANPRuleCount(direction, action string, count float64) {
metricANPRuleCount.WithLabelValues(direction, action).Set(count)
}
// UpdateBANPRuleCount records the number of BaselineAdminNetworkPolicy rules.
func (cc ANPControllerCollector) UpdateBANPRuleCount(direction, action string, count float64) {
metricBANPRuleCount.WithLabelValues(direction, action).Set(count)
}*/

// Collect implements prometheus.Collector. It tallies the ingress and egress
// rules of every policy state in the controller's anpCache by ACL action and
// sends one sample per (direction, action) pair on ch, so each sample is the
// total over all cached policies for that pair.
//
// Samples are gauges rather than counters: the number of rules shrinks when
// rules or policies are deleted, which a counter must never do.
//
// NOTE(review): anpCache is read here without any locking, while Collect may
// be called concurrently with whatever mutates the cache — confirm which lock
// on Controller has to be held.
func (cc ANPControllerCollector) Collect(ch chan<- prometheus.Metric) {
	var ingress, egress anpRuleTally
	for _, state := range cc.ANPController.anpCache {
		ingress.add(state.ingressRules)
		egress.add(state.egressRules)
	}
	ingress.send(ch, string(libovsdbutil.ACLIngress))
	egress.send(ch, string(libovsdbutil.ACLEgress))
}

// anpRuleTally counts the rules of one traffic direction per rule action.
type anpRuleTally struct {
	allow, deny, pass int
}

// add folds rules into the tally. It panics on an ACL action other than
// allow-related, drop or pass, since that indicates a broken invariant.
func (t *anpRuleTally) add(rules []*gressRule) {
	for _, rule := range rules {
		switch rule.action {
		case nbdb.ACLActionAllowRelated:
			t.allow++
		case nbdb.ACLActionDrop:
			t.deny++
		case nbdb.ACLActionPass:
			t.pass++
		default:
			panic(fmt.Sprintf("Failed to count rule type: unknown acl action %s", rule.action))
		}
	}
}

// send emits the tally on ch as three gauge samples for the given direction,
// one per action label value ("Allow", "Deny", "Pass").
func (t anpRuleTally) send(ch chan<- prometheus.Metric, direction string) {
	samples := []struct {
		action string
		count  int
	}{
		{"Allow", t.allow},
		{"Deny", t.deny},
		{"Pass", t.pass},
	}
	for _, s := range samples {
		ch <- prometheus.MustNewConstMetric(
			anpRuleCountDesc,
			prometheus.GaugeValue,
			float64(s.count),
			direction,
			s.action,
		)
	}
}
30 changes: 30 additions & 0 deletions go-controller/pkg/ovn/controller/admin_network_policy/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config"
libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops"
libovsdbutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/util"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"
addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util"
Expand Down Expand Up @@ -168,3 +169,32 @@ func getACLLoggingLevelsForANP(annotations map[string]string) (*libovsdbutil.ACL
}
return aclLogLevels, apierrors.NewAggregate(errors)
}

// updateANPRuleCountMetric publishes the rule-count metrics for desiredState,
// first for its ingress rules and then for its egress rules. isBanp is passed
// through unchanged to updateANPGressRuleCountMetric.
func updateANPRuleCountMetric(desiredState *adminNetworkPolicyState, isBanp bool) {
	perDirection := []struct {
		direction string
		rules     []*gressRule
	}{
		{string(libovsdbutil.ACLIngress), desiredState.ingressRules},
		{string(libovsdbutil.ACLEgress), desiredState.egressRules},
	}
	for _, gress := range perDirection {
		updateANPGressRuleCountMetric(gress.direction, gress.rules, isBanp)
	}
}

// updateANPGressRuleCountMetric counts rules by ACL action (allow-related,
// drop, pass) and writes the counts to the rule-count gauges for direction.
//
// When isBanp is true the Allow and Deny counts go to the baseline admin
// network policy gauges and the pass count is not reported; otherwise the
// Allow, Deny and Pass counts go to the admin network policy gauges.
//
// It panics if a rule carries any other ACL action.
func updateANPGressRuleCountMetric(direction string, rules []*gressRule, isBanp bool) {
	var allowed, denied, passed int
	for idx := range rules {
		switch action := rules[idx].action; action {
		case nbdb.ACLActionPass:
			passed++
		case nbdb.ACLActionDrop:
			denied++
		case nbdb.ACLActionAllowRelated:
			allowed++
		default:
			panic(fmt.Sprintf("Failed to count rule type: unknown acl action %s", action))
		}
	}

	if isBanp {
		metrics.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionAllow), float64(allowed))
		metrics.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionDeny), float64(denied))
		return
	}
	metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionAllow), float64(allowed))
	metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionDeny), float64(denied))
	metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionPass), float64(passed))
}

0 comments on commit 2944dc9

Please sign in to comment.