Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DNM] Pre merge testing server side apply; Unblock QE #2116

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions go-controller/pkg/metrics/ovnkube_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,28 @@ var metricBANPDBObjects = prometheus.NewGaugeVec(prometheus.GaugeOpts{
},
)

// metricANPRuleCount is a gauge vector holding admin network policy rule
// counts, partitioned by the "direction" and "action" labels.
var metricANPRuleCount = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemController,
		// doing a sum across all ANPs will give the absolute count in the cluster
		Name: "admin_network_policy_rules",
		Help: "The total number of rules in a given admin network policy in the cluster",
	},
	[]string{
		// direction is either "ingress" or "egress"; so cardinality is max 2 for this label
		"direction",
		// action is either "Pass" or "Allow" or "Deny"; so cardinality is max 3 for this label
		"action",
	},
)

// metricBANPRuleCount is a gauge vector holding baseline admin network policy
// rule counts, partitioned by the "direction" and "action" labels.
var metricBANPRuleCount = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: MetricOvnkubeNamespace,
		Subsystem: MetricOvnkubeSubsystemController,
		Name:      "baseline_admin_network_policy_rules",
		Help:      "The total number of rules in a given baseline admin network policy in the cluster",
	},
	[]string{
		// direction is either "ingress" or "egress"; so cardinality is max 2 for this label
		"direction",
		// action is either "Allow" or "Deny"; so cardinality is max 2 for this label
		"action",
	},
)

/** AdminNetworkPolicyMetrics End**/

// metricFirstSeenLSPLatency is the time between a pod first seen in OVN-Kubernetes and its Logical Switch Port is created
Expand Down Expand Up @@ -432,6 +454,8 @@ func RegisterOVNKubeControllerFunctional() {
prometheus.MustRegister(metricEgressRoutingViaHost)
prometheus.MustRegister(metricANPCount)
prometheus.MustRegister(metricBANPCount)
//prometheus.MustRegister(metricANPRuleCount)
//prometheus.MustRegister(metricBANPRuleCount)
if err := prometheus.Register(MetricResourceRetryFailuresCount); err != nil {
if _, ok := err.(prometheus.AlreadyRegisteredError); !ok {
panic(err)
Expand Down Expand Up @@ -665,6 +689,16 @@ func DecrementBANPCount() {
metricBANPCount.Dec()
}

// UpdateANPRuleCount sets the AdminNetworkPolicy rule-count gauge identified
// by the given direction and action label values to count.
func UpdateANPRuleCount(direction, action string, count float64) {
	gauge := metricANPRuleCount.WithLabelValues(direction, action)
	gauge.Set(count)
}

// UpdateBANPRuleCount sets the BaselineAdminNetworkPolicy rule-count gauge
// identified by the given direction and action label values to count.
func UpdateBANPRuleCount(direction, action string, count float64) {
	gauge := metricBANPRuleCount.WithLabelValues(direction, action)
	gauge.Set(count)
}

type (
timestampType int
operation int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ func (c *Controller) ensureAdminNetworkPolicy(anp *anpapi.AdminNetworkPolicy) er
// since transact was successful we can finally populate the cache
c.anpCache[anp.Name] = desiredANPState
metrics.IncrementANPCount()
updateANPRuleCountMetric(desiredANPState, false)
return nil
}
// ANP state existed in the cache, which means its either an ANP update or pod/namespace add/update/delete
Expand Down Expand Up @@ -405,6 +406,7 @@ func (c *Controller) clearAdminNetworkPolicy(anpName string) error {
if err != nil {
return fmt.Errorf("failed to delete address-sets for ANP %s/%d: %w", anp.name, anp.anpPriority, err)
}
updateANPRuleCountMetric(anp, false)
// we can delete the object from the cache now.
delete(c.anpPriorityMap, anp.anpPriority)
delete(c.anpCache, anpName)
Expand Down Expand Up @@ -547,6 +549,11 @@ func (c *Controller) updateExistingANP(currentANPState, desiredANPState *adminNe
if err != nil {
return fmt.Errorf("failed to create ACL-on-PG update ops for anp %s: %v", desiredANPState.name, err)
}
if fullPeerRecompute || atLeastOneRuleUpdated {
// this means either rules were inserted or deleted or the actions on the rules were updated
// let's update the rule count metrics
updateANPRuleCountMetric(desiredANPState, isBanp)
}
}

// Did the ANP.Spec.Subject Change?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ func NewController(

// TODO(tssurya): We don't use recorder now but will add events in future iterations
c.eventRecorder = recorder
c.initMetricsCollector()

return c, nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ func (c *Controller) clearBaselineAdminNetworkPolicy(banpName string) error {
if err != nil {
return fmt.Errorf("failed to delete address-sets for BANP %s: %w", banp.name, err)
}
updateANPRuleCountMetric(banp, true)
// we can delete the object from the cache now (set the cache back to empty value).
c.banpCache = &adminNetworkPolicyState{}
metrics.DecrementBANPCount()
Expand Down Expand Up @@ -153,6 +154,7 @@ func (c *Controller) ensureBaselineAdminNetworkPolicy(banp *anpapi.BaselineAdmin
// since transact was successful we can finally populate the cache
c.banpCache = desiredBANPState
metrics.IncrementBANPCount()
updateANPRuleCountMetric(currentBANPState, true)
return nil
}
// BANP state existed in the cache, which means its either a BANP update or pod/namespace add/update/delete
Expand Down
126 changes: 126 additions & 0 deletions go-controller/pkg/ovn/controller/admin_network_policy/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package adminnetworkpolicy

import (
"fmt"

libovsdbutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/util"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
)

/*var metricANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metrics.MetricOvnkubeNamespace,
Subsystem: metrics.MetricOvnkubeSubsystemController,
Name: "admin_network_policy_rules", // doing a sum across all ANPs will give the absolute count in the cluster
Help: "The total number of rules in a given admin network policy in the cluster"},
[]string{
"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
"action", // action is either "Pass" or "Allow" or "Deny"; so cardinality is max 3 for this label
},
)

var metricBANPRuleCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metrics.MetricOvnkubeNamespace,
Subsystem: metrics.MetricOvnkubeSubsystemController,
Name: "baseline_admin_network_policy_rules",
Help: "The total number of rules in a given baseline admin network policy in the cluster"},
[]string{
"direction", // direction is either "ingress" or "egress"; so cardinality is max 2 for this label
"action", // action is either "Allow" or "Deny"; so cardinality is max 2 for this label
},
)*/

// anpRuleCountDesc describes the metric emitted by
// ANPControllerCollector.Collect; it declares the "direction" and "action"
// variable labels and no constant labels.
var anpRuleCountDesc = prometheus.NewDesc(
	"admin_network_policy_rules",
	"The total number of rules in a given admin network policy in the cluster",
	[]string{"direction", "action"},
	nil,
)


// ANPControllerCollector implements the prometheus Collector interface (via
// its Describe and Collect methods) on top of an admin network policy
// controller.
type ANPControllerCollector struct {
// ANPController is the controller whose anpCache is read by Collect.
ANPController *Controller
}

// initMetricsCollector builds an ANPControllerCollector for this controller
// and registers it, labelled with the controller's zone, on a freshly created
// pedantic registry together with the process and Go runtime collectors.
// Any registration failure panics (MustRegister).
//
// NOTE(review): the registry is a local variable that is neither stored nor
// returned, so nothing visible here can gather from it — confirm how these
// metrics are meant to be exposed.
func (c *Controller) initMetricsCollector() {
	registry := prometheus.NewPedanticRegistry()

	// The ANP collector goes through a wrapper that attaches the zone label.
	zoneLabels := prometheus.Labels{"zone": c.zone}
	zoned := prometheus.WrapRegistererWith(zoneLabels, registry)
	zoned.MustRegister(ANPControllerCollector{ANPController: c})

	// Standard collectors: process metrics (CPU, memory, file descriptor
	// usage, ...) and Go runtime metrics (GC stats, memory stats, ...).
	processCollector := collectors.NewProcessCollector(collectors.ProcessCollectorOpts{})
	goCollector := collectors.NewGoCollector()
	registry.MustRegister(processCollector, goCollector)
}

// Describe is implemented with DescribeByCollect, which derives the
// descriptors from whatever Collect emits. Collect currently emits a single
// metric that uses anpRuleCountDesc.
func (cc ANPControllerCollector) Describe(ch chan<- *prometheus.Desc) {
prometheus.DescribeByCollect(cc, ch)
}

// updateANPRuleCountMetric returns the number of ingress rules summed over
// every admin network policy state in anpCache. A nil or empty cache yields 0.
//
// TODO: egress rules are not counted yet.
func (cc ANPControllerCollector) updateANPRuleCountMetric(anpCache map[string]*adminNetworkPolicyState) int {
	total := 0
	for _, state := range anpCache {
		// Accumulate rather than return from the first iteration: map
		// iteration order is unspecified, so returning early would report the
		// count of one arbitrary policy and vary between calls.
		total += cc.updateANPGressRuleCountMetric(string(libovsdbutil.ACLIngress), state.ingressRules, false)
	}
	return total
}

// updateANPGressRuleCountMetric returns the number of rules in rules whose
// action is allow-related, drop or pass. It panics as soon as it meets a rule
// with any other action.
//
// direction and isBanp are accepted but not used by the current body.
// TODO: report the allow/deny/pass counts separately per direction instead of
// only returning their sum.
func (cc ANPControllerCollector) updateANPGressRuleCountMetric(direction string, rules []*gressRule, isBanp bool) int {
	total := 0
	for _, r := range rules {
		switch r.action {
		case nbdb.ACLActionAllowRelated, nbdb.ACLActionDrop, nbdb.ACLActionPass:
			total++
		default:
			panic(fmt.Sprintf("Failed to count rule type: unknown acl action %s", r.action))
		}
	}
	return total
}

// UpdateANPRuleCount records the number of AdminNetworkPolicy rules.
/*func (cc ANPControllerCollector) UpdateANPRuleCount(direction, action string, count float64) {
metricANPRuleCount.WithLabelValues(direction, action).Set(count)
}

// UpdateBANPRuleCount records the number of BaselineAdminNetworkPolicy rules.
func (cc ANPControllerCollector) UpdateBANPRuleCount(direction, action string, count float64) {
metricBANPRuleCount.WithLabelValues(direction, action).Set(count)
}*/

// Collect obtains the current rule count from the controller's anpCache via
// updateANPRuleCountMetric and emits it as one constant metric described by
// anpRuleCountDesc, labelled with the ingress direction and the "Allow" action.
//
// The value is exported as a gauge: a rule count goes down when rules or
// policies are removed, which a Prometheus counter must never do.
//
// NOTE(review): Collect can be called concurrently with the controller
// mutating anpCache and no locking is visible here — confirm that access to
// the cache is synchronized.
func (cc ANPControllerCollector) Collect(ch chan<- prometheus.Metric) {
	ruleCount := cc.updateANPRuleCountMetric(cc.ANPController.anpCache)
	ch <- prometheus.MustNewConstMetric(
		anpRuleCountDesc,
		prometheus.GaugeValue,
		float64(ruleCount),
		string(libovsdbutil.ACLIngress),
		"Allow",
	)
}
30 changes: 30 additions & 0 deletions go-controller/pkg/ovn/controller/admin_network_policy/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config"
libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops"
libovsdbutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/util"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"
addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set"
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util"
Expand Down Expand Up @@ -168,3 +169,32 @@ func getACLLoggingLevelsForANP(annotations map[string]string) (*libovsdbutil.ACL
}
return aclLogLevels, apierrors.NewAggregate(errors)
}

// updateANPRuleCountMetric refreshes the rule-count metrics for the ingress
// rules and then the egress rules of desiredState. isBanp is forwarded so the
// baseline admin network policy metric is updated instead of the admin
// network policy one.
func updateANPRuleCountMetric(desiredState *adminNetworkPolicyState, isBanp bool) {
	ingress, egress := string(libovsdbutil.ACLIngress), string(libovsdbutil.ACLEgress)
	updateANPGressRuleCountMetric(ingress, desiredState.ingressRules, isBanp)
	updateANPGressRuleCountMetric(egress, desiredState.egressRules, isBanp)
}

// updateANPGressRuleCountMetric tallies rules by ACL action (allow-related,
// drop, pass) and publishes the tallies for the given direction. It panics on
// a rule with any other action.
//
// When isBanp is true only the Allow and Deny counts are published, through
// metrics.UpdateBANPRuleCount; otherwise Allow, Deny and Pass are published
// through metrics.UpdateANPRuleCount.
func updateANPGressRuleCountMetric(direction string, rules []*gressRule, isBanp bool) {
	var allowed, denied, passed int
	for _, r := range rules {
		switch r.action {
		case nbdb.ACLActionPass:
			passed++
		case nbdb.ACLActionDrop:
			denied++
		case nbdb.ACLActionAllowRelated:
			allowed++
		default:
			panic(fmt.Sprintf("Failed to count rule type: unknown acl action %s", r.action))
		}
	}

	if !isBanp {
		metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionAllow), float64(allowed))
		metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionDeny), float64(denied))
		metrics.UpdateANPRuleCount(direction, string(anpapi.AdminNetworkPolicyRuleActionPass), float64(passed))
		return
	}

	// Baseline policies publish no Pass count.
	metrics.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionAllow), float64(allowed))
	metrics.UpdateBANPRuleCount(direction, string(anpapi.BaselineAdminNetworkPolicyRuleActionDeny), float64(denied))
}