Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CNF-11231: Compare Multus and SR-IOV metrics #2025

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 268 additions & 0 deletions cnf-tests/testsuites/e2esuite/metrics/sriovnetworkmetricsexporter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
package metrics

import (
"context"
"encoding/json"
"fmt"
"net/url"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

sriovtestclient "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/client"
sriovcluster "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster"
sriovnamespaces "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
sriovnetwork "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/network"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/client"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/discovery"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/images"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/namespaces"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/networks"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/pods"

"github.com/prometheus/common/model"
)

// testNamespace is the namespace in which this spec creates its SR-IOV network
// attachment and its netcat test pods.
const testNamespace string = "test-sriov-metrics"

// sriovclient is the SR-IOV operator test client shared by the spec and the
// helpers in this file.
var sriovclient *sriovtestclient.ClientSet

// init builds the shared SR-IOV client when the package is loaded.
// NOTE(review): an empty string is passed to New; presumably this selects the
// default kubeconfig lookup - confirm against the sriov test client.
func init() {
sriovclient = sriovtestclient.New("")
}

// Ginkgo spec for the SR-IOV network metrics exporter. It starts two netcat pods
// on an SR-IOV network and checks that the exporter's per-VF transmit counters
// match the container_network_* transmit counters reported for the same pods.
var _ = Describe("[sriov] SR-IOV Network Metrics Exporter", func() {

// Nodes/devices discovered in BeforeEach and consumed by the spec below.
var sriovCapableNodes *sriovcluster.EnabledNodes

BeforeEach(func() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this one be BeforeAll if in the future we add more tests?

if discovery.Enabled() {
Skip("Discovery mode not supported")
}

// Enable the metricsExporter feature gate and register the returned restore
// function so the operator config is put back during cleanup.
restoreFeatureGates := enableMetricsExporterFeatureGate()
DeferCleanup(restoreFeatureGates)

By("Adding monitoring label to " + namespaces.SRIOVOperator)
err := sriovnamespaces.AddLabel(sriovclient, context.Background(), namespaces.SRIOVOperator, "openshift.io/cluster-monitoring", "true")
Expect(err).ToNot(HaveOccurred())

By("Clean SRIOV policies and networks")
networks.CleanSriov(sriovclient)

By("Discover SRIOV devices")
sriovCapableNodes, err = sriovcluster.DiscoverSriov(sriovclient, namespaces.SRIOVOperator)
Expect(err).ToNot(HaveOccurred())

// Make sure the test namespace exists.
// NOTE(review): CleanPods presumably removes pods left over from earlier runs - confirm.
err = namespaces.Create(testNamespace, client.Client)
Expect(err).ToNot(HaveOccurred())
namespaces.CleanPods(testNamespace, client.Client)
})

It("should provide the same metrics as network-metrics-daemon", func() {
testNode, testDevice, err := sriovCapableNodes.FindOneSriovNodeAndDevice()
Expect(err).ToNot(HaveOccurred())
By("Using device " + testDevice.Name + " on node " + testNode)

// NOTE(review): the literal 8 is presumably the number of VFs requested by the
// policy - confirm against sriovnetwork.CreateSriovPolicy.
sriovNetworkNodePolicy, err := sriovnetwork.CreateSriovPolicy(
sriovclient, "test-metrics-", namespaces.SRIOVOperator,
testDevice.Name, testNode, 8,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think most of the tests use 5 vfs can we maintain the same number? It should help us if the device is a Mellanox card to have fewer reboots.

"testsriovmetricsresource", "netdevice",
)
Expect(err).ToNot(HaveOccurred())
DeferCleanup(sriovclient.Delete, context.Background(), sriovNetworkNodePolicy)

// The network name and the 192.0.2.0/24 subnet must agree with the pod
// annotations used in makeClientAndServerNetcatPod.
ipam := `{ "type": "host-local", "subnet": "192.0.2.0/24" }`
err = sriovnetwork.CreateSriovNetwork(sriovclient, testDevice, "test-metrics-network",
testNamespace, namespaces.SRIOVOperator, "testsriovmetricsresource", ipam)
Expect(err).ToNot(HaveOccurred())

// NOTE(review): makeClientAndServerNetcatPod returns (clientPod, serverPod), so the
// two names below are bound in the opposite order. This is harmless today because
// both pods go through exactly the same checks, but the names are misleading.
serverPod, clientPod := makeClientAndServerNetcatPod()

// Do not verify pairs
// "container_network_receive_packets_total": "sriov_vf_rx_packets",
// "container_network_receive_bytes_total": "sriov_vf_rx_bytes",
// because there might be traffic on the wire that disturbs the counters.
// An example is a DHCP traffic that other nodes are producing, e.g. (tcpdump):
//
// 13:28:00.442893 04:3f:72:fe:d1:d1 > ff:ff:ff:ff:ff:ff, ethertype IPv4 (0x0800), length 327: 0.0.0.0.68 > 255.255.255.255.67: BOOTP/DHCP, Request from 04:3f:72:fe:d1:d1, length 285
// Maps each container-level metric name to the SR-IOV exporter metric it is compared with.
metricsToMatch := map[string]string{
"container_network_transmit_packets_total": "sriov_vf_tx_packets",
"container_network_transmit_bytes_total": "sriov_vf_tx_bytes",
}
// Query templates; the placeholders are (metric name, pod name).
// containerQuery matches the container metric with pod_network_name_info for the
// pod's net1 interface; sriovQuery matches the VF metric with sriov_kubepoddevice
// on pciAddr to select the VF attached to the pod.
// NOTE(review): the "+" / "*" joins presumably rely on the info series having
// value 0 / 1 respectively so the metric value is unchanged - confirm.
containerQuery := `%s + on(namespace,pod,interface) group_left(network_name) (pod_network_name_info{interface="net1",pod="%s"})`
sriovQuery := `%s * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice{pod="%s"}`

// Every metric pair is verified for both pods.
for containerMetricName, sriovMetricName := range metricsToMatch {
By(fmt.Sprintf("verifying metrics %s == %s", containerMetricName, sriovMetricName))
assertPromQLHasTheSameResult(
fmt.Sprintf(containerQuery, containerMetricName, serverPod.Name),
fmt.Sprintf(sriovQuery, sriovMetricName, serverPod.Name),
)

assertPromQLHasTheSameResult(
fmt.Sprintf(containerQuery, containerMetricName, clientPod.Name),
fmt.Sprintf(sriovQuery, sriovMetricName, clientPod.Name),
)
}
})
})

// makeClientAndServerNetcatPod starts two privileged netcat pods in testNamespace,
// both attached to the "test-metrics-network" secondary network with fixed
// addresses: a server (192.0.2.101) listening on TCP port 5000, then a client
// (192.0.2.102) that connects to it. Any creation error fails the running spec.
//
// It returns the client pod first and the server pod second.
// NOTE(review): the call site visible in this file binds the results as
// `serverPod, clientPod`, i.e. in the opposite order.
func makeClientAndServerNetcatPod() (*corev1.Pod, *corev1.Pod) {
	// startNetcatPod defines a pod with the given network annotation and extra
	// container, then creates it through pods.CreateAndStart.
	startNetcatPod := func(generateName, networkAnnotation string, container corev1.Container) *corev1.Pod {
		definition := pods.DefinePod(testNamespace)
		definition.GenerateName = generateName
		definition = pods.RedefinePodWithNetwork(definition, networkAnnotation)
		definition.Spec.Containers = append(definition.Spec.Containers, container)

		started, err := pods.CreateAndStart(definition)
		Expect(err).ToNot(HaveOccurred())
		return started
	}

	// The server is started first so it is already listening when the client
	// connects.
	ncServer := startNetcatPod(
		"testpod-nc-server-",
		`[{"name": "test-metrics-network","ips":["192.0.2.101/24"]}]`,
		corev1.Container{
			Name:            "netcat-tcp-server",
			Image:           images.For(images.TestUtils),
			Command:         []string{"nc", "-vv", "--keep-open", "--listen", "5000"},
			SecurityContext: &corev1.SecurityContext{Privileged: ptr.To(true)},
		},
	)

	ncClient := startNetcatPod(
		"testpod-nc-client-",
		`[{"name": "test-metrics-network","ips":["192.0.2.102/24"]}]`,
		corev1.Container{
			Name:            "netcat-tcp-client",
			Image:           images.For(images.TestUtils),
			Command:         makeNetcatClientCommand("192.0.2.101 5000"),
			SecurityContext: &corev1.SecurityContext{Privileged: ptr.To(true)},
		},
	)

	return ncClient, ncServer
}

// makeNetcatClientCommand returns the container command for the netcat client:
// a bash script that waits 10 seconds, pipes a 1000-character zero-padded number
// (printf %01000d) into `nc -w 1 <targetIpAddress>`, and then sleeps forever.
//
// targetIpAddress is inserted verbatim as the nc arguments, so it carries both
// the host and the port (e.g. "192.0.2.101 5000").
func makeNetcatClientCommand(targetIpAddress string) []string {
	// "%%" is a literal percent sign for the shell's printf; "%s" is the nc target.
	const scriptFormat = "\nsleep 10;\nprintf %%01000d 1 | nc -w 1 %s;\nsleep inf\n"

	return []string{"bash", "-xec", fmt.Sprintf(scriptFormat, targetIpAddress)}
}

// runPromQLQuery evaluates query as a PromQL instant query and returns the
// resulting vector.
//
// The query is sent by running `curl` against localhost:9090/api/v1/query inside
// the first pod labelled app.kubernetes.io/component=prometheus (the pod list is
// requested with an empty namespace), and the JSON response is decoded into a
// model.Vector.
//
// An error is returned when no Prometheus pod can be listed or found, when the
// command execution fails, when the response is not valid JSON for the expected
// shape, or when the response status is anything other than "success".
func runPromQLQuery(query string) (model.Vector, error) {
	prometheusPods, err := client.Client.Pods("").List(context.Background(), metav1.ListOptions{
		LabelSelector: "app.kubernetes.io/component=prometheus",
	})
	if err != nil {
		return nil, fmt.Errorf("can't find a Prometheus pod: %w", err)
	}

	if len(prometheusPods.Items) == 0 {
		return nil, fmt.Errorf("no instance of Prometheus found")
	}

	prometheusPod := prometheusPods.Items[0]

	// Named queryURL (not url) so the local does not shadow the net/url package.
	queryURL := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode())
	command := []string{"curl", queryURL}
	outputBuffer, err := pods.ExecCommand(client.Client, prometheusPod, command)
	if err != nil {
		return nil, fmt.Errorf("promQL query : [%s/%s] command: [%v]\nout: %s\n%w",
			prometheusPod.Namespace, prometheusPod.Name, command, outputBuffer.String(), err)
	}

	result := struct {
		Status string `json:"status"`
		Data   struct {
			ResultType string       `json:"resultType"`
			Result     model.Vector `json:"result"`
		} `json:"data"`
	}{}

	// The decode error must be captured here: previously the return value of
	// json.Unmarshal was dropped and the stale (nil) exec error was re-checked,
	// so malformed responses were never reported as unmarshal failures.
	if err := json.Unmarshal(outputBuffer.Bytes(), &result); err != nil {
		return nil, fmt.Errorf("can't unmarshal PromQL result: query[%s] response[%s] error: %w", query, outputBuffer.String(), err)
	}
	if result.Status != "success" {
		return nil, fmt.Errorf("PromQL statement failed: query[%s] result[%v]", query, result)
	}

	return result.Data.Result, nil
}

// enableMetricsExporterFeatureGate makes sure the "metricsExporter" feature gate is
// set on the "default" SriovOperatorConfig in the SR-IOV operator namespace.
//
// It returns a function that restores the feature gates that were present before
// the call. When the gate was already enabled nothing is changed and the returned
// function is a no-op. Any API error fails the running spec through Expect.
func enableMetricsExporterFeatureGate() func() {

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: remove empty line

operatorConfig, err := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())

// Keep a copy of the current feature gates so they can be restored later.
// The copy is always a non-nil map, even when the original map is nil.
oldFeatureGates := make(map[string]bool)
for k, v := range operatorConfig.Spec.FeatureGates {
oldFeatureGates[k] = v
}

// Writing to a nil map panics, so allocate one before setting the gate.
if operatorConfig.Spec.FeatureGates == nil {
operatorConfig.Spec.FeatureGates = make(map[string]bool)
}

if operatorConfig.Spec.FeatureGates["metricsExporter"] {
// The feature is already enabled: nothing to do
return func() {}
}

By("Enabling metricsExporter feature gate")
operatorConfig.Spec.FeatureGates["metricsExporter"] = true

_, err = sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), operatorConfig, metav1.UpdateOptions{})
Expect(err).ToNot(HaveOccurred())

return func() {
By("Resetting feature gate to its previous value")
// Fetch the config again instead of reusing the object captured above, then
// replace the whole feature-gate map with the saved copy.
operatorConfig, err := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())

operatorConfig.Spec.FeatureGates = oldFeatureGates
_, err = sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), operatorConfig, metav1.UpdateOptions{})
Expect(err).ToNot(HaveOccurred())
}
}

// assertPromQLHasTheSameResult evaluates both PromQL queries and checks if both return the same value.
func assertPromQLHasTheSameResult(queryA, queryB string) {
failedValues := "time A - B\n "

Eventually(func(g Gomega) {
samplesA, errA := runPromQLQuery(queryA)
samplesB, errB := runPromQLQuery(queryB)

failedValues += fmt.Sprintf("%s %v - %v\n", time.Now().Format(time.StampMilli), samplesA, samplesB)

g.Expect(errA).ToNot(HaveOccurred())
g.Expect(samplesA).To(HaveLen(1), "queryA[%s]", queryA)
valueA := float64(samplesA[0].Value)

g.Expect(errB).ToNot(HaveOccurred())
g.Expect(samplesB).To(HaveLen(1), "queryB[%s]", queryB)
valueB := float64(samplesB[0].Value)

g.Expect(valueA).To(
Equal(valueB),
"queries returned different values:\nqueryA[%s]=%f\nqueryB[%s]=%f",
queryA, valueA, queryB, valueB,
)
}).
WithPolling(1*time.Second).
WithTimeout(2*time.Minute).
WithOffset(1).
Should(Succeed(), func() string {
return fmt.Sprintf(`queries didn't return congruent values
queryA = [%s]
queryB = [%s],
recent values
%s`, queryA, queryB, failedValues)
})
}
1 change: 1 addition & 0 deletions cnf-tests/testsuites/e2esuite/test_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/dpdk"
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/fec"
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/knmstate"
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/metrics"
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/multinetworkpolicy"
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/ovs_qos"
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/s2i"
Expand Down
11 changes: 10 additions & 1 deletion cnf-tests/testsuites/pkg/utils/reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package utils
import (
"errors"
"os"
"strings"

gkopv1alpha "github.com/gatekeeper/gatekeeper-operator/api/v1alpha1"
sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
Expand Down Expand Up @@ -145,7 +146,15 @@ func NewReporter(reportPath string) (*k8sreporter.KubernetesReporter, error) {

namespaceToLog := func(ns string) bool {
_, found := namespacesToDump[ns]
return found
if found {
return true
}

if strings.HasPrefix(ns, "test-") {
return true
}

return false
}

err := os.Mkdir(reportPath, 0755)
Expand Down
4 changes: 2 additions & 2 deletions cnf-tests/testsuites/validationsuite/cluster/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ var _ = Describe("validation", func() {
err := testclient.Client.Get(context.TODO(), goclient.ObjectKey{Name: "default", Namespace: namespaces.SRIOVOperator}, operatorConfig)
Expect(err).ToNot(HaveOccurred())

if *operatorConfig.Spec.EnableInjector {
if operatorConfig.Spec.EnableInjector {
daemonset, err := testclient.Client.DaemonSets(namespaces.SRIOVOperator).Get(context.Background(), "network-resources-injector", metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
Expect(daemonset.Status.DesiredNumberScheduled).To(Equal(daemonset.Status.NumberReady))
Expand All @@ -200,7 +200,7 @@ var _ = Describe("validation", func() {
err := testclient.Client.Get(context.TODO(), goclient.ObjectKey{Name: "default", Namespace: namespaces.SRIOVOperator}, operatorConfig)
Expect(err).ToNot(HaveOccurred())

if *operatorConfig.Spec.EnableOperatorWebhook {
if operatorConfig.Spec.EnableOperatorWebhook {
daemonset, err := testclient.Client.DaemonSets(namespaces.SRIOVOperator).Get(context.Background(), "operator-webhook", metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
Expect(daemonset.Status.DesiredNumberScheduled).To(Equal(daemonset.Status.NumberReady))
Expand Down
24 changes: 13 additions & 11 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ module github.com/openshift-kni/cnf-features-deploy
// - openshift-ci/Dockerfile*
// - ztp/resource-generator/Containerfile
// - ztp/tools/pgt2acmpg/go.mod
go 1.22
go 1.22.4

toolchain go1.22.7

require (
github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f
Expand All @@ -29,14 +31,14 @@ require (
github.com/open-policy-agent/gatekeeper/v3 v3.13.0
github.com/openshift-kni/k8sreporter v1.0.5
github.com/openshift-psap/special-resource-operator v0.0.0-00010101000000-000000000000
github.com/openshift/api v0.0.0-20230807121159-a81c3efc8824
github.com/openshift/api v0.0.0-20230807132801-600991d550ac
github.com/openshift/client-go v0.0.0-20230807132528-be5346fb33cb
github.com/openshift/cluster-nfd-operator v0.0.0-00010101000000-000000000000
github.com/openshift/cluster-node-tuning-operator v0.0.0-00010101000000-000000000000
github.com/openshift/machine-config-operator v0.0.1-0.20230807154212-886c5c3fc7a9
github.com/openshift/machine-config-operator v0.0.1-0.20231024085435-7e1fb719c1ba
github.com/openshift/ptp-operator v0.0.0-00010101000000-000000000000
github.com/stretchr/testify v1.8.4
golang.org/x/sys v0.16.0
golang.org/x/sys v0.20.0
gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.28.3
k8s.io/apiextensions-apiserver v0.28.3
Expand Down Expand Up @@ -105,12 +107,12 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/openshift/custom-resource-status v1.1.3-0.20220503160415-f2fdb4999d87 // indirect
github.com/openshift/library-go v0.0.0-20230803043003-e1dfb9bf12bb // indirect
github.com/openshift/library-go v0.0.0-20231020125025-211b32f1a1f2 // indirect
github.com/operator-framework/api v0.10.7 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.57.0 // indirect
github.com/prometheus-operator/prometheus-operator/pkg/client v0.57.0 // indirect
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.68.0 // indirect
github.com/prometheus-operator/prometheus-operator/pkg/client v0.68.0 // indirect
github.com/prometheus/client_golang v1.17.0 // indirect
github.com/prometheus/client_model v0.5.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
Expand All @@ -126,12 +128,12 @@ require (
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.25.0 // indirect
go4.org v0.0.0-20200104003542-c7e774b10ea0 // indirect
golang.org/x/crypto v0.17.0 // indirect
golang.org/x/crypto v0.21.0 // indirect
golang.org/x/exp v0.0.0-20231006140011-7918f672742d // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/net v0.23.0 // indirect
golang.org/x/oauth2 v0.13.0 // indirect
golang.org/x/sync v0.5.0 // indirect
golang.org/x/term v0.15.0 // indirect
golang.org/x/term v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/tools v0.16.1 // indirect
Expand Down Expand Up @@ -200,7 +202,7 @@ replace (

// Test deps
replace (
github.com/k8snetworkplumbingwg/sriov-network-operator => github.com/openshift/sriov-network-operator v0.0.0-20240125124104-58986501f2b4 // release-4.16
github.com/k8snetworkplumbingwg/sriov-network-operator => github.com/openshift/sriov-network-operator v0.0.0-20241030164825-c0e75feba48a // release-4.16
github.com/nmstate/kubernetes-nmstate/api => github.com/openshift/kubernetes-nmstate/api v0.0.0-20240726065608-fbf9eb6f75e6
github.com/openshift-psap/special-resource-operator => github.com/openshift/special-resource-operator v0.0.0-20211202035230-4c86f99c426b // release-4.10
github.com/openshift/cluster-nfd-operator => github.com/openshift/cluster-nfd-operator v0.0.0-20240125121050-830c889e311e // release-4.9
Expand Down
Loading