Skip to content

Commit

Permalink
cnf-tests: Compare Multus and SR-IOV metrics
Browse files Browse the repository at this point in the history
Statistics that relate to Multus interfaces can be collected by
joining network-metrics-daemon [1] and cAdvisor [2] metrics (see [3]).
The same information, for kernel netdevice SR-IOV interfaces, can be
collected via the `sriov-network-metrics-exporter` [4], which leverages
the Physical Function to get statistics about the Virtual Functions.

The proposed test case verifies that both sources produce congruent values.

Only TX statistics are verified, as receiving ones might be spoiled by
noise traffic on the wire (e.g. other nodes sending DHCP broadcast
requests).

[1] https://github.com/openshift/network-metrics-daemon
[2] https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md
[3] https://docs.openshift.com/container-platform/4.16/networking/associating-secondary-interfaces-metrics-to-network-attachments.html#cnf-associating-secondary-interfaces-metrics-with-network-name_secondary-interfaces-metrics
[4] https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Sep 10, 2024
1 parent 00f4750 commit 7a22067
Show file tree
Hide file tree
Showing 3 changed files with 267 additions and 1 deletion.
256 changes: 256 additions & 0 deletions cnf-tests/testsuites/e2esuite/metrics/sriovnetworkmetricsexporter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
package metrics

import (
"context"
"encoding/json"
"fmt"
"net/url"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

sriovtestclient "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/client"
sriovcluster "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/cluster"
sriovnamespaces "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/namespaces"
sriovnetwork "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/network"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/client"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/discovery"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/images"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/namespaces"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/networks"
"github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/pkg/pods"

"github.com/prometheus/common/model"
)

// testNamespace is the namespace in which this suite creates its test pods
// and SR-IOV network attachment.
const testNamespace string = "test-sriov-metrics"

// sriovclient is the SR-IOV operator test client shared by every helper in
// this file. It is nil until init runs.
var sriovclient *sriovtestclient.ClientSet

// init builds the shared SR-IOV test client. The empty string argument is
// passed straight to sriovtestclient.New.
// NOTE(review): presumably "" selects the default kubeconfig — confirm against
// the sriov-network-operator test client.
func init() {
sriovclient = sriovtestclient.New("")
}

// This spec checks that the TX counters exposed for a secondary (Multus)
// interface through the container_network_* metrics joined with
// pod_network_name_info match the sriov_vf_* counters joined with
// sriov_kubepoddevice for the same pod.
var _ = Describe("[sriov] SR-IOV Network Metrics Exporter", func() {

	var sriovCapableNodes *sriovcluster.EnabledNodes

	BeforeEach(func() {
		if discovery.Enabled() {
			Skip("Discovery mode not supported")
		}

		// The feature gate is restored to its previous value when the spec ends.
		restoreFeatureGates := enableMetricsExporterFeatureGate()
		DeferCleanup(restoreFeatureGates)

		By("Adding monitoring label to " + namespaces.SRIOVOperator)
		err := sriovnamespaces.AddLabel(sriovclient, context.Background(), namespaces.SRIOVOperator, "openshift.io/cluster-monitoring", "true")
		Expect(err).ToNot(HaveOccurred())

		By("Clean SRIOV policies and networks")
		networks.CleanSriov(sriovclient)

		By("Discover SRIOV devices")
		sriovCapableNodes, err = sriovcluster.DiscoverSriov(sriovclient, namespaces.SRIOVOperator)
		Expect(err).ToNot(HaveOccurred())

		err = namespaces.Create(testNamespace, client.Client)
		Expect(err).ToNot(HaveOccurred())
		namespaces.CleanPods(testNamespace, client.Client)
	})

	It("should provide the same metrics as network-metrics-daemon", func() {
		// Resource name shared by the node policy and the network that consumes it.
		const resourceName = "testsriovmetricsresource"

		testNode, testDevice, err := sriovCapableNodes.FindOneSriovNodeAndDevice()
		Expect(err).ToNot(HaveOccurred())
		By("Using device " + testDevice.Name + " on node " + testNode)

		sriovNetworkNodePolicy, err := sriovnetwork.CreateSriovPolicy(
			sriovclient, "test-metrics-", namespaces.SRIOVOperator,
			testDevice.Name, testNode, 8,
			resourceName, "netdevice",
		)
		Expect(err).ToNot(HaveOccurred())
		DeferCleanup(sriovclient.Delete, context.Background(), sriovNetworkNodePolicy)

		ipam := `{ "type": "host-local", "subnet": "192.0.2.0/24" }`
		err = sriovnetwork.CreateSriovNetwork(sriovclient, testDevice, "test-metrics-network",
			testNamespace, namespaces.SRIOVOperator, resourceName, ipam)
		Expect(err).ToNot(HaveOccurred())

		// The helper returns the client pod first and the server pod second.
		clientPod, serverPod := makeClientAndServerNetcatPod()

		// Do not verify pairs
		//    "container_network_receive_packets_total":  "sriov_vf_rx_packets",
		//    "container_network_receive_bytes_total":    "sriov_vf_rx_bytes",
		// because there might be traffic on the wire that disturbs the counters.
		// An example is a DHCP traffic that other nodes are producing, e.g. (tcpdump):
		//
		// 13:28:00.442893 04:3f:72:fe:d1:d1 > ff:ff:ff:ff:ff:ff, ethertype IPv4 (0x0800), length 327: 0.0.0.0.68 > 255.255.255.255.67: BOOTP/DHCP, Request from 04:3f:72:fe:d1:d1, length 285
		metricsToMatch := map[string]string{
			"container_network_transmit_packets_total": "sriov_vf_tx_packets",
			"container_network_transmit_bytes_total":   "sriov_vf_tx_bytes",
		}
		containerQuery := `%s + on(namespace,pod,interface) group_left(network_name) (pod_network_name_info{interface="net1",pod="%s"})`
		sriovQuery := `%s * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice{pod="%s"}`

		for containerMetricName, sriovMetricName := range metricsToMatch {
			By(fmt.Sprintf("verifying metrics %s == %s", containerMetricName, sriovMetricName))
			assertPromQLHasTheSameResult(
				fmt.Sprintf(containerQuery, containerMetricName, clientPod.Name),
				fmt.Sprintf(sriovQuery, sriovMetricName, clientPod.Name),
			)

			assertPromQLHasTheSameResult(
				fmt.Sprintf(containerQuery, containerMetricName, serverPod.Name),
				fmt.Sprintf(sriovQuery, sriovMetricName, serverPod.Name),
			)
		}
	})
})

// makeClientAndServerNetcatPod creates and starts two privileged netcat pods
// in testNamespace, both attached to "test-metrics-network" with a static IP:
// a server listening on port 5000 at 192.0.2.101 and a client at 192.0.2.102
// that connects to it. The server is started before the client is created.
//
// It returns the client pod first and the server pod second. Any creation
// failure fails the running spec.
func makeClientAndServerNetcatPod() (*corev1.Pod, *corev1.Pod) {
	// defineNetcatPod builds, without creating it, a pod carrying one extra
	// privileged container that runs the given command.
	defineNetcatPod := func(namePrefix, networksJSON, containerName string, command []string) *corev1.Pod {
		pod := pods.DefinePod(testNamespace)
		pod.GenerateName = namePrefix
		pod = pods.RedefinePodWithNetwork(pod, networksJSON)
		pod.Spec.Containers = append(pod.Spec.Containers, corev1.Container{
			Name:            containerName,
			Image:           images.For(images.TestUtils),
			Command:         command,
			SecurityContext: &corev1.SecurityContext{Privileged: ptr.To(true)},
		})
		return pod
	}

	server, err := pods.CreateAndStart(defineNetcatPod(
		"testpod-nc-server-",
		`[{"name": "test-metrics-network","ips":["192.0.2.101/24"]}]`,
		"netcat-tcp-server",
		[]string{"nc", "-vv", "--keep-open", "--listen", "5000"},
	))
	Expect(err).ToNot(HaveOccurred())

	client, err := pods.CreateAndStart(defineNetcatPod(
		"testpod-nc-client-",
		`[{"name": "test-metrics-network","ips":["192.0.2.102/24"]}]`,
		"netcat-tcp-client",
		makeNetcatClientCommand("192.0.2.101 5000"),
	))
	Expect(err).ToNot(HaveOccurred())

	return client, server
}

// makeNetcatClientCommand returns the container command for the netcat
// client: a `bash -xec` script that sleeps 10 seconds, pipes the output of
// `printf %01000d 1` (the number 1 zero-padded to 1000 characters) into
// `nc -w 1 <targetIpAddress>`, and then sleeps forever.
//
// targetIpAddress is inserted verbatim after `nc -w 1`, so it carries
// everything netcat needs to locate the peer (the caller in this file passes
// "<ip> <port>").
func makeNetcatClientCommand(targetIpAddress string) []string {
	// "%%" yields a literal "%" so that the shell's printf receives %01000d.
	const scriptFormat = "\nsleep 10;\nprintf %%01000d 1 | nc -w 1 %s;\nsleep inf\n"

	return []string{"bash", "-xec", fmt.Sprintf(scriptFormat, targetIpAddress)}
}

// runPromQLQuery evaluates query as an instant PromQL query and returns the
// resulting vector.
//
// It picks the first pod (in any namespace) labelled
// app.kubernetes.io/component=prometheus and runs curl inside it against
// localhost:9090/api/v1/query. The running spec fails if no such pod exists,
// if the command fails, if the response is not valid JSON, or if the response
// status is not "success". Failures are reported at the caller's location.
func runPromQLQuery(query string) model.Vector {
	prometheusPods, err := client.Client.Pods("").List(context.Background(), metav1.ListOptions{
		LabelSelector: "app.kubernetes.io/component=prometheus",
	})
	ExpectWithOffset(1, err).ToNot(HaveOccurred())
	ExpectWithOffset(1, prometheusPods.Items).ToNot(HaveLen(0), "At least one Prometheus operator pod expected")

	prometheusPod := prometheusPods.Items[0]

	// Named queryURL so the net/url package is not shadowed.
	queryURL := fmt.Sprintf("localhost:9090/api/v1/query?%s", (url.Values{"query": []string{query}}).Encode())
	command := []string{"curl", queryURL}
	outputBuffer, err := pods.ExecCommand(client.Client, prometheusPod, command)
	ExpectWithOffset(1, err).
		ToNot(HaveOccurred(),
			"promQL query failed: [%s/%s] command: [%v]\noutput: %s",
			prometheusPod.Namespace, prometheusPod.Name, command, outputBuffer.String())

	result := struct {
		Status string `json:"status"`
		Data   struct {
			ResultType string       `json:"resultType"`
			Result     model.Vector `json:"result"`
		} `json:"data"`
	}{}

	// Capture the decode error: a malformed response must fail here instead
	// of surfacing later as an unexplained empty status.
	err = json.Unmarshal(outputBuffer.Bytes(), &result)
	ExpectWithOffset(1, err).ToNot(HaveOccurred(),
		"can't decode response of [%s]: %s", queryURL, outputBuffer.String())
	ExpectWithOffset(1, result.Status).To(Equal("success"), "cURL for [%s] failed: %s", queryURL, outputBuffer.String())

	return result.Data.Result
}

// enableMetricsExporterFeatureGate makes sure the "metricsExporter" feature
// gate is enabled on the "default" SriovOperatorConfig in the SR-IOV operator
// namespace.
//
// It returns a function that puts the feature gates back as they were found.
// When the gate was already enabled nothing is changed and the returned
// function is a no-op. Any API error fails the running spec.
func enableMetricsExporterFeatureGate() func() {
	current, err := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
	Expect(err).ToNot(HaveOccurred())

	// Reading a nil map yields false, so this guard is safe before the map
	// is initialized below.
	if current.Spec.FeatureGates["metricsExporter"] {
		// The feature is already enabled: nothing to do
		return func() {}
	}

	// Snapshot the gates before touching them so they can be restored later.
	previousGates := make(map[string]bool, len(current.Spec.FeatureGates))
	for gate, enabled := range current.Spec.FeatureGates {
		previousGates[gate] = enabled
	}

	if current.Spec.FeatureGates == nil {
		current.Spec.FeatureGates = make(map[string]bool)
	}

	By("Enabling metricsExporter feature gate")
	current.Spec.FeatureGates["metricsExporter"] = true

	_, err = sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), current, metav1.UpdateOptions{})
	Expect(err).ToNot(HaveOccurred())

	restore := func() {
		By("Resetting feature gate to its previous value")
		// Fetch the object again rather than reusing the one updated above.
		latest, err := sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Get(context.Background(), "default", metav1.GetOptions{})
		Expect(err).ToNot(HaveOccurred())

		latest.Spec.FeatureGates = previousGates
		_, err = sriovclient.SriovOperatorConfigs(namespaces.SRIOVOperator).Update(context.Background(), latest, metav1.UpdateOptions{})
		Expect(err).ToNot(HaveOccurred())
	}
	return restore
}

// assertPromQLHasTheSameResult evaluates both PromQL queries and checks if both return the same value.
// assertPromQLHasTheSameResult evaluates both PromQL queries and checks that
// each returns exactly one sample and that the two sample values are equal.
//
// The check is retried every second for up to two minutes. Every attempt is
// recorded, and the whole history is included in the failure message if the
// values never converge. Failures are reported at the caller's location.
func assertPromQLHasTheSameResult(queryA, queryB string) {
	// One line per attempt is appended below; the header needs its own
	// newline so the first attempt does not end up glued to it.
	failedValues := "A - B\n"

	Eventually(func(g Gomega) {
		samplesA := runPromQLQuery(queryA)
		samplesB := runPromQLQuery(queryB)

		failedValues += fmt.Sprintf("%s %v - %v\n", time.Now().Format(time.StampMilli), samplesA, samplesB)

		// The length assertions guard the index accesses that follow them.
		g.Expect(samplesB).To(HaveLen(1), "queryB[%s]", queryB)
		valueB := float64(samplesB[0].Value)

		g.Expect(samplesA).To(HaveLen(1), "queryA[%s]", queryA)
		valueA := float64(samplesA[0].Value)

		g.Expect(valueA).To(
			Equal(valueB),
			"queries returned different values:\nqueryA[%s]=%f\nqueryB[%s]=%f",
			queryA, valueA, queryB, valueB,
		)
	}).
		WithPolling(1*time.Second).
		WithTimeout(2*time.Minute).
		WithOffset(1).
		Should(Succeed(), func() string {
			// Built lazily so the message contains every recorded attempt.
			return fmt.Sprintf(`queries didn't return congruent values
queryA = [%s]
queryB = [%s],
recent values
%s`, queryA, queryB, failedValues)
		})
}
1 change: 1 addition & 0 deletions cnf-tests/testsuites/e2esuite/test_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/bond" // this is needed otherwise the bond test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/dpdk" // this is needed otherwise the dpdk test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/fec" // this is needed otherwise the fec test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/metrics" // this is needed otherwise the metrics test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/multinetworkpolicy" // this is needed otherwise the multinetworkpolicy test won't be executed'
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/ovs_qos" // this is needed otherwise the ovs_qos test won't be executed
_ "github.com/openshift-kni/cnf-features-deploy/cnf-tests/testsuites/e2esuite/s2i" // this is needed otherwise the dpdk test won't be executed
Expand Down
11 changes: 10 additions & 1 deletion cnf-tests/testsuites/pkg/utils/reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package utils
import (
"errors"
"os"
"strings"

gkopv1alpha "github.com/gatekeeper/gatekeeper-operator/api/v1alpha1"
sriovv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1"
Expand Down Expand Up @@ -145,7 +146,15 @@ func NewReporter(reportPath string) (*k8sreporter.KubernetesReporter, error) {

namespaceToLog := func(ns string) bool {
_, found := namespacesToDump[ns]
return found
if found {
return true
}

if strings.HasPrefix(ns, "test-") {
return true
}

return false
}

err := os.Mkdir(reportPath, 0755)
Expand Down

0 comments on commit 7a22067

Please sign in to comment.