From 257cd883b71ea568f988df0494fcde231a8a70af Mon Sep 17 00:00:00 2001 From: lou-lan Date: Wed, 25 Sep 2024 18:39:41 +0800 Subject: [PATCH] Add RDMA metrics Signed-off-by: lou-lan --- charts/spiderpool/README.md | 1 + .../files/grafana-rdma-cluster.json | 496 ++ .../spiderpool/files/grafana-rdma-node.json | 945 ++++ charts/spiderpool/files/grafana-rdma-pod.json | 4009 +++++++++++++++++ charts/spiderpool/templates/daemonset.yaml | 341 +- .../grafanaDashboardRdmaCluster.yaml | 26 + .../templates/grafanaDashboardRdmaNode.yaml | 26 + .../templates/grafanaDashboardRdmaPod.yaml | 26 + charts/spiderpool/values.yaml | 3 + cmd/spiderpool-agent/cmd/config.go | 2 + cmd/spiderpool-agent/cmd/daemon.go | 6 +- cmd/spiderpool-agent/cmd/metrics_server.go | 9 +- docs/mkdocs.yml | 1 + docs/usage/rdma-metrics-zh-CN.md | 81 + docs/usage/rdma-metrics.md | 80 + go.mod | 2 +- images/spiderpool-agent/Dockerfile | 2 +- images/spiderpool-base/install-others.sh | 1 + .../ippool_manager_suite_test.go | 11 +- pkg/metric/metrics.go | 3 +- pkg/metric/metrics_instance.go | 12 +- pkg/rdmametrics/ethtool/ethtool.go | 27 + pkg/rdmametrics/metrics.go | 628 +++ pkg/rdmametrics/metrics_test.go | 1226 +++++ .../subnet_manager_suite_test.go | 12 +- test/doc/metric.md | 7 +- vendor/k8s.io/utils/exec/testing/fake_exec.go | 277 ++ vendor/modules.txt | 1 + 28 files changed, 8070 insertions(+), 191 deletions(-) create mode 100644 charts/spiderpool/files/grafana-rdma-cluster.json create mode 100644 charts/spiderpool/files/grafana-rdma-node.json create mode 100644 charts/spiderpool/files/grafana-rdma-pod.json create mode 100644 charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml create mode 100644 charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml create mode 100644 charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml create mode 100644 docs/usage/rdma-metrics-zh-CN.md create mode 100644 docs/usage/rdma-metrics.md create mode 100644 pkg/rdmametrics/ethtool/ethtool.go create mode 100644 
pkg/rdmametrics/metrics.go create mode 100644 pkg/rdmametrics/metrics_test.go create mode 100644 vendor/k8s.io/utils/exec/testing/fake_exec.go diff --git a/charts/spiderpool/README.md b/charts/spiderpool/README.md index a95db4f81e..6c9ed700e3 100644 --- a/charts/spiderpool/README.md +++ b/charts/spiderpool/README.md @@ -279,6 +279,7 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ | `spiderpoolAgent.healthChecking.readinessProbe.failureThreshold` | the failure threshold of startup probe for spiderpoolAgent health checking | `3` | | `spiderpoolAgent.healthChecking.readinessProbe.periodSeconds` | the period seconds of startup probe for spiderpoolAgent health checking | `10` | | `spiderpoolAgent.prometheus.enabled` | enable spiderpool agent to collect metrics | `false` | +| `spiderpoolAgent.prometheus.enabledRdmaMetric` | enable spiderpool agent to collect RDMA metrics | `false` | | `spiderpoolAgent.prometheus.enabledDebugMetric` | enable spiderpool agent to collect debug level metrics | `false` | | `spiderpoolAgent.prometheus.port` | the metrics port of spiderpool agent | `5711` | | `spiderpoolAgent.prometheus.serviceMonitor.install` | install serviceMonitor for spiderpool agent. 
This requires the prometheus CRDs to be available | `false` | diff --git a/charts/spiderpool/files/grafana-rdma-cluster.json b/charts/spiderpool/files/grafana-rdma-cluster.json new file mode 100644 index 0000000000..f786ea619c --- /dev/null +++ b/charts/spiderpool/files/grafana-rdma-cluster.json @@ -0,0 +1,496 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 4, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "husBV6iNz" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "husBV6iNz" + }, + "editorMode": "code", + "expr": "count(count(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}) by (pod_name))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "RDMA Pod Count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (node_name) (rate(rdma_rx_vport_rdma_unicast_bytes_total{}[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + 
} + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (node_name) (rate(rdma_tx_vport_rdma_unicast_bytes_total{}[5m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "editorMode": "code", + "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth rate | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth rate | Write", + "type": "timeseries" + } + ], + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + 
"list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 2, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Spiderpool RDMA Dashboard | Cluster", + "uid": "2ZOHs2ZHk", + "version": 16, + "weekStart": "" +} \ No newline at end of file diff --git a/charts/spiderpool/files/grafana-rdma-node.json b/charts/spiderpool/files/grafana-rdma-node.json new file mode 100644 index 0000000000..aeda6315a0 --- /dev/null +++ b/charts/spiderpool/files/grafana-rdma-node.json @@ -0,0 +1,945 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 5, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 11, + "panels": [], + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (rdma_parent_name) (rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]))", + "legendFormat": "{{net_dev_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Node bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (rdma_parent_name) (rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]))", + "legendFormat": "{{net_dev_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Node bandwidth | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m])\n)\n/ sum by (rdma_parent_name) (rdma_vport_speed_mbps_total{node_name=~\"$node\", is_root=\"true\"} * 1000000 / 8) * 100", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } 
+ ], + "title": "Bandwidth rate | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m])\n)\n/ sum by (rdma_parent_name) (rdma_vport_speed_mbps_total{node_name=~\"$node\", is_root=\"true\"} * 1000000 / 8) * 100", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth rate | Write", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 13, + "panels": [], + "title": "Host RDMA Device", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "ibp13s0" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", + "legendFormat": "{{net_dev_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Host RDMA Devices | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "ibp13s0" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", + "legendFormat": "{{net_dev_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Host RDMA Devices | Write", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 27, + "panels": [], + "title": "Pod RDMA Device", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "ibp13s0" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])", + "legendFormat": "{{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod RDMA Devices | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", 
+ "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "ibp13s0" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])", + "legendFormat": "{{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod RDMA Devices | Write", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 2, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "10-20-1-50", + "value": "10-20-1-50" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": 
"label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "node", + "options": [], + "query": { + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Spiderpool RDMA Dashboard | Node", + "uid": "A0T4f2ZNz", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/charts/spiderpool/files/grafana-rdma-pod.json b/charts/spiderpool/files/grafana-rdma-pod.json new file mode 100644 index 0000000000..d7ba00e22b --- /dev/null +++ b/charts/spiderpool/files/grafana-rdma-pod.json @@ -0,0 +1,4009 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 47, + "panels": [], + "title": "Pod Level", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 40, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Throughput | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Throughput | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 62, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])/(rdma_vport_speed_mbps_total{pod_name!=\"\"}*1000000/8)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth rate | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 63, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": 
false, + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])/(rdma_vport_speed_mbps_total{pod_name!=\"\"}*1000000/8)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth rate | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 17 + }, + "id": 42, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_vport_rdma_unicast_packets_total{pod_name!=\"\", pod_namespace!=\"\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - 
{{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_read_unicast_packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 17 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_vport_rdma_multicast_packets_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_read_multicast_packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": 
{ + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 17 + }, + "id": 39, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "read_rdma_multicast_bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 17 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_read_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rx_read_requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 25 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_tx_vport_rdma_unicast_packets_total{pod_name!=\"\", pod_namespace!=\"\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_write_unicast_packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 25 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, 
+ "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(rdma_tx_vport_rdma_multicast_packets_total{pod_namespace!=\"\"}[1m])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_write_multicast_packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Mbits" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 25 + }, + "id": 41, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\"}[1m]) * 8 / 1000000", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": 
"{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "write_rdma_multicast", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of received WRITE requests for the associated Queue Pairs", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 25 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_write_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rx_write_requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": {
+ "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 33 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_req_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "req cqe error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth":
2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 33 + }, + "id": 43, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "irate(rdma_duplicate_request_total{pod_namespace!=\"\"}[1m])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "duplicate request", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 33 +
}, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_resp_remote_access_errors_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_resp_remote_access_errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 33 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": 
"rate(rdma_req_remote_access_errors_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_req_remote_access_errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 41 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_dct_connect_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rx_dct_connect", + "type": "timeseries" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 41 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_atomic_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rx_atomic_requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": 
"none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 41 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_req_remote_invalid_request_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_req_remote_invalid_request", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 41 + }, + "id": 33, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rdma_duplicate_request_total{pod_namespace!=\"\"}", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_duplicate_request", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 49 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", 
+ "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rx_atomic_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rx_atomic_requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 49 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_resp_cqe_flush_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + 
"title": "rdma_resp_cqe_flush_error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 49 + }, + "id": 22, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_req_cqe_flush_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_req_cqe_flush_error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 49 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_resp_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_resp_cqe_error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 57 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rnr_nak_retry_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rnr_nak_retry_err", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 57 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_out_of_sequence_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_out_of_sequence", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 57 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_packet_seq_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + 
"instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_packet_seq_err", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 57 + }, + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_resp_local_length_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_resp_local_length_error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" 
+ }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 65 + }, + "id": 36, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_implied_nak_seq_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_implied_nak_seq_err", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 
5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 65 + }, + "id": 35, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_local_ack_timeout_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_local_ack_timeout_err", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, 
+ "x": 12, + "y": 65 + }, + "id": 32, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_out_of_buffer_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_out_of_buffer", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 65 + }, + "id": 26, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": 
"rate(rdma_req_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_req_cqe_error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 73 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_np_cnp_sent_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_np_cnp_sent", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 73 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_roce_adp_retrans_to_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_roce_adp_retrans_to", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { 
+ "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 73 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_roce_slow_restart_cnps_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{ifname}} - {{pod_namespace}}/{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_roce_slow_restart_cnps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 73 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_np_ecn_marked_roce_packets_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_np_ecn_marked_roce_packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 81 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" 
+ } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rp_cnp_handled_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rp_cnp_handled", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 81 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_roce_slow_restart_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{ifname}} - 
{{pod_namespace}}/{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_roce_slow_restart", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 81 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rp_cnp_ignored_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rp_cnp_ignored", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 81 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_roce_adp_retrans_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{ifname}} - {{pod_namespace}}/{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_roce_adp_retrans", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 89 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_rp_cnp_handled_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{ifname}} - {{pod_namespace}}/{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "rdma_rp_cnp_handled", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 2, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_namespace)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_namespace)", + "refId": 
"StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "rdma-test-gpu-tool-8zrh6", + "value": "rdma-test-gpu-tool-8zrh6" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", + "hide": 0, + "includeAll": false, + "label": "pod", + "multi": false, + "name": "pod", + "options": [], + "query": { + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "10-20-1-50", + "value": "10-20-1-50" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "node", + "options": [], + "query": { + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Spiderpool RDMA Dashboard | Pod", + "uid": "DenUibiNk", + "version": 158, + "weekStart": "" +} \ No newline at end of file diff --git a/charts/spiderpool/templates/daemonset.yaml b/charts/spiderpool/templates/daemonset.yaml index 09b92745e7..3b2469f65b 100644 --- a/charts/spiderpool/templates/daemonset.yaml +++ b/charts/spiderpool/templates/daemonset.yaml @@ -85,18 +85,18 @@ spec: image: {{ include "plugins.image" . 
| quote }} imagePullPolicy: {{ .Values.plugins.image.pullPolicy }} env: - - name: INSTALL_CNI_PLUGINS - value: {{ .Values.plugins.installCNI | quote }} - - name: INSTALL_OVS_PLUGIN - value: {{ .Values.plugins.installOvsCNI | quote }} - - name: INSTALL_RDMA_PLUGIN - value: {{ .Values.plugins.installRdmaCNI | quote }} - - name: INSTALL_SRIOV_PLUGIN - value: {{ .Values.plugins.installSriovCNI | quote }} - - name: INSTALL_IB_SRIOV_PLUGIN - value: {{ .Values.plugins.installibSriovCNI | quote }} - - name: INSTALL_IPOIB_PLUGIN - value: {{ .Values.plugins.installIpoibCNI | quote }} + - name: INSTALL_CNI_PLUGINS + value: {{ .Values.plugins.installCNI | quote }} + - name: INSTALL_OVS_PLUGIN + value: {{ .Values.plugins.installOvsCNI | quote }} + - name: INSTALL_RDMA_PLUGIN + value: {{ .Values.plugins.installRdmaCNI | quote }} + - name: INSTALL_SRIOV_PLUGIN + value: {{ .Values.plugins.installSriovCNI | quote }} + - name: INSTALL_IB_SRIOV_PLUGIN + value: {{ .Values.plugins.installibSriovCNI | quote }} + - name: INSTALL_IPOIB_PLUGIN + value: {{ .Values.plugins.installIpoibCNI | quote }} command: - "/bin/sh" - "entrypoint.sh" @@ -107,195 +107,206 @@ spec: mountPath: /host/opt/cni/bin {{- end }} containers: - - name: {{ .Values.spiderpoolAgent.name | trunc 63 | trimSuffix "-" }} - image: {{ include "spiderpool.spiderpoolAgent.image" . | quote }} - imagePullPolicy: {{ .Values.spiderpoolAgent.image.pullPolicy }} - command: - - {{ .Values.spiderpoolAgent.binName }} - args: - - daemon - - --config-path=/tmp/spiderpool/config-map/conf.yml + - name: {{ .Values.spiderpoolAgent.name | trunc 63 | trimSuffix "-" }} + image: {{ include "spiderpool.spiderpoolAgent.image" . | quote }} + imagePullPolicy: {{ .Values.spiderpoolAgent.image.pullPolicy }} + command: + - {{ .Values.spiderpoolAgent.binName }} + args: + - daemon + - --config-path=/tmp/spiderpool/config-map/conf.yml {{- with .Values.spiderpoolAgent.extraArgs }} {{- toYaml . 
| trim | nindent 8 }} {{- end }} {{- if .Values.spiderpoolAgent.prometheus.enabled }} - ports: - - name: metrics - containerPort: {{ .Values.spiderpoolAgent.prometheus.port }} - protocol: TCP + ports: + - name: metrics + containerPort: {{ .Values.spiderpoolAgent.prometheus.port }} + protocol: TCP {{- end }} {{- if semverCompare ">=1.20-0" .Capabilities.KubeVersion.Version }} - startupProbe: - httpGet: - host: {{ .Values.ipam.enableIPv4 | ternary "127.0.0.1" "::1" | quote }} - path: /v1/runtime/startup - port: {{ .Values.spiderpoolAgent.httpPort }} - scheme: HTTP - failureThreshold: {{ .Values.spiderpoolAgent.healthChecking.startupProbe.failureThreshold }} - periodSeconds: {{ .Values.spiderpoolAgent.healthChecking.startupProbe.periodSeconds }} - successThreshold: 1 + startupProbe: + httpGet: + host: {{ .Values.ipam.enableIPv4 | ternary "127.0.0.1" "::1" | quote }} + path: /v1/runtime/startup + port: {{ .Values.spiderpoolAgent.httpPort }} + scheme: HTTP + failureThreshold: {{ .Values.spiderpoolAgent.healthChecking.startupProbe.failureThreshold }} + periodSeconds: {{ .Values.spiderpoolAgent.healthChecking.startupProbe.periodSeconds }} + successThreshold: 1 {{- end }} - livenessProbe: - httpGet: - host: {{ .Values.ipam.enableIPv4 | ternary "127.0.0.1" "::1" | quote }} - path: /v1/runtime/liveness - port: {{ .Values.spiderpoolAgent.httpPort }} - scheme: HTTP - initialDelaySeconds: 10 - periodSeconds: {{ .Values.spiderpoolAgent.healthChecking.livenessProbe.periodSeconds }} - successThreshold: 1 - failureThreshold: {{ .Values.spiderpoolAgent.healthChecking.livenessProbe.failureThreshold }} - timeoutSeconds: 1 - readinessProbe: - httpGet: - host: {{ .Values.ipam.enableIPv4 | ternary "127.0.0.1" "::1" | quote }} - path: /v1/runtime/readiness - port: {{ .Values.spiderpoolAgent.httpPort }} - scheme: HTTP - periodSeconds: {{ .Values.spiderpoolAgent.healthChecking.readinessProbe.periodSeconds }} - successThreshold: 1 - failureThreshold: {{ 
.Values.spiderpoolAgent.healthChecking.readinessProbe.failureThreshold }} - timeoutSeconds: 1 + livenessProbe: + httpGet: + host: {{ .Values.ipam.enableIPv4 | ternary "127.0.0.1" "::1" | quote }} + path: /v1/runtime/liveness + port: {{ .Values.spiderpoolAgent.httpPort }} + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: {{ .Values.spiderpoolAgent.healthChecking.livenessProbe.periodSeconds }} + successThreshold: 1 + failureThreshold: {{ .Values.spiderpoolAgent.healthChecking.livenessProbe.failureThreshold }} + timeoutSeconds: 1 + readinessProbe: + httpGet: + host: {{ .Values.ipam.enableIPv4 | ternary "127.0.0.1" "::1" | quote }} + path: /v1/runtime/readiness + port: {{ .Values.spiderpoolAgent.httpPort }} + scheme: HTTP + periodSeconds: {{ .Values.spiderpoolAgent.healthChecking.readinessProbe.periodSeconds }} + successThreshold: 1 + failureThreshold: {{ .Values.spiderpoolAgent.healthChecking.readinessProbe.failureThreshold }} + timeoutSeconds: 1 {{- with .Values.spiderpoolAgent.resources }} - resources: - {{- toYaml . | trim | nindent 10 }} + resources: + {{- toYaml . 
| trim | nindent 12 }} {{- end }} - lifecycle: - postStart: - # Move an eventual old existing binary out of the way, we can't delete it as it might be in use right now - exec: - command: - - "/bin/sh" - - "-c" - - | - BIN_LIST="spiderpool coordinator ifacer" - for ITEM in ${BIN_LIST} ; do - rm -f /host/opt/cni/bin/${ITEM}.old || true - ( [ -f "/host/opt/cni/bin/${ITEM}" ] && mv /host/opt/cni/bin/${ITEM} /host/opt/cni/bin/${ITEM}.old ) || true - cp /usr/bin/${ITEM} /host/opt/cni/bin/${ITEM} - rm -f /host/opt/cni/bin/${ITEM}.old &>/dev/null || true - done - env: - - name: SPIDERPOOL_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: SPIDERPOOL_POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: SPIDERPOOL_LOG_LEVEL - value: {{ .Values.spiderpoolAgent.debug.logLevel | quote }} - - name: SPIDERPOOL_ENABLED_METRIC - value: {{ .Values.spiderpoolAgent.prometheus.enabled | quote }} - - name: SPIDERPOOL_ENABLED_DEBUG_METRIC - value: {{ .Values.spiderpoolAgent.prometheus.enabledDebugMetric | quote }} - - name: SPIDERPOOL_METRIC_HTTP_PORT - value: {{ .Values.spiderpoolAgent.prometheus.port | quote }} - - name: SPIDERPOOL_HEALTH_PORT - value: {{ .Values.spiderpoolAgent.httpPort | quote }} - - name: SPIDERPOOL_GOPS_LISTEN_PORT - value: {{ .Values.spiderpoolAgent.debug.gopsPort | quote }} + lifecycle: + postStart: + # Move an eventual old existing binary out of the way, we can't delete it as it might be in use right now + exec: + command: + - "/bin/sh" + - "-c" + - | + BIN_LIST="spiderpool coordinator ifacer" + for ITEM in ${BIN_LIST} ; do + rm -f /host/opt/cni/bin/${ITEM}.old || true + ( [ -f "/host/opt/cni/bin/${ITEM}" ] && mv /host/opt/cni/bin/${ITEM} /host/opt/cni/bin/${ITEM}.old ) || true + cp /usr/bin/${ITEM} /host/opt/cni/bin/${ITEM} + rm -f /host/opt/cni/bin/${ITEM}.old &>/dev/null || true + done + env: + - name: SPIDERPOOL_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SPIDERPOOL_POD_NAMESPACE + 
valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: SPIDERPOOL_LOG_LEVEL + value: {{ .Values.spiderpoolAgent.debug.logLevel | quote }} + - name: SPIDERPOOL_ENABLED_METRIC + value: {{ .Values.spiderpoolAgent.prometheus.enabled | quote }} + - name: SPIDERPOOL_ENABLED_DEBUG_METRIC + value: {{ .Values.spiderpoolAgent.prometheus.enabledDebugMetric | quote }} + - name: SPIDERPOOL_ENABLED_RDMA_METRIC + value: {{ .Values.spiderpoolAgent.prometheus.enabledRdmaMetric | quote }} + - name: SPIDERPOOL_METRIC_HTTP_PORT + value: {{ .Values.spiderpoolAgent.prometheus.port | quote }} + - name: SPIDERPOOL_HEALTH_PORT + value: {{ .Values.spiderpoolAgent.httpPort | quote }} + - name: SPIDERPOOL_GOPS_LISTEN_PORT + value: {{ .Values.spiderpoolAgent.debug.gopsPort | quote }} {{- if .Values.multus.multusCNI.defaultCniCRName }} - - name: MULTUS_CLUSTER_NETWORK - value: {{ .Release.Namespace }}/{{ .Values.multus.multusCNI.defaultCniCRName }} + - name: MULTUS_CLUSTER_NETWORK + value: {{ .Release.Namespace }}/{{ .Values.multus.multusCNI.defaultCniCRName }} {{ end }} {{- with .Values.spiderpoolAgent.extraEnv }} {{- toYaml . | nindent 8 }} {{- end }} {{- if or .Values.spiderpoolAgent.tuneSysctlConfig .Values.spiderpoolAgent.securityContext }} - securityContext: - privileged: true + securityContext: + privileged: true {{- with .Values.spiderpoolAgent.securityContext }} {{- toYaml . 
| nindent 8 }} {{- end }} {{- end }} - volumeMounts: - - name: config-path - mountPath: /tmp/spiderpool/config-map - readOnly: true - - name: cni-bin-path - mountPath: /host{{ .Values.global.cniBinHostPath }} - - name: ipam-unix-socket-dir - mountPath: {{ dir .Values.global.ipamUNIXSocketHostPath }} + volumeMounts: + {{- if .Values.spiderpoolAgent.prometheus.enabledRdmaMetric }} + - name: host-ns + mountPath: /var/run/netns + {{- end }} + - name: config-path + mountPath: /tmp/spiderpool/config-map + readOnly: true + - name: cni-bin-path + mountPath: /host{{ .Values.global.cniBinHostPath }} + - name: ipam-unix-socket-dir + mountPath: {{ dir .Values.global.ipamUNIXSocketHostPath }} {{- if .Values.multus.multusCNI.uninstall }} - - name: cni - mountPath: /host/etc/cni/net.d + - name: cni + mountPath: /host/etc/cni/net.d {{- end }} {{- if .Values.spiderpoolAgent.extraVolumes }} {{- include "tplvalues.render" ( dict "value" .Values.spiderpoolAgent.extraVolumeMounts "context" $ ) | nindent 8 }} {{- end }} {{- if .Values.multus.multusCNI.install }} - - name: multus-cni - imagePullPolicy: {{ .Values.multus.multusCNI.image.pullPolicy }} - image: {{ include "spiderpool.multus.image" . 
| quote }} - command: - - "/bin/sh" - - "-c" - - | - ITEM="multus" - rm -f /host/opt/cni/bin/${ITEM}.old || true - ( [ -f "/host/opt/cni/bin/${ITEM}" ] && mv /host/opt/cni/bin/${ITEM} /host/opt/cni/bin/${ITEM}.old ) || true - cp /usr/src/multus-cni/bin/${ITEM} /host/opt/cni/bin/${ITEM} - rm -f /host/opt/cni/bin/${ITEM}.old &>/dev/null || true - ./entrypoint.sh --multus-conf-file=/tmp/multus-conf/00-multus.conf \ - --cni-version=0.3.1 - securityContext: - privileged: true - {{- if .Values.multus.multusCNI.uninstall }} - lifecycle: - preStop: - exec: - command: - - "/bin/sh" - - "-c" - - | - rm -f /host/opt/cni/bin/multus || true - rm -rf /host/etc/cni/net.d/multus.d || true - rm -f /host/etc/cni/net.d/00-multus.conf || true + - name: multus-cni + imagePullPolicy: {{ .Values.multus.multusCNI.image.pullPolicy }} + image: {{ include "spiderpool.multus.image" . | quote }} + command: + - "/bin/sh" + - "-c" + - | + ITEM="multus" + rm -f /host/opt/cni/bin/${ITEM}.old || true + ( [ -f "/host/opt/cni/bin/${ITEM}" ] && mv /host/opt/cni/bin/${ITEM} /host/opt/cni/bin/${ITEM}.old ) || true + cp /usr/src/multus-cni/bin/${ITEM} /host/opt/cni/bin/${ITEM} + rm -f /host/opt/cni/bin/${ITEM}.old &>/dev/null || true + ./entrypoint.sh --multus-conf-file=/tmp/multus-conf/00-multus.conf \ + --cni-version=0.3.1 + securityContext: + privileged: true + {{- if .Values.multus.multusCNI.uninstall }} + lifecycle: + preStop: + exec: + command: + - "/bin/sh" + - "-c" + - | + rm -f /host/opt/cni/bin/multus || true + rm -rf /host/etc/cni/net.d/multus.d || true + rm -f /host/etc/cni/net.d/00-multus.conf || true {{- end }} - volumeMounts: - - name: cni - mountPath: /host/etc/cni/net.d - - name: cni-bin-path - mountPath: /host/opt/cni/bin - mountPropagation: Bidirectional - - name: multus-cfg - mountPath: /tmp/multus-conf + volumeMounts: + - name: cni + mountPath: /host/etc/cni/net.d + - name: cni-bin-path + mountPath: /host/opt/cni/bin + mountPropagation: Bidirectional + - name: multus-cfg + mountPath: 
/tmp/multus-conf {{- if .Values.multus.multusCNI.extraVolumes }} {{- include "tplvalues.render" ( dict "value" .Values.multus.multusCNI.extraVolumeMounts "context" $ ) | nindent 12 }} {{- end }} {{- end }} volumes: # To read the configuration from the config map - - name: config-path - configMap: - defaultMode: 0400 - name: {{ .Values.global.configName }} - - name: cni-bin-path - hostPath: - path: {{ .Values.global.cniBinHostPath }} - type: DirectoryOrCreate - # To create unix socket dir in the host - - name: ipam-unix-socket-dir - hostPath: - path: {{ dir .Values.global.ipamUNIXSocketHostPath }} - type: DirectoryOrCreate - # multus + - name: config-path + configMap: + defaultMode: 0400 + name: {{ .Values.global.configName }} + - name: cni-bin-path + hostPath: + path: {{ .Values.global.cniBinHostPath }} + type: DirectoryOrCreate + # To create unix socket dir in the host + - name: ipam-unix-socket-dir + hostPath: + path: {{ dir .Values.global.ipamUNIXSocketHostPath }} + type: DirectoryOrCreate + # multus + {{- if .Values.spiderpoolAgent.prometheus.enabledRdmaMetric }} + - name: host-ns + hostPath: + path: /var/run/netns + {{- end }} {{- if .Values.multus.multusCNI.install }} - - name: cni - hostPath: - path: /etc/cni/net.d - - name: multus-cfg - configMap: - name: {{ .Values.multus.multusCNI.name | trunc 63 | trimSuffix "-" }} - items: - - key: cni-conf.json - path: 00-multus.conf + - name: cni + hostPath: + path: /etc/cni/net.d + - name: multus-cfg + configMap: + name: {{ .Values.multus.multusCNI.name | trunc 63 | trimSuffix "-" }} + items: + - key: cni-conf.json + path: 00-multus.conf {{- end }} {{- if .Values.spiderpoolAgent.extraVolumeMounts }} {{- include "tplvalues.render" ( dict "value" .Values.spiderpoolAgent.extraVolumeMounts "context" $ ) | nindent 6 }} {{- end }} {{- if .Values.multus.multusCNI.extraVolumeMounts }} {{- include "tplvalues.render" ( dict "value" .Values.multus.multusCNI.extraVolumeMounts "context" $ ) | nindent 8 }} - {{- end }} + {{- end 
}} \ No newline at end of file diff --git a/charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml b/charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml new file mode 100644 index 0000000000..f07c839bd0 --- /dev/null +++ b/charts/spiderpool/templates/grafanaDashboardRdmaCluster.yaml @@ -0,0 +1,26 @@ +{{- if .Values.grafanaDashboard.install -}} +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + name: {{ default "spiderpool" .Values.global.nameOverride }}-rdma-cluster + namespace: {{ default .Release.Namespace .Values.grafanaDashboard.namespace }} + labels: + {{- if .Values.global.commonLabels }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonLabels "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.labels }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.labels "context" $ ) | nindent 4 }} + {{- end }} + {{- if or .Values.global.commonAnnotations .Values.grafanaDashboard.annotations }} + annotations: + {{- if .Values.global.commonAnnotations }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.annotations }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.annotations "context" $ ) | nindent 4 }} + {{- end }} + {{- end }} +spec: + json: |- + {{ .Files.Get "files/grafana-rdma-cluster.json" | toJson | indent 4 }} +{{- end }} \ No newline at end of file diff --git a/charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml b/charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml new file mode 100644 index 0000000000..85ae18ec30 --- /dev/null +++ b/charts/spiderpool/templates/grafanaDashboardRdmaNode.yaml @@ -0,0 +1,26 @@ +{{- if .Values.grafanaDashboard.install -}} +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + name: {{ default "spiderpool" .Values.global.nameOverride }}-rdma-node + namespace: {{ 
default .Release.Namespace .Values.grafanaDashboard.namespace }} + labels: + {{- if .Values.global.commonLabels }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonLabels "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.labels }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.labels "context" $ ) | nindent 4 }} + {{- end }} + {{- if or .Values.global.commonAnnotations .Values.grafanaDashboard.annotations }} + annotations: + {{- if .Values.global.commonAnnotations }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.annotations }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.annotations "context" $ ) | nindent 4 }} + {{- end }} + {{- end }} +spec: + json: |- + {{ .Files.Get "files/grafana-rdma-node.json" | toJson | indent 4 }} +{{- end }} diff --git a/charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml b/charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml new file mode 100644 index 0000000000..5efceaa699 --- /dev/null +++ b/charts/spiderpool/templates/grafanaDashboardRdmaPod.yaml @@ -0,0 +1,26 @@ +{{- if .Values.grafanaDashboard.install -}} +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + name: {{ default "spiderpool" .Values.global.nameOverride }}-rdma-pod + namespace: {{ default .Release.Namespace .Values.grafanaDashboard.namespace }} + labels: + {{- if .Values.global.commonLabels }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonLabels "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.labels }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.labels "context" $ ) | nindent 4 }} + {{- end }} + {{- if or .Values.global.commonAnnotations .Values.grafanaDashboard.annotations }} + annotations: + {{- if .Values.global.commonAnnotations }} + {{- include 
"tplvalues.render" ( dict "value" .Values.global.commonAnnotations "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.grafanaDashboard.annotations }} + {{- include "tplvalues.render" ( dict "value" .Values.grafanaDashboard.annotations "context" $ ) | nindent 4 }} + {{- end }} + {{- end }} +spec: + json: |- + {{ .Files.Get "files/grafana-rdma-pod.json" | toJson | indent 4 }} +{{- end }} diff --git a/charts/spiderpool/values.yaml b/charts/spiderpool/values.yaml index 3bea1d4d69..e96171daf3 100644 --- a/charts/spiderpool/values.yaml +++ b/charts/spiderpool/values.yaml @@ -468,6 +468,9 @@ spiderpoolAgent: ## @param spiderpoolAgent.prometheus.enabled enable spiderpool agent to collect metrics enabled: false + ## @param spiderpoolAgent.prometheus.enabledRdmaMetric enable spiderpool agent to collect RDMA metrics + enabledRdmaMetric: false + ## @param spiderpoolAgent.prometheus.enabledDebugMetric enable spiderpool agent to collect debug level metrics enabledDebugMetric: false diff --git a/cmd/spiderpool-agent/cmd/config.go b/cmd/spiderpool-agent/cmd/config.go index 6cab225820..45d981ae86 100644 --- a/cmd/spiderpool-agent/cmd/config.go +++ b/cmd/spiderpool-agent/cmd/config.go @@ -52,6 +52,7 @@ var envInfo = []envConf{ {"SPIDERPOOL_LOG_LEVEL", logutils.LogInfoLevelStr, true, &agentContext.Cfg.LogLevel, nil, nil}, {"SPIDERPOOL_ENABLED_METRIC", "false", false, nil, &agentContext.Cfg.EnableMetric, nil}, + {"SPIDERPOOL_ENABLED_RDMA_METRIC", "false", false, nil, &agentContext.Cfg.EnableRDMAMetric, nil}, {"SPIDERPOOL_ENABLED_DEBUG_METRIC", "false", false, nil, &agentContext.Cfg.EnableDebugLevelMetric, nil}, {"SPIDERPOOL_POD_NAMESPACE", "", true, &agentContext.Cfg.AgentPodNamespace, nil, nil}, {"SPIDERPOOL_POD_NAME", "", true, &agentContext.Cfg.AgentPodName, nil, nil}, @@ -78,6 +79,7 @@ type Config struct { // env LogLevel string EnableMetric bool + EnableRDMAMetric bool EnableDebugLevelMetric bool AgentPodNamespace string AgentPodName string diff --git 
a/cmd/spiderpool-agent/cmd/daemon.go b/cmd/spiderpool-agent/cmd/daemon.go index ad3d3b63f3..f98917338e 100644 --- a/cmd/spiderpool-agent/cmd/daemon.go +++ b/cmd/spiderpool-agent/cmd/daemon.go @@ -140,9 +140,6 @@ func DaemonMain() { logger.Fatal(err.Error()) } - logger.Info("Begin to initialize spiderpool-agent metrics HTTP server") - initAgentMetricsServer(agentContext.InnerCtx) - logger.Info("Begin to initialize spiderpool-agent runtime manager") mgr, err := newCRDManager() if nil != err { @@ -150,6 +147,9 @@ func DaemonMain() { } agentContext.CRDManager = mgr + logger.Info("Begin to initialize spiderpool-agent metrics HTTP server") + initAgentMetricsServer(agentContext.InnerCtx) + // init managers... initAgentServiceManagers(agentContext.InnerCtx) diff --git a/cmd/spiderpool-agent/cmd/metrics_server.go b/cmd/spiderpool-agent/cmd/metrics_server.go index 183563d7d5..9d263baa17 100644 --- a/cmd/spiderpool-agent/cmd/metrics_server.go +++ b/cmd/spiderpool-agent/cmd/metrics_server.go @@ -14,13 +14,16 @@ import ( // initAgentMetricsServer will start an opentelemetry http server for spiderpool agent. 
func initAgentMetricsServer(ctx context.Context) { - metricController, err := metric.InitMetric(ctx, constant.SpiderpoolAgent, - agentContext.Cfg.EnableMetric, agentContext.Cfg.EnableDebugLevelMetric) + metricController, err := metric.InitMetric(ctx, + constant.SpiderpoolAgent, + agentContext.Cfg.EnableMetric, + agentContext.Cfg.EnableDebugLevelMetric, + ) if nil != err { logger.Fatal(err.Error()) } - err = metric.InitSpiderpoolAgentMetrics(ctx) + err = metric.InitSpiderpoolAgentMetrics(ctx, agentContext.Cfg.EnableRDMAMetric, agentContext.CRDManager.GetClient()) if nil != err { logger.Fatal(err.Error()) } diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 9cdad44ecd..2d9a06fc64 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -103,6 +103,7 @@ nav: - Access Service for Underlay CNI: usage/underlay_cni_service.md - Bandwidth Manage for IPVlan CNI: usage/ipvlan_bandwidth.md - Kubevirt: usage/kubevirt.md + - Enable RDMA metrics: usage/rdma-metrics.md - FAQ: usage/faq.md - Reference: - Annotations: reference/annotation.md diff --git a/docs/usage/rdma-metrics-zh-CN.md b/docs/usage/rdma-metrics-zh-CN.md new file mode 100644 index 0000000000..048300c8b3 --- /dev/null +++ b/docs/usage/rdma-metrics-zh-CN.md @@ -0,0 +1,81 @@ +# RDMA 指标 + +RDMA 是一种高效的网络通信技术,允许一台计算机直接访问另一台计算机的内存,无需操作系统介入,从而减少延迟,提高数据传输速度和效率。RDMA 支持高速数据传输,减少 CPU 负载,非常适用于需要高速网络通信的场景。 + +在 Kubernetes 集群中,spiderpool CNI 支持 RoCE 和 IB 2 种 RDMA 场景,Pod 可以通过共享和独占的方式使用 RDMA 网卡,用户可以根据需求选择合适的方式来使用 RDMA 网卡。 + +spiderpool 同时提供了 RDMA exporter 功能和 grafana 监控面板,通过实时监控 Pod/Node RDMA 网络的性能,包括吞吐量、延迟、丢包率等,可以及时发现问题并采取措施解决,提高网络的可靠性和性能。 + +## RDMA 指标的常见场景 + +1. **性能监控**: + - **吞吐量**: 测量通过网络传输的数据量。 + - **延迟**: 测量数据从源到目的地的传输时间。 + - **丢包率**: 监控传输过程中丢失的数据包数量。 + +2. **错误检测**: + - **传输错误**: 检测数据传输中的错误。 + - **连接失败**: 监控失败的连接尝试和断开连接。 + +3.
**网络健康状况**: + - **拥塞**: 检测网络拥塞和瓶颈。 + +## 如何开启 + +```shell +helm upgrade spiderpool spiderpool/spiderpool --reuse-values --wait --namespace kube-system \ + --set spiderpoolAgent.prometheus.enabled=true \ + --set spiderpoolAgent.prometheus.enabledRdmaMetric=true +``` + +- 通过设置 `--reuse-values` 重用现有的配置 +- 通过设置 `--wait` 等待所有 Pod 运行 +- 通过设置 `--namespace` 指定 Helm 安装的命名空间 +- 通过设置 `--set spiderpoolAgent.prometheus.enabled=true` 启用 Prometheus 监控 +- 通过设置 `--set spiderpoolAgent.prometheus.enabledRdmaMetric=true`,可以启用 RDMA 指标 exporter + + +## 指标列表 + +以下是经过整理后的表格,包含了"指标名称"、"指标类型"、"指标含义"和"备注"四列: + +| 指标名称 | 指标类型 | 指标含义 | 备注 | +|---------------------------------|------|---------------------------------------------|----| +| rx_write_requests | 数量 | 接收到的写请求的数量 | | +| rx_read_requests | 数量 | 接收到的读请求的数量 | | +| rx_atomic_requests | 数量 | 接收到的原子请求的数量 | | +| rx_dct_connect | 数量 | 接收到的 DCT 连接请求的数量 | | +| out_of_buffer | 数量 | 缓冲区不足错误的数量 | | +| out_of_sequence | 数量 | 收到的乱序包数量 | | +| duplicate_request | 数量 | 重复请求的数量 | | +| rnr_nak_retry_err | 数量 | 收到的 RNR NAK 包未超过 QP 重试限制的数量 | | +| packet_seq_err | 数量 | 包序列错误的数量 | | +| implied_nak_seq_err | 数量 | 隐含 NAK 序列错误的数量 | | +| local_ack_timeout_err | 数量 | 发送端 QP 的 ack 计时器过期的次数(适用于 RC, XRC, DCT QPs) | | +| resp_local_length_error | 数量 | 响应者检测到本地长度错误的次数 | | +| resp_cqe_error | 数量 | 响应 CQE 错误的数量 | | +| req_cqe_error | 数量 | 请求者检测到 CQE 完成且带错误的次数 | | +| req_remote_invalid_request | 数量 | 请求者检测到远程无效请求错误的次数 | | +| req_remote_access_errors | 数量 | 请求的远程访问错误的数量 | | +| resp_remote_access_errors | 数量 | 响应的远程访问错误的数量 | | +| resp_cqe_flush_error | 数量 | 响应 CQE 刷新错误的数量 | | +| req_cqe_flush_error | 数量 | 请求 CQE 刷新错误的数量 | | +| roce_adp_retrans | 数量 | RoCE 自适应重传的次数 | | +| roce_adp_retrans_to | 数量 | RoCE 自适应重传超时的次数 | | +| roce_slow_restart | 数量 | RoCE 缓慢重启的次数 | | +| roce_slow_restart_cnps | 数量 | RoCE 缓慢重启产生的 CNP 包数 | | +| roce_slow_restart_trans | 数量 | RoCE 状态转换为缓慢重启的次数 | | +| rp_cnp_ignored | 数量 | Reaction Point HCA 接收到并忽略的 CNP 包数量 | | +| rp_cnp_handled | 数量 | Reaction Point HCA
处理以降低传输速率的 CNP 包数量 | | +| np_ecn_marked_roce_packets | 数量 | 进入 Pod/Node 的方向收到的 ECN,表示路径拥塞 | | +| np_cnp_sent | 数量 | 通知点在 RoCEv2 IP 头部注意到拥塞体验时发送的 CNP 包数 | | +| rx_icrc_encapsulated | 数量 | 具有 ICRC 错误的 RoCE 包数量 | | +| rx_vport_rdma_unicast_packets | 数量 | 接收到的单播 RDMA 包数量 | | +| tx_vport_rdma_unicast_packets | 数量 | 发送的单播 RDMA 包数量 | | +| rx_vport_rdma_multicast_packets | 数量 | 接收到的多播 RDMA 包数量 | | +| tx_vport_rdma_multicast_packets | 数量 | 发送的多播 RDMA 包数量 | | +| rx_vport_rdma_unicast_bytes | 数量 | 接收到的单播 RDMA 包的字节数 | | +| tx_vport_rdma_unicast_bytes | 数量 | 发送的单播 RDMA 包的字节数 | | +| rx_vport_rdma_multicast_bytes | 数量 | 接收到的多播 RDMA 包的字节数 | | +| tx_vport_rdma_multicast_bytes | 数量 | 发送的多播 RDMA 包的字节数 | | +| vport_speed_mbps | 速度 | 端口的速度,以兆位每秒(Mbps)表示 | | diff --git a/docs/usage/rdma-metrics.md b/docs/usage/rdma-metrics.md new file mode 100644 index 0000000000..e6af6b66a7 --- /dev/null +++ b/docs/usage/rdma-metrics.md @@ -0,0 +1,80 @@ +# RDMA Metrics + +RDMA is an efficient network communication technology that allows one computer to directly access the memory of another computer without involving the operating system, thus reducing latency and improving data transfer speed and efficiency. RDMA supports high-speed data transmission and reduces CPU load, making it ideal for scenarios requiring high-speed network communication. + +In a Kubernetes cluster, the spiderpool CNI supports two RDMA scenarios: RoCE and IB. Pods can use the RDMA network card in either shared or exclusive modes. Users can choose the appropriate method based on their needs for utilizing RDMA network cards. + +Spiderpool also provides an RDMA exporter feature and a Grafana monitoring panel. By monitoring the performance of Pod/Node RDMA networks in real-time, including throughput, latency, packet loss rate, etc., issues can be detected and measures taken to improve network reliability and performance. + +## Common Scenarios for RDMA Metrics + +1.
**Performance Monitoring**: + - **Throughput**: Measures the amount of data transmitted over the network. + - **Latency**: Measures the time it takes for data to travel from source to destination. + - **Packet Loss Rate**: Monitors the number of data packets lost during transmission. + +2. **Error Detection**: + - **Transmission Errors**: Detects errors in data transmission. + - **Connection Failures**: Monitors failed connection attempts and disconnects. + +3. **Network Health**: + - **Congestion**: Detects network congestion and bottlenecks. + +## How to Enable + +```shell +helm upgrade spiderpool spiderpool/spiderpool --reuse-values --wait --namespace kube-system \ + --set spiderpoolAgent.prometheus.enabled=true \ + --set spiderpoolAgent.prometheus.enabledRdmaMetric=true +``` + +- Use `--reuse-values` to reuse existing configurations. +- Use `--wait` to wait for all Pods to be running. +- Use `--namespace` to specify the Helm installation namespace. +- Use `--set spiderpoolAgent.prometheus.enabled=true` to enable Prometheus monitoring. +- Use `--set spiderpoolAgent.prometheus.enabledRdmaMetric=true` to enable the RDMA metric exporter.
+ +## Metrics List + +Below is a table containing "Metric Name," "Metric Type," "Metric Meaning," and "Remarks": + +| Metric Name | Type | Meaning | Remarks | +|---------------------------------|-------|---------------------------------------------------------------------------------|-----------------------| +| rx_write_requests | Count | Number of received write requests | | +| rx_read_requests | Count | Number of received read requests | | +| rx_atomic_requests | Count | Number of received atomic requests | | +| rx_dct_connect | Count | Number of received DCT connection requests | | +| out_of_buffer | Count | Number of buffer insufficiency errors | | +| out_of_sequence | Count | Number of out-of-sequence packets received | | +| duplicate_request | Count | Number of duplicate requests | | +| rnr_nak_retry_err | Count | Number of RNR NAK packets received where the QP retry limit was not exceeded | | +| packet_seq_err | Count | Number of packet sequence errors | | +| implied_nak_seq_err | Count | Number of implied NAK sequence errors | | +| local_ack_timeout_err | Count | Number of times the sender's QP ack timer expired | RC, XRC, DCT QPs only | +| resp_local_length_error | Count | Number of times a responder detected a local length error | | +| resp_cqe_error | Count | Number of response CQE errors | | +| req_cqe_error | Count | Number of times a requester detected CQE completion with errors | | +| req_remote_invalid_request | Count | Number of remote invalid request errors detected by requester | | +| req_remote_access_errors | Count | Number of remote access errors detected by the requester | | +| resp_remote_access_errors | Count | Number of remote access errors detected by the responder | | +| resp_cqe_flush_error | Count | Number of response CQE flush errors | | +| req_cqe_flush_error | Count | Number of request CQE flush errors | | +| roce_adp_retrans | Count | Number of RoCE adaptive retransmissions | | +| roce_adp_retrans_to | Count | Number of RoCE adaptive retransmission timeouts | | +| roce_slow_restart |
Count | Number of RoCE slow restarts | | +| roce_slow_restart_cnps | Count | Number of CNP packets generated during RoCE slow restart | | +| roce_slow_restart_trans | Count | Number of times state transitioned to slow restart | | +| rp_cnp_ignored | Count | Number of CNP packets received and ignored by Reaction Point HCA | | +| rp_cnp_handled | Count | Number of CNP packets handled by Reaction Point HCA to reduce transmission rate | | +| np_ecn_marked_roce_packets | Count | Number of ECN-marked RoCE packets indicating path congestion | | +| np_cnp_sent | Count | Number of CNP packets sent when congestion is experienced in RoCEv2 IP header | | +| rx_icrc_encapsulated | Count | Number of RoCE packets with ICRC errors | | +| rx_vport_rdma_unicast_packets | Count | Number of received unicast RDMA packets | | +| tx_vport_rdma_unicast_packets | Count | Number of transmitted unicast RDMA packets | | +| rx_vport_rdma_multicast_packets | Count | Number of received multicast RDMA packets | | +| tx_vport_rdma_multicast_packets | Count | Number of transmitted multicast RDMA packets | | +| rx_vport_rdma_unicast_bytes | Count | Number of bytes received in unicast RDMA packets | | +| tx_vport_rdma_unicast_bytes | Count | Number of bytes transmitted in unicast RDMA packets | | +| rx_vport_rdma_multicast_bytes | Count | Number of bytes received in multicast RDMA packets | | +| tx_vport_rdma_multicast_bytes | Count | Number of bytes transmitted in multicast RDMA packets | | +| vport_speed_mbps | Speed | Speed of the port in Mbps | | diff --git a/go.mod b/go.mod index 80d4e4a06f..8d921cd372 100644 --- a/go.mod +++ b/go.mod @@ -76,6 +76,7 @@ require k8s.io/component-base v0.29.4 // indirect require ( github.com/hashicorp/go-multierror v1.1.1 + github.com/safchain/ethtool v0.4.0 go.uber.org/automaxprocs v1.5.3 k8s.io/kubectl v0.26.3 ) @@ -157,7 +158,6 @@ require ( github.com/prometheus/procfs v0.11.1 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect 
github.com/russross/blackfriday/v2 v2.1.0 // indirect - github.com/safchain/ethtool v0.4.0 // indirect github.com/shirou/gopsutil/v3 v3.23.5 // indirect github.com/shopspring/decimal v1.2.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect diff --git a/images/spiderpool-agent/Dockerfile b/images/spiderpool-agent/Dockerfile index afe1b5436c..ab42de867e 100644 --- a/images/spiderpool-agent/Dockerfile +++ b/images/spiderpool-agent/Dockerfile @@ -1,7 +1,7 @@ # Copyright 2022 Authors of spidernet-io # SPDX-License-Identifier: Apache-2.0 -ARG BASE_IMAGE=ghcr.io/spidernet-io/spiderpool/spiderpool-base:1f8330482d25b58d2ae26bc6252e20384bac92ad +ARG BASE_IMAGE=ghcr.io/spidernet-io/spiderpool/spiderpool-base:4316032f53c3753c5fa4f9fa2911415af5fd76b7 ARG GOLANG_IMAGE=docker.io/library/golang:1.23.2@sha256:adee809c2d0009a4199a11a1b2618990b244c6515149fe609e2788ddf164bd10 #======= build bin ========== diff --git a/images/spiderpool-base/install-others.sh b/images/spiderpool-base/install-others.sh index 54fbf0432f..095699db7a 100644 --- a/images/spiderpool-base/install-others.sh +++ b/images/spiderpool-base/install-others.sh @@ -14,6 +14,7 @@ packages=( libmnl0 bash-completion iptables + rdma-core ) diff --git a/pkg/ippoolmanager/ippool_manager_suite_test.go b/pkg/ippoolmanager/ippool_manager_suite_test.go index 655286f51d..0e9f0aa6da 100644 --- a/pkg/ippoolmanager/ippool_manager_suite_test.go +++ b/pkg/ippoolmanager/ippool_manager_suite_test.go @@ -46,13 +46,8 @@ func TestIPPoolManager(t *testing.T) { } var _ = BeforeSuite(func() { - _, err := metric.InitMetric(context.TODO(), constant.SpiderpoolAgent, false, false) - Expect(err).NotTo(HaveOccurred()) - err = metric.InitSpiderpoolAgentMetrics(context.TODO()) - Expect(err).NotTo(HaveOccurred()) - scheme = runtime.NewScheme() - err = spiderpoolv2beta1.AddToScheme(scheme) + err := spiderpoolv2beta1.AddToScheme(scheme) Expect(err).NotTo(HaveOccurred()) fakeClient = fake.NewClientBuilder(). 
@@ -74,6 +69,10 @@ var _ = BeforeSuite(func() { }). WithStatusSubresource(&spiderpoolv2beta1.SpiderIPPool{}). Build() + _, err = metric.InitMetric(context.TODO(), constant.SpiderpoolAgent, false, false) + Expect(err).NotTo(HaveOccurred()) + err = metric.InitSpiderpoolAgentMetrics(context.TODO(), false, fakeClient) + Expect(err).NotTo(HaveOccurred()) tracker = k8stesting.NewObjectTracker(scheme, k8sscheme.Codecs.UniversalDecoder()) fakeAPIReader = fake.NewClientBuilder(). diff --git a/pkg/metric/metrics.go b/pkg/metric/metrics.go index 67a46acc89..80ebf652b2 100644 --- a/pkg/metric/metrics.go +++ b/pkg/metric/metrics.go @@ -10,6 +10,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/spidernet-io/spiderpool/pkg/constant" "go.opentelemetry.io/otel/exporters/prometheus" api "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/metric/noop" @@ -17,8 +18,6 @@ import ( sdkmetric "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" semconv "go.opentelemetry.io/otel/semconv/v1.4.0" - - "github.com/spidernet-io/spiderpool/pkg/constant" ) const debugMetrics = "debug-metrics" diff --git a/pkg/metric/metrics_instance.go b/pkg/metric/metrics_instance.go index 7620e6f9f0..6c6e53532d 100644 --- a/pkg/metric/metrics_instance.go +++ b/pkg/metric/metrics_instance.go @@ -9,8 +9,10 @@ import ( "go.opentelemetry.io/otel/attribute" api "go.opentelemetry.io/otel/metric" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/spidernet-io/spiderpool/pkg/lock" + "github.com/spidernet-io/spiderpool/pkg/rdmametrics" ) const metricPrefix = "spiderpool_" @@ -219,7 +221,15 @@ func (a *asyncInt64Gauge) Record(value int64, attrs ...attribute.KeyValue) { } // InitSpiderpoolAgentMetrics serves for spiderpool agent metrics initialization -func InitSpiderpoolAgentMetrics(ctx context.Context) error { +func InitSpiderpoolAgentMetrics(ctx context.Context, enableRDMAMetric bool, client client.Client) error { + // for rdma + if 
enableRDMAMetric { + err := rdmametrics.Register(ctx, meter, client) + if err != nil { + return err + } + } + err := initSpiderpoolAgentAllocationMetrics(ctx) if nil != err { return err diff --git a/pkg/rdmametrics/ethtool/ethtool.go b/pkg/rdmametrics/ethtool/ethtool.go new file mode 100644 index 0000000000..1658a8d0a6 --- /dev/null +++ b/pkg/rdmametrics/ethtool/ethtool.go @@ -0,0 +1,27 @@ +// Copyright 2024 Authors of spidernet-io +// SPDX-License-Identifier: Apache-2.0 + +package ethtool + +import "github.com/safchain/ethtool" + +func Stats(netIfName string) (map[string]uint64, error) { + tool, err := ethtool.NewEthtool() + if err != nil { + return nil, err + } + defer tool.Close() + stats, err := tool.Stats(netIfName) + if err != nil { + return nil, err + } + speed, err := tool.CmdGetMapped(netIfName) + if err != nil { + return nil, err + } + // speed unknown = 4294967295 + if val, ok := speed["speed"]; ok && val != 4294967295 { + stats["vport_speed_mbps"] = val + } + return stats, nil +} diff --git a/pkg/rdmametrics/metrics.go b/pkg/rdmametrics/metrics.go new file mode 100644 index 0000000000..10268b989f --- /dev/null +++ b/pkg/rdmametrics/metrics.go @@ -0,0 +1,628 @@ +// Copyright 2024 Authors of spidernet-io +// SPDX-License-Identifier: Apache-2.0 + +package rdmametrics + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + "unicode" + + "github.com/vishvananda/netlink" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.uber.org/zap" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/exec" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/containernetworking/plugins/pkg/ns" + "github.com/spidernet-io/spiderpool/pkg/lock" + "github.com/spidernet-io/spiderpool/pkg/logutils" + "github.com/spidernet-io/spiderpool/pkg/rdmametrics/ethtool" +) + +var cli client.Client + +const netnsPath = "/var/run/netns" + +var ( + readDir = os.ReadDir + 
rdmaSystemGetNetnsMode = netlink.RdmaSystemGetNetnsMode + + rdmaMetricsPrefix = "rdma_" + + knownMetricsKeyDescription = map[string]string{ + "rx_write_requests": "The number of received WRITE requests for the associated QPs.", + "rx_read_requests": "The number of received read requests", + "rx_atomic_requests": "The number of received atomic requests", + "rx_dct_connect": "The number of received DCT connect requests", + "out_of_buffer": "The number of out of buffer errors", + "out_of_sequence": "The number of out-of-order arrivals", + "duplicate_request": "The number of duplicate requests", + "rnr_nak_retry_err": "The number of received RNR NAK packets did not exceed the QP retry limit", + "packet_seq_err": "The number of packet sequence errors", + "implied_nak_seq_err": "The number of implied NAK sequence errors", + "local_ack_timeout_err": "The number of times QP's ack timer expired for RC, XRC, DCT QPs at the sender side", + "resp_local_length_error": "The number of times responder detected local length errors", + "resp_cqe_error": "The number of response CQE errors", + "req_cqe_error": "The number of times requester detected CQEs completed with errors", + "req_remote_invalid_request": "The number of times requester detected remote invalid request errors", + "req_remote_access_errors": "The number of request remote access errors", + "resp_remote_access_errors": "The number of response remote access errors", + "resp_cqe_flush_error": "The number of response CQE flush errors", + "req_cqe_flush_error": "The number of request CQE flush errors", + "roce_adp_retrans": "The number of RoCE adaptive retransmissions", + "roce_adp_retrans_to": "The number of RoCE adaptive retransmission timeouts", + "roce_slow_restart": "The number of RoCE slow restart", + "roce_slow_restart_cnps": "The number of times RoCE slow restart generated CNP packets", + "roce_slow_restart_trans": "The number of times RoCE slow restart changed state to slow restart", + "rp_cnp_ignored": "The 
number of CNP packets received and ignored by the Reaction Point HCA", + "rp_cnp_handled": "The number of CNP packets handled by the Reaction Point HCA to throttle the transmission rate", + "np_ecn_marked_roce_packets": "The number of RoCEv2 packets received by the notification point which were marked for experiencing the congestion (ECN bits where '11' on the ingress RoCE traffic)", + "np_cnp_sent": "The number of CNP packets sent by the Notification Point when it noticed congestion experienced in the RoCEv2 IP header (ECN bits)", + "rx_icrc_encapsulated": "The number of RoCE packets with ICRC errors", + "rx_vport_rdma_unicast_packets": "The number of unicast RDMA packets received on the virtual port.", + "tx_vport_rdma_unicast_packets": "The number of unicast RDMA packets transmitted from the virtual port.", + "rx_vport_rdma_multicast_packets": "The number of multicast RDMA packets received on the virtual port.", + "tx_vport_rdma_multicast_packets": "The number of multicast RDMA packets transmitted from the virtual port.", + "rx_vport_rdma_unicast_bytes": "The number of bytes received in unicast RDMA packets on the virtual port.", + "tx_vport_rdma_unicast_bytes": "The number of bytes transmitted in unicast RDMA packets from the virtual port.", + "rx_vport_rdma_multicast_bytes": "The number of bytes received in multicast RDMA packets on the virtual port.", + "tx_vport_rdma_multicast_bytes": "The number of bytes transmitted in multicast RDMA packets from the virtual port.", + "vport_speed_mbps": "The speed of the virtual port expressed in megabits per second (Mbps).", + } + + // skip to export these fields + // key and reason + skipRDMAStatsField = map[string]string{ + "port": "The field is attribute is used to identify the nic port", + "ifname": "The field is attribute is used to identify the nic netnsPath", + "net_dev_name": "The field is attribute is used to identify the nic net_dev_name", + "node_guid": "The field is attribute is used to identify the nic 
node_guid", + "sys_image_guid": "The field is attribute is used to identify the nic sys_image_guid", + "rdma_parent_name": "The field is attribute is used to identify the nic rdma_parent_name", + "is_root": "The field is attribute is used to identify the nic is_root", + } +) + +type GetObservable func(string) (metric.Int64ObservableCounter, bool) + +type EthtoolImpl struct { + Stats func(netIfName string) (map[string]uint64, error) +} + +type NetlinkImpl struct { + RdmaLinkList func() ([]*netlink.RdmaLink, error) + LinkList func() ([]netlink.Link, error) +} + +type RDMADevice struct { + NetDevName string + NodeGuid string + SysImageGuid string + IsRoot bool +} + +func Register(ctx context.Context, meter metric.Meter, client client.Client) error { + cli = client + log := logutils.Logger.Named("rdma-metrics-exporter") + nodeName, err := os.Hostname() + if err != nil { + return fmt.Errorf("failed to get hostname: %w", err) + } + attributeNodeName := attribute.String("node_name", nodeName) + e := &exporter{ + observableMap: make(map[string]metric.Int64ObservableCounter), + nodeName: &attributeNodeName, + meter: meter, + netns: func(netnsID string, toRun func() error) error { + if netnsID != "" { + netns, err := ns.GetNS(filepath.Join(netnsPath, netnsID)) + if err != nil { + return err + } + return netns.Do(func(netNS ns.NetNS) error { + return toRun() + }) + } + return toRun() + }, + exec: exec.New(), + log: log, + ch: make(chan struct{}, 10), + waitToRegisterMetrics: make(map[string]struct{}), + ethtool: EthtoolImpl{Stats: ethtool.Stats}, + netlinkImpl: NetlinkImpl{ + RdmaLinkList: netlink.RdmaLinkList, + LinkList: netlink.LinkList, + }, + } + err = e.registerMetrics(meter) + if err != nil { + return err + } + log.Info("rdma metrics registered") + go e.daemon(ctx) + return nil +} + +type exporter struct { + nodeName *attribute.KeyValue + meter metric.Meter + lock lock.Mutex + log *zap.Logger + ch chan struct{} + netns func(netnsID string, toRun func() error) error + 
netlinkImpl NetlinkImpl + ethtool EthtoolImpl + exec exec.Interface + registration metric.Registration + waitToRegisterMetrics map[string]struct{} + observableMap map[string]metric.Int64ObservableCounter +} + +func (e *exporter) registerMetrics(meter metric.Meter) error { + list := make([]metric.Observable, 0) + // register known metrics + for key, description := range knownMetricsKeyDescription { + keyWithPrefix := rdmaMetricsPrefix + key + d := metric.WithDescription(description) + val, err := meter.Int64ObservableCounter(keyWithPrefix, d) + if err != nil { + return err + } + e.observableMap[keyWithPrefix] = val + list = append(list, val) + } + // register discovered metrics + for key := range e.waitToRegisterMetrics { + keyWithPrefix := rdmaMetricsPrefix + key + val, err := e.meter.Int64ObservableCounter(keyWithPrefix) + if err != nil { + return err + } + e.observableMap[keyWithPrefix] = val + list = append(list, val) + } + e.waitToRegisterMetrics = make(map[string]struct{}) + + registration, err := meter.RegisterCallback(e.Callback, list...) 
+ if err != nil { + return err + } + e.registration = registration + return nil +} + +func (e *exporter) reRegisterMetrics() error { + e.lock.Lock() + defer e.lock.Unlock() + err := e.registration.Unregister() + if err != nil { + return fmt.Errorf("failed to unregister metric: %w", err) + } + err = e.registerMetrics(e.meter) + if err != nil { + return err + } + return nil +} + +func (e *exporter) daemon(ctx context.Context) { + for { + select { + case <-ctx.Done(): + close(e.ch) + return + case _, ok := <-e.ch: + if !ok { + return + } + err := e.reRegisterMetrics() + if err != nil { + e.log.Error("failed to re-register metrics", zap.Error(err)) + } + } + } +} + +func (e *exporter) Callback(ctx context.Context, observer metric.Observer) error { + list, err := listNodeNetNS() + if err != nil { + e.log.Error("failed to list node net ns", zap.Error(err)) + return fmt.Errorf("failed to list node net ns: %w", err) + } + list = append(list, "") + + podIPMap, err := getIPToPodMap(ctx, cli) + if err != nil { + e.log.Error("failed to get IP to pod map", zap.Error(err)) + return fmt.Errorf("failed to get ip map pod: %w", err) + } + unRegistrationMetric := make([]string, 0) + getObservable := func(key string) (metric.Int64ObservableCounter, bool) { + if val, ok := e.observableMap[rdmaMetricsPrefix+key]; ok { + return val, ok + } + unRegistrationMetric = append(unRegistrationMetric, key) + return nil, false + } + nodeGuidNetDeviceNameMap, err := getNodeGuidNetDeviceNameMap(e.netlinkImpl) + if err != nil { + e.log.Error("failed to get node guid net device name map", zap.Error(err)) + return fmt.Errorf("failed to get node guid net device name map: %w", err) + } + for _, netNsID := range list { + if err := e.processNetNS(netNsID, podIPMap, nodeGuidNetDeviceNameMap, observer, getObservable); err != nil { + e.log.Error("failed to process net ns", zap.String("net_ns_id", netNsID), zap.Error(err)) + continue + } + } + if len(unRegistrationMetric) > 0 { + 
e.updateUnregisteredMetrics(unRegistrationMetric) + } + return nil +} + +func (e *exporter) updateUnregisteredMetrics(unRegistrationMetric []string) { + e.lock.Lock() + defer e.lock.Unlock() + if e.waitToRegisterMetrics == nil { + e.waitToRegisterMetrics = make(map[string]struct{}) + } + for _, key := range unRegistrationMetric { + e.waitToRegisterMetrics[key] = struct{}{} + } + select { + case e.ch <- struct{}{}: + default: + e.log.Warn("channel is closed or full, cannot send data") + } +} + +func (e *exporter) processNetNS(netNsID string, ipMapPod map[string]types.NamespacedName, + nodeGuidNetDeviceNameMap map[string]string, + observer metric.Observer, getObservable GetObservable) error { + podPrimaryIP, statsList, err := getRDMAStats(netNsID, e.netns, nodeGuidNetDeviceNameMap, e.netlinkImpl, e.exec, e.ethtool) + if err != nil { + e.log.Error("failed to get RDMA stats", zap.String("net_ns_id", netNsID), zap.Error(err)) + return err + } + if len(statsList) == 0 { + return nil + } + + var attributeNamespace, attributeName *attribute.KeyValue + if item, ok := ipMapPod[podPrimaryIP]; ok { + attributeNamespace, attributeName = getPodAttributes(item) + } + for _, stats := range statsList { + processStats(stats, observer, getObservable, attributeNamespace, attributeName, e.nodeName) + } + return nil +} + +func getPodAttributes(item types.NamespacedName) (*attribute.KeyValue, *attribute.KeyValue) { + var attributeNamespace, attributeName *attribute.KeyValue + if item.Namespace != "" { + t := attribute.String("pod_namespace", item.Namespace) + attributeNamespace = &t + } + if item.Name != "" { + t := attribute.String("pod_name", item.Name) + attributeName = &t + } + return attributeNamespace, attributeName +} + +func processStats(stats map[string]interface{}, observer metric.Observer, + getObservable GetObservable, attributes ...*attribute.KeyValue) { + nicExtAttributes := getIdentifyAttributes(stats) + attributes = append(attributes, nicExtAttributes...) 
+ + for key, val := range stats { + if _, skip := skipRDMAStatsField[key]; skip { + continue + } + if observable, ok := getObservable(key); ok { + if value, ok := val.(float64); ok { + observe(observer, observable, int(value), attributes...) + continue + } + if value, ok := val.(uint64); ok { + observe(observer, observable, int(value), attributes...) + } + } + } +} + +func getIdentifyAttributes(stats map[string]interface{}) []*attribute.KeyValue { + res := make([]*attribute.KeyValue, 0) + if val, ok := stats["port"].(float64); ok { + res = append(res, &attribute.KeyValue{Key: "port", Value: attribute.IntValue(int(val))}) + } + if val, ok := stats["ifname"].(string); ok { + res = append(res, &attribute.KeyValue{Key: "ifname", Value: attribute.StringValue(val)}) + } + if val, ok := stats["net_dev_name"].(string); ok { + res = append(res, &attribute.KeyValue{Key: "net_dev_name", Value: attribute.StringValue(val)}) + } + if val, ok := stats["node_guid"].(string); ok { + res = append(res, &attribute.KeyValue{Key: "node_guid", Value: attribute.StringValue(val)}) + } + if val, ok := stats["sys_image_guid"].(string); ok { + res = append(res, &attribute.KeyValue{Key: "sys_image_guid", Value: attribute.StringValue(val)}) + } + if val, ok := stats["rdma_parent_name"].(string); ok { + res = append(res, &attribute.KeyValue{Key: "rdma_parent_name", Value: attribute.StringValue(val)}) + } + if val, ok := stats["is_root"].(bool); ok { + res = append(res, &attribute.KeyValue{Key: "is_root", Value: attribute.BoolValue(val)}) + } + return res +} + +func observe(observer metric.Observer, counter metric.Int64ObservableCounter, value int, attributes ...*attribute.KeyValue) { + list := make([]attribute.KeyValue, 0, len(attributes)) + for _, item := range attributes { + if item != nil { + list = append(list, *item) + } + } + observer.ObserveInt64(counter, int64(value), metric.WithAttributes(list...)) +} + +func listNodeNetNS() ([]string, error) { + mode, err := rdmaSystemGetNetnsMode() + 
if err != nil { + return nil, err + } + if mode != "exclusive" { + return []string{}, nil + } + + dirEntries, err := readDir(netnsPath) + if err != nil { + return nil, err + } + + netnsList := make([]string, 0, len(dirEntries)) + for _, entry := range dirEntries { + netnsList = append(netnsList, entry.Name()) + } + return netnsList, nil +} + +func getIPToPodMap(ctx context.Context, cli client.Client) (map[string]types.NamespacedName, error) { + list := new(corev1.PodList) + err := cli.List(ctx, list) + if err != nil { + return nil, err + } + + res := map[string]types.NamespacedName{} + for _, item := range list.Items { + if item.Spec.HostNetwork { + continue + } + if len(item.Status.PodIPs) == 0 { + continue + } + for _, ip := range item.Status.PodIPs { + if ip.IP == "" { + continue + } + res[ip.IP] = types.NamespacedName{Namespace: item.Namespace, Name: item.Name} + } + } + return res, err +} + +func getRDMAStats(nsID string, + netnsDo func(nsID string, toRun func() error) error, + nodeGuidNetNameMap map[string]string, + nl NetlinkImpl, + e exec.Interface, ethtool EthtoolImpl) (string, []map[string]interface{}, error) { + + var srcIP string + var rdmaStats []map[string]interface{} + + err := netnsDo(nsID, func() error { + cmd := e.Command("rdma", "statistic", "-j") + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("error executing 'rdma statistic -j' command: %w", err) + } + + if err := json.Unmarshal(output, &rdmaStats); err != nil { + return fmt.Errorf("failed to unmarshal rdma stats: %w", err) + } + + netDevMap, err := getIfnameNetDevMap(nl) + if err != nil { + return fmt.Errorf("failed to get ifname net dev map: %w", err) + } + + for i, item := range rdmaStats { + newMap := make(map[string]interface{}) + for k, v := range item { + newMap[camelToSnake(k)] = v + } + + // append more metrics + var ifname string + if val, ok := newMap["ifname"].(string); ok { + ifname = val + } + if ifname != "" { + if devName, ok := netDevMap[ifname]; ok 
{ + newMap["net_dev_name"] = devName.NetDevName + newMap["node_guid"] = devName.NodeGuid + newMap["sys_image_guid"] = devName.SysImageGuid + newMap["is_root"] = devName.IsRoot + stats, err := ethtool.Stats(devName.NetDevName) + if err == nil { + for k, v := range stats { + if strings.Contains(k, "vport") { + newMap[k] = v + } + } + } + if rdmaParentName, ok := nodeGuidNetNameMap[devName.SysImageGuid]; ok { + newMap["rdma_parent_name"] = rdmaParentName + } + } + } + rdmaStats[i] = newMap + } + + // host netns, don't need get default ip to mapping pod metadata to metrics + if nsID == "" { + return nil + } + srcIP, err = getDefaultIP(e) + if err != nil { + return err + } + return nil + }) + if err != nil { + return "", nil, err + } + + return srcIP, rdmaStats, nil +} + +// map of node guid to rdma name +func getNodeGuidNetDeviceNameMap(nl NetlinkImpl) (map[string]string, error) { + list, err := getIfnameNetDevMap(nl) + if err != nil { + return nil, err + } + res := make(map[string]string) + for _, item := range list { + if item.IsRoot { + res[item.NodeGuid] = item.NetDevName + } + } + return res, nil +} + +func getIfnameNetDevMap(nl NetlinkImpl) (map[string]RDMADevice, error) { + netList, err := nl.LinkList() + if err != nil { + return nil, err + } + + netDevHardwareAddrMap := make(map[string]string) + for _, item := range netList { + if item.Attrs().Name == "" { + continue + } + addr := item.Attrs().HardwareAddr.String() + if addr == "" { + continue + } + if item.Type() == "ipoib" { + if len(addr) == 59 { + // for example: + // ib device hardware addr 00:00:01:af:fe:80:00:00:00:00:00:00:03:a7:83:7a:20:bf:ed:2f + // node guid = addr[36:] = 03:a7:83:7a:20:bf:ed:2f + addr = addr[36:] + netDevHardwareAddrMap[addr] = item.Attrs().Name + } + continue + } + netDevHardwareAddrMap[addr] = item.Attrs().Name + } + + res := make(map[string]RDMADevice) + rdmaList, err := nl.RdmaLinkList() + if err != nil { + return nil, err + } + + for _, v := range rdmaList { + guid := 
reverseMACAddress(v.Attrs.NodeGuid) + if devName, ok := netDevHardwareAddrMap[guid]; ok { + res[v.Attrs.Name] = RDMADevice{ + NetDevName: devName, + NodeGuid: v.Attrs.NodeGuid, + SysImageGuid: v.Attrs.SysImageGuid, + IsRoot: v.Attrs.NodeGuid == v.Attrs.SysImageGuid, + } + continue + } + if len(guid) == 23 { + // 3a:b0:33:ff:fe:1a:0d:70 + // 3a:b0:33: 1a:0d:70 + roceHardwareAddr := guid[:9] + guid[15:] + if devName, ok := netDevHardwareAddrMap[roceHardwareAddr]; ok { + res[v.Attrs.Name] = RDMADevice{ + NetDevName: devName, + NodeGuid: v.Attrs.NodeGuid, + SysImageGuid: v.Attrs.SysImageGuid, + IsRoot: v.Attrs.NodeGuid == v.Attrs.SysImageGuid, + } + } + } + } + return res, nil +} + +// getDefaultIP returns the default IP address of the host/pod +func getDefaultIP(e exec.Interface) (string, error) { + re := regexp.MustCompile(`\bsrc\s+(\S+)`) + + // Check for IPv4 default route + cmd := e.Command("ip", "route", "get", "1.0.0.0") + output, err := cmd.CombinedOutput() + if err == nil { + match := re.FindStringSubmatch(string(output)) + if len(match) > 0 { + return match[1], nil + } + } + + // Check for IPv6 default route + cmd = e.Command("ip", "-6", "route", "get", "2001:4860:4860::8888") + output, err = cmd.CombinedOutput() + if err == nil { + match := re.FindStringSubmatch(string(output)) + if len(match) > 0 { + return match[1], nil + } + } + + return "", fmt.Errorf("failed to find default IP address: %w", err) +} + +// camelToSnake converts a camelCase string to snake_case +func camelToSnake(camel string) string { + var result []rune + for i, r := range camel { + if unicode.IsUpper(r) { + if i > 0 && (i+1 < len(camel) && unicode.IsLower(rune(camel[i+1]))) { + result = append(result, '_') + } + r = unicode.ToLower(r) + } + result = append(result, r) + } + return string(result) +} + +func reverseMACAddress(mac string) string { + parts := strings.Split(mac, ":") + for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 { + parts[i], parts[j] = parts[j], parts[i] + } + return 
strings.Join(parts, ":") +} diff --git a/pkg/rdmametrics/metrics_test.go b/pkg/rdmametrics/metrics_test.go new file mode 100644 index 0000000000..9672ffafb1 --- /dev/null +++ b/pkg/rdmametrics/metrics_test.go @@ -0,0 +1,1226 @@ +// Copyright 2024 Authors of spidernet-io +// SPDX-License-Identifier: Apache-2.0 + +package rdmametrics + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net" + "os" + "reflect" + "runtime" + "testing" + "time" + + "github.com/vishvananda/netlink" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kruntime "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/exec" + testexec "k8s.io/utils/exec/testing" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/spidernet-io/spiderpool/pkg/logutils" +) + +// Label(K00002) + +func TestRegister(t *testing.T) { + ctx := context.Background() + meter := noop.NewMeterProvider().Meter("test") + cli := fake.NewClientBuilder().Build() + + err := Register(ctx, meter, cli) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } +} + +func TestReRegisterMetrics(t *testing.T) { + meter := noop.NewMeterProvider().Meter("test") + + log := logutils.Logger.Named("rdma-metrics-exporter") + e := &exporter{ + observableMap: make(map[string]metric.Int64ObservableCounter), + meter: meter, + log: log, + ch: make(chan struct{}, 10), + waitToRegisterMetrics: make(map[string]struct{}), + } + err := e.registerMetrics(meter) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + + err = e.reRegisterMetrics() + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + log.Info("rdma metrics registered") +} + +func TestCamelToSnake(t *testing.T) { + tests := map[string]string{ + "camelCase": "camel_case", + "PascalCase": "pascal_case", + "snake_case": 
"snake_case", + "HTTPServer": "http_server", + "noChange": "no_change", + } + + for input, expected := range tests { + output := camelToSnake(input) + if output != expected { + t.Errorf("Expected %s but got %s", expected, output) + } + } +} + +func TestGetDefaultIP(t *testing.T) { + tests := []struct { + name string + commandScript []testexec.FakeCommandAction + expectedIP string + expectError bool + }{ + { + name: "IPv4", + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + if cmd == "ip" && len(args) > 0 && args[0] == "route" && args[1] == "get" && args[2] == "1.0.0.0" { + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("1.0.0.0 via 10.6.0.1 dev ens160 src 10.6.1.21 uid 1000\n cache"), nil, nil + }, + } + } + return fakeCmd + }, + }, + expectedIP: "10.6.1.21", + expectError: false, + }, + { + name: "IPv6", + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("xxx"), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("2001:4860:4860::8888 from :: via fd00::1 dev ens160 proto static src fd00::21 metric 1024 pref medium"), nil, nil + }, + } + return fakeCmd + }, + }, + expectedIP: "fd00::21", + expectError: false, + }, + { + name: "IPv4 and IPv6", + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("1.0.0.0 via 10.6.0.1 dev ens160 src 10.6.1.21 uid 1000\n cache"), nil, nil + }, + } + return fakeCmd + }, + 
func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("2001:4860:4860::8888 from :: via fd00::1 dev ens160 proto static src fd00::21 metric 1024 pref medium"), nil, nil + }, + } + return fakeCmd + }, + }, + expectedIP: "10.6.1.21", + expectError: false, + }, + { + name: "Neither IPv4 nor IPv6", + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte(""), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte(""), nil, nil + }, + } + return fakeCmd + }, + }, + expectedIP: "", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeExec := &testexec.FakeExec{ + CommandScript: tt.commandScript, + } + + ip, err := getDefaultIP(fakeExec) + if tt.expectError { + if err == nil { + t.Errorf("Expected error but got none") + } + } else { + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if ip != tt.expectedIP { + t.Errorf("Expected IP %s but got %s", tt.expectedIP, ip) + } + } + }) + } +} + +func TestListNodeNetNS(t *testing.T) { + tests := []struct { + name string + mode string + rdmaSystemGetNetnsModeError bool + dirEntries []os.DirEntry + readDirErr error + expected []string + expectError bool + }{ + { + name: "read system rdma mode error", + rdmaSystemGetNetnsModeError: true, + expectError: true, + }, + { + name: "exclusive mode with entries", + mode: "exclusive", + dirEntries: []os.DirEntry{mockDirEntry("netns1"), mockDirEntry("netnsimpl")}, + expected: []string{"netns1", "netnsimpl"}, + expectError: false, + }, + { + name: "non-exclusive 
mode", + mode: "non-exclusive", + expected: []string{}, + expectError: false, + }, + { + name: "readDir error", + mode: "exclusive", + dirEntries: []os.DirEntry{mockDirEntry("netns1"), mockDirEntry("netnsimpl")}, + readDirErr: errors.New("mock error"), + expectError: true, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + rdmaSystemGetNetnsMode = func() (string, error) { + if tt.rdmaSystemGetNetnsModeError { + return "", errors.New("mock error") + } + return tt.mode, nil + } + + // patch os.ReadDir + readDir = func(name string) ([]os.DirEntry, error) { + t.Logf("test name: %s ,tt.readdiderr: %v \n", tt.name, tt.readDirErr) + return tt.dirEntries, tt.readDirErr + } + + result, err := listNodeNetNS() + if tt.expectError { + if err == nil { + t.Errorf("Expected error but got none") + } + return + } else { + if err != nil { + t.Errorf("Unexpected error: %v", err) + return + } + } + + if !reflect.DeepEqual(tt.expected, result) { + t.Errorf("Expected %v but got %v", tt.expected, result) + } + time.Sleep(100 * time.Millisecond) + runtime.GC() + }) + } +} + +// Helper function to create a mock DirEntry +func mockDirEntry(name string) os.DirEntry { + return &mockDirEntryStruct{name: name} +} + +type mockDirEntryStruct struct { + name string +} + +func (m *mockDirEntryStruct) Name() string { return m.name } +func (m *mockDirEntryStruct) IsDir() bool { return false } +func (m *mockDirEntryStruct) Type() os.FileMode { return 0 } +func (m *mockDirEntryStruct) Info() (os.FileInfo, error) { return nil, nil } + +func TestGetIPToPodMap(t *testing.T) { + tests := []struct { + name string + pods []client.Object + expectedMap map[string]types.NamespacedName + expectError bool + }{ + { + name: "Single Pod with IP", + pods: []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", Namespace: "default", + }, + Status: corev1.PodStatus{ + PodIPs: []corev1.PodIP{ + {IP: "192.168.1.1"}, + }, + }, + }, + &corev1.Pod{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: "pod2", Namespace: "default", + }, + Status: corev1.PodStatus{ + PodIPs: []corev1.PodIP{ + {IP: "192.168.1.2"}, + }, + }, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod3", Namespace: "test", + }, + Status: corev1.PodStatus{ + PodIPs: []corev1.PodIP{ + {IP: "192.168.1.3"}, + {IP: ""}, + }, + }, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod4", Namespace: "test", + }, + Spec: corev1.PodSpec{ + HostNetwork: true, + }, + Status: corev1.PodStatus{ + PodIPs: []corev1.PodIP{ + {IP: "10.6.1.21"}, + }, + }, + }, + }, + expectedMap: map[string]types.NamespacedName{ + "192.168.1.1": {Namespace: "default", Name: "pod1"}, + "192.168.1.2": {Namespace: "default", Name: "pod2"}, + "192.168.1.3": {Namespace: "test", Name: "pod3"}, + }, + expectError: false, + }, + { + name: "Pod with no IP", + pods: []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod3", Namespace: "default"}, + Status: corev1.PodStatus{}, + }, + }, + expectedMap: map[string]types.NamespacedName{}, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // create a fake client with the provided pods + scheme := kruntime.NewScheme() + err := corev1.AddToScheme(scheme) + if err != nil { + t.Fatalf("Failed to add corev1 to scheme: %v", err) + } + cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tt.pods...).Build() + + // call the function + result, err := getIPToPodMap(context.Background(), cli) + if tt.expectError { + if err == nil { + t.Errorf("Expected error but got none") + } + } else { + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if !reflect.DeepEqual(tt.expectedMap, result) { + t.Errorf("Expected %v but got %v", tt.expectedMap, result) + } + } + }) + } +} + +func TestGetIPToPodMap_WithMockClient(t *testing.T) { + ctx := context.Background() + scheme := kruntime.NewScheme() + cli := fake.NewClientBuilder().WithScheme(scheme).Build() + _, err := 
getIPToPodMap(ctx, cli) + if err == nil { + t.Errorf("expected an error, got nil") + } +} + +func TestGetPodAttributes(t *testing.T) { + tests := []struct { + name string + input types.NamespacedName + expectedNamespace *attribute.KeyValue + expectedName *attribute.KeyValue + }{ + { + name: "Both namespace and name are set", + input: types.NamespacedName{ + Namespace: "default", + Name: "pod1", + }, + expectedNamespace: &attribute.KeyValue{Key: "pod_namespace", Value: attribute.StringValue("default")}, + expectedName: &attribute.KeyValue{Key: "pod_name", Value: attribute.StringValue("pod1")}, + }, + { + name: "Only namespace is set", + input: types.NamespacedName{ + Namespace: "default", + }, + expectedNamespace: &attribute.KeyValue{Key: "pod_namespace", Value: attribute.StringValue("default")}, + expectedName: nil, + }, + { + name: "Only name is set", + input: types.NamespacedName{ + Name: "pod1", + }, + expectedNamespace: nil, + expectedName: &attribute.KeyValue{Key: "pod_name", Value: attribute.StringValue("pod1")}, + }, + { + name: "Neither namespace nor name is set", + input: types.NamespacedName{}, + expectedNamespace: nil, + expectedName: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + attributeNamespace, attributeName := getPodAttributes(tt.input) + if attributeNamespace != nil && tt.expectedNamespace != nil { + if *attributeNamespace != *tt.expectedNamespace { + t.Errorf("Expected namespace %v, but got %v", *tt.expectedNamespace, *attributeNamespace) + } + } else if attributeNamespace != tt.expectedNamespace { + t.Errorf("Expected namespace %v, but got %v", tt.expectedNamespace, attributeNamespace) + } + + if attributeName != nil && tt.expectedName != nil { + if *attributeName != *tt.expectedName { + t.Errorf("Expected name %v, but got %v", *tt.expectedName, *attributeName) + } + } else if attributeName != tt.expectedName { + t.Errorf("Expected name %v, but got %v", tt.expectedName, attributeName) + } + }) + } +} + 
+func TestGetIdentifyAttributes(t *testing.T) { + tests := []struct { + name string + stats map[string]interface{} + expCount int + }{ + { + name: "Valid port and ifname", + stats: map[string]interface{}{ + "port": float64(1), + "ifname": "eth0", + "net_dev_name": "net1", + "is_root": true, + "node_guid": "1d:c9:d1:fe:ff:ac:36:ae", + "sys_image_guid": "b6:65:05:0c:9c:5c:f6:08", + "rdma_parent_name": "ib1", + }, + expCount: 7, + }, + { + name: "Missing port", + stats: map[string]interface{}{ + "ifname": "eth1", + }, + + expCount: 1, + }, + { + name: "Missing ifname", + stats: map[string]interface{}{ + "port": float64(2), + }, + expCount: 1, + }, + { + name: "Empty stats", + stats: map[string]interface{}{}, + expCount: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + list := getIdentifyAttributes(tt.stats) + if len(list) != tt.expCount { + t.Errorf("Expected %d attributes, but got %d", tt.expCount, len(list)) + } + }) + } +} + +func TestObserve(t *testing.T) { + tests := []struct { + name string + counter metric.Int64ObservableCounter + value int + attributes []*attribute.KeyValue + expected []attribute.KeyValue + }{ + { + name: "No attributes", + counter: mustNewInt64ObservableCounter(noop.NewMeterProvider().Meter("test").Int64ObservableCounter("test_counter")), + value: 10, + expected: []attribute.KeyValue{}, + }, + { + name: "Single attribute", + counter: mustNewInt64ObservableCounter(noop.NewMeterProvider().Meter("test").Int64ObservableCounter("test_counter")), + value: 20, + attributes: []*attribute.KeyValue{ + ptr(attribute.String("key1", "value1")), + }, + expected: []attribute.KeyValue{ + attribute.String("key1", "value1"), + }, + }, + { + name: "Multiple attributes", + counter: mustNewInt64ObservableCounter(noop.NewMeterProvider().Meter("test").Int64ObservableCounter("test_counter")), + value: 30, + attributes: []*attribute.KeyValue{ + ptr(attribute.String("key1", "value1")), + ptr(attribute.Int("key2", 2)), + }, + expected: 
[]attribute.KeyValue{ + attribute.String("key1", "value1"), + attribute.Int("key2", 2), + }, + }, + { + name: "Nil attribute", + counter: mustNewInt64ObservableCounter(noop.NewMeterProvider().Meter("test").Int64ObservableCounter("test_counter")), + value: 40, + attributes: []*attribute.KeyValue{ + nil, + ptr(attribute.String("key1", "value1")), + }, + expected: []attribute.KeyValue{ + attribute.String("key1", "value1"), + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + observe(noop.Observer{}, tt.counter, tt.value, tt.attributes...) + }) + } +} + +func mustNewInt64ObservableCounter(counter metric.Int64ObservableCounter, err error) metric.Int64ObservableCounter { + if err != nil { + panic(err) + } + return counter +} + +func ptr(kv attribute.KeyValue) *attribute.KeyValue { + return &kv +} + +func TestGetRDMAStats(t *testing.T) { + tests := []struct { + name string + nsID string + mockNetnsCli func(netnsID string, toRun func() error) error + nl NetlinkImpl + commandScript []testexec.FakeCommandAction + expectedIP string + expectedStats string + expectedErr bool + ethtoolImpl EthtoolImpl + }{ + { + name: "get net get from path should return error", + nsID: "test", + mockNetnsCli: func(netnsID string, toRun func() error) error { + return errors.New("mock error") + }, + nl: NetlinkImpl{ + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + return nil, nil + }, + LinkList: nil, + }, + expectedErr: true, + }, + { + name: "get pod rdma stats error", + nsID: "test", + mockNetnsCli: func(netnsID string, toRun func() error) error { + return toRun() + }, + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return nil, nil, errors.New("mock error") + }, + } + return fakeCmd + }, + }, + + expectedErr: true, + }, + + { + name: "get pod rdma stats error with unmarshal json", + 
nsID: "test", + mockNetnsCli: func(netnsID string, toRun func() error) error { + return toRun() + }, + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("{\"rxWriteRequests\": 100}"), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("src 1.1.1.1"), nil, nil + }, + } + return fakeCmd + }, + }, + + expectedErr: true, + }, + + { + name: "get pod RDMA stats success", + nsID: "test", + ethtoolImpl: EthtoolImpl{Stats: func(netIfName string) (map[string]uint64, error) { + return map[string]uint64{ + "rx_vport_rdma_unicast_bytes": 100, + "tx_vport_rdma_unicast_bytes": 100, + }, nil + }}, + nl: NetlinkImpl{ + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + rdmaList := []*netlink.RdmaLink{ + {Attrs: netlink.RdmaLinkAttrs{ + Name: "mlx5_34", + NodeGuid: "1d:c9:d1:fe:ff:ac:36:ae", + }}, + {Attrs: netlink.RdmaLinkAttrs{ + Name: "mlx5_6", + NodeGuid: "b6:65:05:0c:9c:5c:f6:08", + SysImageGuid: "b6:65:05:0c:9c:5c:f6:00", + }}, + } + return rdmaList, nil + }, + LinkList: func() ([]netlink.Link, error) { + linkList := []netlink.Link{ + &netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: ""}}, + &netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "mock-empty-addr", HardwareAddr: nil}}, + &netlink.Device{LinkAttrs: netlink.LinkAttrs{ + Name: "enp5s0f0v6", + HardwareAddr: func() net.HardwareAddr { + mac, _ := net.ParseMAC("ae:36:ac:d1:c9:1d") + return mac + }(), + }}, + &netlink.IPoIB{LinkAttrs: netlink.LinkAttrs{ + Name: "ibp13s0v7", + HardwareAddr: func() net.HardwareAddr { + mac, _ := net.ParseMAC("00:00:00:68:fe:80:00:00:00:00:00:00:08:f6:5c:9c:0c:05:65:b6") + return mac + }(), + }}, + } + return linkList, nil + }, + }, + 
mockNetnsCli: func(netnsID string, toRun func() error) error { + return toRun() + }, + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("[{\"rxWriteRequests\": 100,\"ifname\": \"mlx5_34\"}]"), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("src 192.168.1.1"), nil, nil + }, + } + return fakeCmd + }, + }, + expectedIP: "192.168.1.1", + expectedErr: false, + expectedStats: "[{\"ifname\":\"mlx5_34\",\"is_root\":false,\"net_dev_name\":\"enp5s0f0v6\",\"node_guid\":\"1d:c9:d1:fe:ff:ac:36:ae\",\"rx_vport_rdma_unicast_bytes\":100,\"rx_write_requests\":100,\"sys_image_guid\":\"\",\"tx_vport_rdma_unicast_bytes\":100}]", + }, + + { + name: "get default ip return error", + nsID: "test", + nl: NetlinkImpl{ + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + return nil, nil + }, + LinkList: func() ([]netlink.Link, error) { + return nil, nil + }, + }, + mockNetnsCli: func(netnsID string, toRun func() error) error { + return toRun() + }, + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("[{\"rxWriteRequests\": 100}]"), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return nil, nil, errors.New("mock error") + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() 
([]byte, []byte, error) { + return nil, nil, errors.New("mock error") + }, + } + return fakeCmd + }, + }, + expectedIP: "192.168.1.1", + expectedErr: true, + expectedStats: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeExec := &testexec.FakeExec{CommandScript: tt.commandScript} + guidMapNetDeviceName := map[string]string{ + "b6:65:05:0c:9c:5c:f6:00": "ib1", + } + ip, stats, err := getRDMAStats(tt.nsID, tt.mockNetnsCli, guidMapNetDeviceName, tt.nl, fakeExec, tt.ethtoolImpl) + if (err != nil) != tt.expectedErr { + t.Errorf("expected error: %v, but got: %v", tt.expectedErr, err) + } + if tt.expectedErr { + return + } + if ip != tt.expectedIP { + t.Errorf("expected IP %v, but got %v", tt.expectedIP, ip) + } + + got, err := json.Marshal(stats) + if err != nil { + t.Fatal(err) + } + if !compareMaps(string(got), tt.expectedStats) { + raw, _ := json.Marshal(stats) + t.Errorf("expected stats %v, but got %v", tt.expectedStats, string(raw)) + } + }) + } +} + +// compareMaps compares two slices of maps for equality +func compareMaps(got, exp string) bool { + expList := make([]map[string]interface{}, 0) + err := json.Unmarshal([]byte(exp), &expList) + if err != nil { + panic(err) + } + + gotList := make([]map[string]interface{}, 0) + err = json.Unmarshal([]byte(got), &gotList) + if err != nil { + panic(err) + } + + if len(gotList) != len(expList) { + return false + } + for i := range gotList { + for key, value := range gotList[i] { + if expList[i][key] != value { + return false + } + } + for key, value := range expList[i] { + if gotList[i][key] != value { + return false + } + } + } + return true +} + +func TestProcessStats(t *testing.T) { + stats := map[string]interface{}{ + "port": float64(1), + "ifname": "eth0", + "rx_write_requests": float64(100), + "some_uint64": uint64(100), + } + var attributes []*attribute.KeyValue + + processStats(stats, noop.Observer{}, func(s string) (metric.Int64ObservableCounter, bool) { + return 
mustNewInt64ObservableCounter(noop.NewMeterProvider().Meter("test").Int64ObservableCounter("test_counter")), true + }, attributes...) + + processStats(stats, noop.Observer{}, func(s string) (metric.Int64ObservableCounter, bool) { + return nil, false + }, attributes...) +} + +func TestProcessNetNS(t *testing.T) { + tests := []struct { + name string + netnsID string + ipPodMap map[string]types.NamespacedName + commandScript []testexec.FakeCommandAction + getObservable GetObservable + expectError bool + }{ + { + name: "Empty netnsID and ipPodMap", + netnsID: "", + ipPodMap: map[string]types.NamespacedName{ + "192.168.1.1": {Namespace: "default", Name: "demo"}, + }, + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("[{\"rxWriteRequests\": 100}]"), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("src 192.168.1.1"), nil, nil + }, + } + return fakeCmd + }, + }, + getObservable: func(s string) (metric.Int64ObservableCounter, bool) { + return nil, true + }, + expectError: false, + }, + + { + name: "processNetNS call getRDMAStats get error", + netnsID: "", + ipPodMap: map[string]types.NamespacedName{ + "192.168.1.1": {Namespace: "default", Name: "demo"}, + }, + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return nil, nil, errors.New("mock error") + }, + } + return fakeCmd + }, + }, + getObservable: func(s string) (metric.Int64ObservableCounter, bool) { + return nil, true + }, + expectError: true, + }, + + { + name: "processNetNS call getRDMAStats get 
empty stats", + netnsID: "", + ipPodMap: map[string]types.NamespacedName{ + "192.168.1.1": {Namespace: "default", Name: "demo"}, + }, + commandScript: []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("[]"), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("src 192.168.1.1"), nil, nil + }, + } + return fakeCmd + }, + }, + getObservable: func(s string) (metric.Int64ObservableCounter, bool) { + return nil, true + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + meter := noop.NewMeterProvider().Meter("test") + cli = fake.NewClientBuilder().Build() + + fakeExec := &testexec.FakeExec{ + CommandScript: tt.commandScript, + } + + e := &exporter{ + meter: meter, + netlinkImpl: NetlinkImpl{ + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + return nil, nil + }, + LinkList: func() ([]netlink.Link, error) { + return nil, nil + }, + }, + netns: func(netnsID string, toRun func() error) error { + return toRun() + }, + exec: fakeExec, + log: logutils.Logger.Named("rdma-metrics-exporter"), + } + observer := noop.Observer{} + guidMapNetDeviceName := map[string]string{ + "b6:65:05:0c:9c:5c:f6:00": "ib1", + } + err := e.processNetNS(tt.netnsID, tt.ipPodMap, guidMapNetDeviceName, observer, tt.getObservable) + if (err != nil) != tt.expectError { + t.Errorf("Expected error: %v, but got: %v", tt.expectError, err) + } + }) + } +} + +func TestUpdateUnregisteredMetrics(t *testing.T) { + var commandScript []testexec.FakeCommandAction + + meter := noop.NewMeterProvider().Meter("test") + cli = fake.NewClientBuilder().Build() + + fakeExec := &testexec.FakeExec{ + CommandScript: commandScript, + } + 
+ e := &exporter{ + observableMap: make(map[string]metric.Int64ObservableCounter), + ch: make(chan struct{}, 10), + meter: meter, + netns: func(netnsID string, toRun func() error) error { + return toRun() + }, + exec: fakeExec, + log: logutils.Logger.Named("rdma-metrics-exporter"), + } + ctx, cancel := context.WithCancel(context.Background()) + + err := e.registerMetrics(e.meter) + if err != nil { + t.Fatal(err) + } + + go e.daemon(ctx) + + unRegistrationMetric := []string{"test_counter"} + e.updateUnregisteredMetrics(unRegistrationMetric) + + time.Sleep(time.Second * 3) + + cancel() +} + +func TestCallback(t *testing.T) { + commandScript := []testexec.FakeCommandAction{ + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("[{\"rxWriteRequests\": 100}]"), nil, nil + }, + } + return fakeCmd + }, + func(cmd string, args ...string) exec.Cmd { + fakeCmd := &testexec.FakeCmd{} + fakeCmd.CombinedOutputScript = []testexec.FakeAction{ + func() ([]byte, []byte, error) { + return []byte("src 192.168.1.1"), nil, nil + }, + } + return fakeCmd + }, + } + + meter := noop.NewMeterProvider().Meter("test") + cli = fake.NewClientBuilder().Build() + + fakeExec := &testexec.FakeExec{ + CommandScript: commandScript, + } + + e := &exporter{ + observableMap: make(map[string]metric.Int64ObservableCounter), + netlinkImpl: NetlinkImpl{ + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + return nil, nil + }, + LinkList: func() ([]netlink.Link, error) { + return nil, nil + }, + }, + ch: make(chan struct{}, 10), + meter: meter, + netns: func(netnsID string, toRun func() error) error { + return toRun() + }, + exec: fakeExec, + log: logutils.Logger.Named("rdma-metrics-exporter"), + } + err := e.registerMetrics(e.meter) + if err != nil { + t.Fatal(err) + } + + readDir = func(name string) ([]os.DirEntry, error) { + return nil, nil + } + + ctx2 := 
context.Background() + observer := noop.Observer{} + err = e.Callback(ctx2, observer) + if err != nil { + t.Fatal(err) + } +} + +func TestReverseMACAddress(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"00:1A:2B:3C:4D:5E", "5E:4D:3C:2B:1A:00"}, + {"12:34:56:78:9A:BC", "BC:9A:78:56:34:12"}, + {"AA:BB:CC:DD:EE:FF", "FF:EE:DD:CC:BB:AA"}, + {"01:02:03:04:05:06", "06:05:04:03:02:01"}, + {"", ""}, + {"A1:B2:C3", "C3:B2:A1"}, + } + + for _, tt := range tests { + result := reverseMACAddress(tt.input) + if result != tt.expected { + t.Errorf("reverseMACAddress(%q) = %q; want %q", tt.input, result, tt.expected) + } + } +} + +func TestGetIfnameNetDevMap(t *testing.T) { + linkList := []netlink.Link{ + &netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: ""}}, + &netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "mock-empty-addr", HardwareAddr: nil}}, + &netlink.Device{LinkAttrs: netlink.LinkAttrs{ + Name: "enp5s0f0v6", + HardwareAddr: func() net.HardwareAddr { + mac, _ := net.ParseMAC("ae:36:ac:d1:c9:1d") + return mac + }(), + }}, + &netlink.IPoIB{LinkAttrs: netlink.LinkAttrs{ + Name: "ibp13s0v7", + HardwareAddr: func() net.HardwareAddr { + mac, _ := net.ParseMAC("00:00:00:68:fe:80:00:00:00:00:00:00:08:f6:5c:9c:0c:05:65:b6") + return mac + }(), + }}, + } + + rdmaList := []*netlink.RdmaLink{ + {Attrs: netlink.RdmaLinkAttrs{ + Name: "mlx5_34", + NodeGuid: "1d:c9:d1:fe:ff:ac:36:ae", + }}, + {Attrs: netlink.RdmaLinkAttrs{ + Name: "mlx5_6", + NodeGuid: "b6:65:05:0c:9c:5c:f6:08", + }}, + } + + tests := []struct { + name string + nl NetlinkImpl + expErr bool + exp map[string]string + }{ + { + name: "call netlink.LinkList func get err", + nl: NetlinkImpl{ + LinkList: func() ([]netlink.Link, error) { + return nil, fmt.Errorf("mock err") + }, + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + return nil, fmt.Errorf("mock err") + }, + }, + expErr: true, + }, + { + name: "call netlink.RdmaLinkList func get list", + nl: NetlinkImpl{ + LinkList: func() 
([]netlink.Link, error) { + return linkList, nil + }, + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + return nil, fmt.Errorf("mock err") + }, + }, + expErr: true, + }, + { + name: "success", + nl: NetlinkImpl{ + LinkList: func() ([]netlink.Link, error) { + return linkList, nil + }, + RdmaLinkList: func() ([]*netlink.RdmaLink, error) { + return rdmaList, nil + }, + }, + expErr: false, + exp: map[string]string{ + "mlx5_34": "enp5s0f0v6", + "mlx5_6": "ibp13s0v7", + }, + }, + } + + for _, tt := range tests { + netDevMap, err := getIfnameNetDevMap(tt.nl) + if (err != nil) != tt.expErr { + t.Errorf("expected error: %v, but got: %v", tt.expErr, err) + } + got := make(map[string]string) + for s, device := range netDevMap { + got[s] = device.NetDevName + } + if tt.exp != nil { + eq := reflect.DeepEqual(tt.exp, got) + if !eq { + t.Errorf("expected: %v, but got: %v", tt.exp, got) + } + } + } +} diff --git a/pkg/subnetmanager/subnet_manager_suite_test.go b/pkg/subnetmanager/subnet_manager_suite_test.go index 4dd49616cf..050f3da3da 100644 --- a/pkg/subnetmanager/subnet_manager_suite_test.go +++ b/pkg/subnetmanager/subnet_manager_suite_test.go @@ -49,13 +49,8 @@ func TestSubnetManager(t *testing.T) { } var _ = BeforeSuite(func() { - _, err := metric.InitMetric(context.TODO(), constant.SpiderpoolController, false, false) - Expect(err).NotTo(HaveOccurred()) - err = metric.InitSpiderpoolControllerMetrics(context.TODO()) - Expect(err).NotTo(HaveOccurred()) - scheme = runtime.NewScheme() - err = spiderpoolv2beta1.AddToScheme(scheme) + err := spiderpoolv2beta1.AddToScheme(scheme) Expect(err).NotTo(HaveOccurred()) err = kruiseapi.AddToScheme(scheme) Expect(err).NotTo(HaveOccurred()) @@ -69,6 +64,11 @@ var _ = BeforeSuite(func() { WithStatusSubresource(&spiderpoolv2beta1.SpiderSubnet{}). 
Build() + _, err = metric.InitMetric(context.TODO(), constant.SpiderpoolController, false, false) + Expect(err).NotTo(HaveOccurred()) + err = metric.InitSpiderpoolControllerMetrics(context.TODO()) + Expect(err).NotTo(HaveOccurred()) + tracker = k8stesting.NewObjectTracker(scheme, k8sscheme.Codecs.UniversalDecoder()) fakeAPIReader = fake.NewClientBuilder(). WithScheme(scheme). diff --git a/test/doc/metric.md b/test/doc/metric.md index 151d6e6efe..af34323c29 100644 --- a/test/doc/metric.md +++ b/test/doc/metric.md @@ -1,5 +1,6 @@ # E2E Cases for metric -| Case ID | Title | Priority | Smoke | Status | Other | -| ------- | ------------------------------------------------------------ | -------- | ----- | ------ | ----- | -| K00001 | The metric should work fine. | p1 | true | done | | +| Case ID | Title | Priority | Smoke | Status | Other | +|---------|-----------------------------------|----------|-------|--------|-------| +| K00001 | The metric should work fine. | p1 | true | done | | +| K00002 | The rdma metric should work fine. | p1 | true | done | | diff --git a/vendor/k8s.io/utils/exec/testing/fake_exec.go b/vendor/k8s.io/utils/exec/testing/fake_exec.go new file mode 100644 index 0000000000..7c125a6b45 --- /dev/null +++ b/vendor/k8s.io/utils/exec/testing/fake_exec.go @@ -0,0 +1,277 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package testingexec + +import ( + "context" + "fmt" + "io" + "sync" + + "k8s.io/utils/exec" +) + +// FakeExec is a simple scripted Interface type. +type FakeExec struct { + CommandScript []FakeCommandAction + CommandCalls int + LookPathFunc func(string) (string, error) + // ExactOrder enforces that commands are called in the order they are scripted, + // and with the exact same arguments + ExactOrder bool + // DisableScripts removes the requirement that CommandScripts be populated + // before calling Command(). This makes Command() and subsequent calls to + // Run() or CombinedOutput() always return success and empty output. + DisableScripts bool + + mu sync.Mutex +} + +var _ exec.Interface = &FakeExec{} + +// FakeCommandAction is the function to be executed +type FakeCommandAction func(cmd string, args ...string) exec.Cmd + +// Command returns the next unexecuted command in CommandScripts. +// This function is safe for concurrent access as long as the underlying +// FakeExec struct is not modified during execution. +func (fake *FakeExec) Command(cmd string, args ...string) exec.Cmd { + if fake.DisableScripts { + fakeCmd := &FakeCmd{DisableScripts: true} + return InitFakeCmd(fakeCmd, cmd, args...) + } + fakeCmd := fake.nextCommand(cmd, args) + if fake.ExactOrder { + argv := append([]string{cmd}, args...) + fc := fakeCmd.(*FakeCmd) + if cmd != fc.Argv[0] { + panic(fmt.Sprintf("received command: %s, expected: %s", cmd, fc.Argv[0])) + } + if len(argv) != len(fc.Argv) { + panic(fmt.Sprintf("command (%s) received with extra/missing arguments. Expected %v, Received %v", cmd, fc.Argv, argv)) + } + for i, a := range argv[1:] { + if a != fc.Argv[i+1] { + panic(fmt.Sprintf("command (%s) called with unexpected argument. 
Expected %s, Received %s", cmd, fc.Argv[i+1], a)) + } + } + } + return fakeCmd +} + +func (fake *FakeExec) nextCommand(cmd string, args []string) exec.Cmd { + fake.mu.Lock() + defer fake.mu.Unlock() + + if fake.CommandCalls > len(fake.CommandScript)-1 { + panic(fmt.Sprintf("ran out of Command() actions. Could not handle command [%d]: %s args: %v", fake.CommandCalls, cmd, args)) + } + i := fake.CommandCalls + fake.CommandCalls++ + return fake.CommandScript[i](cmd, args...) +} + +// CommandContext wraps arguments into exec.Cmd +func (fake *FakeExec) CommandContext(ctx context.Context, cmd string, args ...string) exec.Cmd { + return fake.Command(cmd, args...) +} + +// LookPath is for finding the path of a file +func (fake *FakeExec) LookPath(file string) (string, error) { + return fake.LookPathFunc(file) +} + +// FakeCmd is a simple scripted Cmd type. +type FakeCmd struct { + Argv []string + CombinedOutputScript []FakeAction + CombinedOutputCalls int + CombinedOutputLog [][]string + OutputScript []FakeAction + OutputCalls int + OutputLog [][]string + RunScript []FakeAction + RunCalls int + RunLog [][]string + Dirs []string + Stdin io.Reader + Stdout io.Writer + Stderr io.Writer + Env []string + StdoutPipeResponse FakeStdIOPipeResponse + StderrPipeResponse FakeStdIOPipeResponse + WaitResponse error + StartResponse error + DisableScripts bool +} + +var _ exec.Cmd = &FakeCmd{} + +// InitFakeCmd is for creating a fake exec.Cmd +func InitFakeCmd(fake *FakeCmd, cmd string, args ...string) exec.Cmd { + fake.Argv = append([]string{cmd}, args...) 
+ return fake +} + +// FakeStdIOPipeResponse holds responses to use as fakes for the StdoutPipe and +// StderrPipe method calls +type FakeStdIOPipeResponse struct { + ReadCloser io.ReadCloser + Error error +} + +// FakeAction is a function type +type FakeAction func() ([]byte, []byte, error) + +// SetDir sets the directory +func (fake *FakeCmd) SetDir(dir string) { + fake.Dirs = append(fake.Dirs, dir) +} + +// SetStdin sets the stdin +func (fake *FakeCmd) SetStdin(in io.Reader) { + fake.Stdin = in +} + +// SetStdout sets the stdout +func (fake *FakeCmd) SetStdout(out io.Writer) { + fake.Stdout = out +} + +// SetStderr sets the stderr +func (fake *FakeCmd) SetStderr(out io.Writer) { + fake.Stderr = out +} + +// SetEnv sets the environment variables +func (fake *FakeCmd) SetEnv(env []string) { + fake.Env = env +} + +// StdoutPipe returns an injected ReadCloser & error (via StdoutPipeResponse) +// to be able to inject an output stream on Stdout +func (fake *FakeCmd) StdoutPipe() (io.ReadCloser, error) { + return fake.StdoutPipeResponse.ReadCloser, fake.StdoutPipeResponse.Error +} + +// StderrPipe returns an injected ReadCloser & error (via StderrPipeResponse) +// to be able to inject an output stream on Stderr +func (fake *FakeCmd) StderrPipe() (io.ReadCloser, error) { + return fake.StderrPipeResponse.ReadCloser, fake.StderrPipeResponse.Error +} + +// Start mimicks starting the process (in the background) and returns the +// injected StartResponse +func (fake *FakeCmd) Start() error { + return fake.StartResponse +} + +// Wait mimicks waiting for the process to exit returns the +// injected WaitResponse +func (fake *FakeCmd) Wait() error { + return fake.WaitResponse +} + +// Run runs the command +func (fake *FakeCmd) Run() error { + if fake.DisableScripts { + return nil + } + if fake.RunCalls > len(fake.RunScript)-1 { + panic("ran out of Run() actions") + } + if fake.RunLog == nil { + fake.RunLog = [][]string{} + } + i := fake.RunCalls + fake.RunLog = 
append(fake.RunLog, append([]string{}, fake.Argv...)) + fake.RunCalls++ + stdout, stderr, err := fake.RunScript[i]() + if stdout != nil { + fake.Stdout.Write(stdout) + } + if stderr != nil { + fake.Stderr.Write(stderr) + } + return err +} + +// CombinedOutput returns the output from the command +func (fake *FakeCmd) CombinedOutput() ([]byte, error) { + if fake.DisableScripts { + return []byte{}, nil + } + if fake.CombinedOutputCalls > len(fake.CombinedOutputScript)-1 { + panic("ran out of CombinedOutput() actions") + } + if fake.CombinedOutputLog == nil { + fake.CombinedOutputLog = [][]string{} + } + i := fake.CombinedOutputCalls + fake.CombinedOutputLog = append(fake.CombinedOutputLog, append([]string{}, fake.Argv...)) + fake.CombinedOutputCalls++ + stdout, _, err := fake.CombinedOutputScript[i]() + return stdout, err +} + +// Output is the response from the command +func (fake *FakeCmd) Output() ([]byte, error) { + if fake.DisableScripts { + return []byte{}, nil + } + if fake.OutputCalls > len(fake.OutputScript)-1 { + panic("ran out of Output() actions") + } + if fake.OutputLog == nil { + fake.OutputLog = [][]string{} + } + i := fake.OutputCalls + fake.OutputLog = append(fake.OutputLog, append([]string{}, fake.Argv...)) + fake.OutputCalls++ + stdout, _, err := fake.OutputScript[i]() + return stdout, err +} + +// Stop is to stop the process +func (fake *FakeCmd) Stop() { + // no-op +} + +// FakeExitError is a simple fake ExitError type. 
+type FakeExitError struct { + Status int +} + +var _ exec.ExitError = FakeExitError{} + +func (fake FakeExitError) String() string { + return fmt.Sprintf("exit %d", fake.Status) +} + +func (fake FakeExitError) Error() string { + return fake.String() +} + +// Exited always returns true +func (fake FakeExitError) Exited() bool { + return true +} + +// ExitStatus returns the fake status +func (fake FakeExitError) ExitStatus() int { + return fake.Status +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 0c2ed04a67..9c3e4a25cf 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1354,6 +1354,7 @@ k8s.io/utils/buffer k8s.io/utils/clock k8s.io/utils/clock/testing k8s.io/utils/exec +k8s.io/utils/exec/testing k8s.io/utils/integer k8s.io/utils/internal/third_party/forked/golang/net k8s.io/utils/net