Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose NVMe Volume Metrics #2216

Merged
merged 3 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ e2e/single-az: bin/helm bin/ginkgo
TEST_PATH=./tests/e2e/... \
GINKGO_FOCUS="\[ebs-csi-e2e\] \[single-az\]" \
GINKGO_PARALLEL=5 \
HELM_EXTRA_FLAGS="--set=controller.volumeModificationFeature.enabled=true,sidecars.provisioner.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true',sidecars.resizer.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true'" \
HELM_EXTRA_FLAGS="--set=controller.volumeModificationFeature.enabled=true,sidecars.provisioner.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true',sidecars.resizer.additionalArgs[0]='--feature-gates=VolumeAttributesClass=true',node.enableMetrics=true" \
./hack/e2e/run.sh

.PHONY: e2e/multi-az
Expand Down
6 changes: 6 additions & 0 deletions charts/aws-ebs-csi-driver/templates/_node.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ spec:
{{- with .Values.node.reservedVolumeAttachments }}
- --reserved-volume-attachments={{ . }}
{{- end }}
{{- if .Values.node.enableMetrics }}
- --http-endpoint=0.0.0.0:3302
{{- end}}
{{- with .Values.node.kubeletPath }}
- --csi-mount-point-prefix={{ . }}
{{- end}}
{{- with .Values.node.volumeAttachLimit }}
- --volume-attach-limit={{ . }}
{{- end }}
Expand Down
18 changes: 18 additions & 0 deletions charts/aws-ebs-csi-driver/templates/metrics.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,21 @@ spec:
interval: {{ .Values.controller.serviceMonitor.interval | default "15s"}}
{{- end }}
{{- end }}
---
{{- if .Values.node.enableMetrics -}}
apiVersion: v1
kind: Service
metadata:
name: ebs-csi-node
namespace: {{ .Release.Namespace }}
labels:
app: ebs-csi-node
spec:
selector:
app: ebs-csi-node
ports:
- name: metrics
port: 3302
targetPort: 3302
type: ClusterIP
{{- end }}
14 changes: 9 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,6 @@ func main() {
}()
}

if options.HTTPEndpoint != "" {
r := metrics.InitializeRecorder()
r.InitializeMetricsHandler(options.HTTPEndpoint, "/metrics", options.MetricsCertFile, options.MetricsKeyFile)
}

cfg := metadata.MetadataServiceConfig{
EC2MetadataClient: metadata.DefaultEC2MetadataClient,
K8sAPIClient: metadata.DefaultKubernetesAPIClient(options.Kubeconfig),
Expand All @@ -159,6 +154,15 @@ func main() {
md, metadataErr = metadata.NewMetadataService(cfg, region)
}

if options.HTTPEndpoint != "" {
r := metrics.InitializeRecorder()
r.InitializeMetricsHandler(options.HTTPEndpoint, "/metrics", options.MetricsCertFile, options.MetricsKeyFile)

if options.Mode == driver.NodeMode || options.Mode == driver.AllMode {
metrics.InitializeNVME(r, options.CsiMountPointPath, md.GetInstanceID())
}
}

if metadataErr != nil {
klog.ErrorS(metadataErr, "Failed to initialize metadata when it is required")
if options.Mode == driver.ControllerMode {
Expand Down
1 change: 1 addition & 0 deletions deploy/kubernetes/base/node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ spec:
args:
- node
- --endpoint=$(CSI_ENDPOINT)
- --csi-mount-point-prefix=/var/lib/kubelet
- --logging-format=text
- --v=2
env:
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ require (
github.com/kubernetes-csi/external-snapshotter/client/v4 v4.2.0
github.com/onsi/ginkgo/v2 v2.21.0
github.com/onsi/gomega v1.35.0
github.com/prometheus/client_golang v1.20.5
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.9.0
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.56.0
Expand Down Expand Up @@ -99,7 +100,6 @@ require (
github.com/opencontainers/selinux v1.11.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.20.5 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.60.1 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
Expand Down
3 changes: 3 additions & 0 deletions pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,9 @@ func (d *NodeService) nodePublishVolumeForBlock(req *csi.NodePublishVolumeReques
}

// Create the mount point as a file, since bind-mounting a device node requires the target to be a file.
// The NVMECollector relies on this implementation detail: it discovers block devices
// by parsing /proc/self/mountinfo, and the bind mount created here ensures that block
// devices appear in mountinfo even when they carry no filesystem.
klog.V(4).InfoS("NodePublishVolume [block]: making target file", "target", target)
if err = d.mounter.MakeFile(target); err != nil {
if removeErr := os.Remove(target); removeErr != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/driver/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ type Options struct {
WindowsHostProcess bool
// LegacyXFSProgs formats XFS volumes with `bigtime=0,inobtcount=0,reflink=0`, so that they can be mounted onto nodes with linux kernel ≤ v5.4. Volumes formatted with this option may experience issues after 2038, and will be unable to use some XFS features (for example, reflinks).
LegacyXFSProgs bool
// CsiMountPointPath is the path prefix under which CSI volumes are expected to be mounted on the node (set via --csi-mount-point-prefix).
CsiMountPointPath string
}

func (o *Options) AddFlags(f *flag.FlagSet) {
Expand Down Expand Up @@ -118,6 +120,7 @@ func (o *Options) AddFlags(f *flag.FlagSet) {
f.IntVar(&o.ReservedVolumeAttachments, "reserved-volume-attachments", -1, "Number of volume attachments reserved for system use. Not used when --volume-attach-limit is specified. The total amount of volume attachments for a node is computed as: <nr. of attachments for corresponding instance type> - <number of NICs, if relevant to the instance type> - <reserved-volume-attachments value>. When -1, the amount of reserved attachments is loaded from instance metadata that captured state at node boot and may include not only system disks but also CSI volumes.")
f.BoolVar(&o.WindowsHostProcess, "windows-host-process", false, "ALPHA: Indicates whether the driver is running in a Windows privileged container")
f.BoolVar(&o.LegacyXFSProgs, "legacy-xfs", false, "Warning: This option will be removed in a future version of EBS CSI Driver. Formats XFS volumes with `bigtime=0,inobtcount=0,reflink=0`, so that they can be mounted onto nodes with linux kernel ≤ v5.4. Volumes formatted with this option may experience issues after 2038, and will be unable to use some XFS features (for example, reflinks).")
f.StringVar(&o.CsiMountPointPath, "csi-mount-point-prefix", "", "A prefix of the mountpoints of all CSI-managed volumes. If this value is non-empty, all volumes mounted to a path beginning with the provided value are assumed to be CSI volumes owned by the EBS CSI Driver and safe to treat as such (for example, by exposing volume metrics).")
AndrewSirenko marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/driver/options_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ func TestAddFlags(t *testing.T) {
t.Errorf("error setting legacy-xfs: %v", err)
}

if err := f.Set("csi-mount-point-prefix", "/var/lib/kubelet"); err != nil {
t.Errorf("error setting csi-mount-point-prefix: %v", err)
}

if o.Endpoint != "custom-endpoint" {
t.Errorf("unexpected Endpoint: got %s, want custom-endpoint", o.Endpoint)
}
Expand Down
58 changes: 26 additions & 32 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ import (
"sync"
"time"

"k8s.io/component-base/metrics"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
)

Expand All @@ -29,7 +30,7 @@ var (
)

type metricRecorder struct {
registry metrics.KubeRegistry
registry *prometheus.Registry
metrics map[string]interface{}
}

Expand All @@ -43,13 +44,18 @@ func Recorder() *metricRecorder {
func InitializeRecorder() *metricRecorder {
once.Do(func() {
r = &metricRecorder{
registry: metrics.NewKubeRegistry(),
registry: prometheus.NewRegistry(),
metrics: make(map[string]interface{}),
}
})
return r
}

// InitializeNVME registers the NVMe collector for gathering metrics from NVMe devices.
//
// It is a thin exported wrapper: all three arguments are forwarded unchanged to
// registerNVMECollector, and nothing is returned.
//
// r is the metric recorder the collector is registered against.
// NOTE(review): csiMountPointPath is presumably the value of the
// --csi-mount-point-prefix flag and instanceID the instance ID reported by the
// metadata service; confirm against the caller.
func InitializeNVME(r *metricRecorder, csiMountPointPath, instanceID string) {
registerNVMECollector(r, csiMountPointPath, instanceID)
}

// IncreaseCount increases the counter metric by 1.
func (m *metricRecorder) IncreaseCount(name string, labels map[string]string) {
if m == nil {
Expand All @@ -65,7 +71,7 @@ func (m *metricRecorder) IncreaseCount(name string, labels map[string]string) {
return
}

metricAsCounterVec, ok := metric.(*metrics.CounterVec)
metricAsCounterVec, ok := metric.(*prometheus.CounterVec)
if ok {
metricAsCounterVec.With(labels).Inc()
} else {
Expand All @@ -87,7 +93,7 @@ func (m *metricRecorder) ObserveHistogram(name string, value float64, labels map
return
}

metricAsHistogramVec, ok := metric.(*metrics.HistogramVec)
metricAsHistogramVec, ok := metric.(*prometheus.HistogramVec)
if ok {
metricAsHistogramVec.With(labels).Observe(value)
} else {
Expand All @@ -103,11 +109,7 @@ func (m *metricRecorder) InitializeMetricsHandler(address, path, certFile, keyFi
}

mux := http.NewServeMux()
mux.Handle(path, metrics.HandlerFor(
m.registry,
metrics.HandlerOpts{
ErrorHandling: metrics.ContinueOnError,
}))
mux.Handle(path, promhttp.HandlerFor(m.registry, promhttp.HandlerOpts{ErrorHandling: promhttp.ContinueOnError}))

server := &http.Server{
Addr: address,
Expand Down Expand Up @@ -136,7 +138,14 @@ func (m *metricRecorder) registerHistogramVec(name, help string, labels []string
if _, exists := m.metrics[name]; exists {
return
}
histogram := createHistogramVec(name, help, labels, buckets)
histogram := prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: name,
Help: help,
Buckets: buckets,
},
labels,
)
m.metrics[name] = histogram
m.registry.MustRegister(histogram)
}
Expand All @@ -145,30 +154,15 @@ func (m *metricRecorder) registerCounterVec(name, help string, labels []string)
if _, exists := m.metrics[name]; exists {
return
}
counter := createCounterVec(name, help, labels)
m.metrics[name] = counter
m.registry.MustRegister(counter)
}

func createHistogramVec(name, help string, labels []string, buckets []float64) *metrics.HistogramVec {
opts := &metrics.HistogramOpts{
Name: name,
Help: help,
StabilityLevel: metrics.ALPHA,
Buckets: buckets,
}
return metrics.NewHistogramVec(opts, labels)
}

func createCounterVec(name, help string, labels []string) *metrics.CounterVec {
return metrics.NewCounterVec(
&metrics.CounterOpts{
Name: name,
Help: help,
StabilityLevel: metrics.ALPHA,
counter := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: name,
Help: help,
},
labels,
)
m.metrics[name] = counter
m.registry.MustRegister(counter)
}

func getLabelNames(labels map[string]string) []string {
Expand Down
6 changes: 3 additions & 3 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func TestMetricRecorder(t *testing.T) {
m.IncreaseCount("test_total", map[string]string{"key": "value"})
},
expected: `
# HELP test_total [ALPHA] ebs_csi_aws_com metric
# HELP test_total ebs_csi_aws_com metric
# TYPE test_total counter
test_total{key="value"} 1
`,
Expand All @@ -46,7 +46,7 @@ test_total{key="value"} 1
m.ObserveHistogram("test", 1.5, map[string]string{"key": "value"}, []float64{1, 2, 3})
},
expected: `
# HELP test [ALPHA] ebs_csi_aws_com metric
# HELP test ebs_csi_aws_com metric
# TYPE test histogram
test{key="value",le="1"} 0
test{key="value",le="2"} 1
Expand All @@ -66,7 +66,7 @@ test_count{key="value"} 1
m.IncreaseCount("test_re_register_total", map[string]string{"key": "value2"})
},
expected: `
# HELP test_re_register_total [ALPHA] ebs_csi_aws_com metric
# HELP test_re_register_total ebs_csi_aws_com metric
# TYPE test_re_register_total counter
test_re_register_total{key="value1"} 2
test_re_register_total{key="value2"} 1
Expand Down
Loading