From 50fafe4a3c68e21b9b77f48d3e39bdaec1d09a0c Mon Sep 17 00:00:00 2001 From: gshaibi Date: Wed, 27 Nov 2024 12:54:00 +0200 Subject: [PATCH] Enrich with prometheus labels --- .../templates/status-exporter/_helpers.tpl | 5 ++ .../status-exporter/servicemonitor.yaml | 16 ++++ deploy/fake-gpu-operator/values.yaml | 12 +++ .../gpu-operator/metrics/idle-gpu.yaml | 76 +++++++++++++++++++ internal/common/constants/constants.go | 14 ++-- .../export/metrics/exporter.go | 29 ++++++- .../status-exporter/export/metrics/metrics.go | 6 +- .../handlers/node/fake_node_deployments.go | 24 +++--- 8 files changed, 163 insertions(+), 19 deletions(-) create mode 100644 deploy/fake-gpu-operator/templates/status-exporter/servicemonitor.yaml create mode 100644 design/samples/gpu-operator/metrics/idle-gpu.yaml diff --git a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl index 28a779d..f9f7fd3 100644 --- a/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl +++ b/deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl @@ -39,6 +39,8 @@ containers: value: "{{ .Release.Namespace }}" - name: TOPOLOGY_MAX_EXPORT_INTERVAL value: "{{ .Values.statusExporter.topologyMaxExportInterval }}" + - name: EXPORT_PROMETHEUS_LABEL_ENRICHMENTS + value: "{{ .Values.statusExporter.config.exportPrometheusLabelEnrichments }}" ports: - containerPort: 9400 name: http @@ -53,6 +55,9 @@ tolerations: - effect: NoSchedule key: nvidia.com/gpu operator: Exists + {{- if .Values.kwok.tolerations }} + {{ .Values.kwok.tolerations | toYaml | nindent 2 }} + {{- end }} imagePullSecrets: - name: gcr-secret volumes: diff --git a/deploy/fake-gpu-operator/templates/status-exporter/servicemonitor.yaml b/deploy/fake-gpu-operator/templates/status-exporter/servicemonitor.yaml new file mode 100644 index 0000000..c8dd8ca --- /dev/null +++ b/deploy/fake-gpu-operator/templates/status-exporter/servicemonitor.yaml @@ -0,0 +1,16 @@ +{{- if .Values.statusExporter.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: nvidia-dcgm-exporter + labels: + release: {{ .Release.Name }} +spec: + selector: + matchLabels: + app: nvidia-dcgm-exporter + endpoints: + - port: gpu-metrics + interval: 30s + honorLabels: true +{{- end }} diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml index 4d8e957..ad33d96 100644 --- a/deploy/fake-gpu-operator/values.yaml +++ b/deploy/fake-gpu-operator/values.yaml @@ -53,6 +53,10 @@ statusExporter: cpu: "200m" memory: "200Mi" topologyMaxExportInterval: 10s + serviceMonitor: + enabled: false + config: + exportPrometheusLabelEnrichments: false kwokGpuDevicePlugin: image: @@ -86,3 +90,11 @@ topology: gpuMemory: 11441 nodePoolLabelKey: run.ai/simulated-gpu-node-pool migStrategy: mixed + +# GuyTodo: Test each +kwok: + tolerations: + - key: kwok.x-k8s.io/node + operator: Equal + value: fake + effect: NoSchedule diff --git a/design/samples/gpu-operator/metrics/idle-gpu.yaml b/design/samples/gpu-operator/metrics/idle-gpu.yaml new file mode 100644 index 0000000..ae6a74c --- /dev/null +++ b/design/samples/gpu-operator/metrics/idle-gpu.yaml @@ -0,0 +1,76 @@ +################ Idle GPU ################ + +###### From DCGM Exporter Directly ###### + +{ +# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz). 
+# TYPE DCGM_FI_DEV_SM_CLOCK gauge +DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 300 +# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz). +# TYPE DCGM_FI_DEV_MEM_CLOCK gauge +DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 405 +# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C). +# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge +DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C). +# TYPE DCGM_FI_DEV_GPU_TEMP gauge +DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 23 +# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W). +# TYPE DCGM_FI_DEV_POWER_USAGE gauge +DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 14.573000 +# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ). +# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 1232417 +# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries. +# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter +DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %). +# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %). +# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %). +# TYPE DCGM_FI_DEV_ENC_UTIL gauge +DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %). +# TYPE DCGM_FI_DEV_DEC_UTIL gauge +DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered. 
+# TYPE DCGM_FI_DEV_XID_ERRORS gauge +DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14",err_code="0",err_msg="No Error"} 0 +# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB). +# TYPE DCGM_FI_DEV_FB_FREE gauge +DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 14914 +# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB). +# TYPE DCGM_FI_DEV_FB_USED gauge +DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 2 +# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes. +# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status +# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge +DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0 +# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active. +# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge +DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000000 +# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active. +# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge +DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000000 +# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data. +# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge +DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000002 +# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge +DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 340143 +# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. 
+# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge +DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 340655 +} + +###### From Prometheus ###### + +{ +DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-123", UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", gpu="0", instance="10.244.2.16:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-zlj4t", service="nvidia-dcgm-exporter"} +} + diff --git a/internal/common/constants/constants.go b/internal/common/constants/constants.go index beed436..3a35b82 100644 --- a/internal/common/constants/constants.go +++ b/internal/common/constants/constants.go @@ -19,10 +19,12 @@ const ( GpuResourceName = "nvidia.com/gpu" - EnvFakeNode = "FAKE_NODE" - EnvNodeName = "NODE_NAME" - EnvTopologyCmName = "TOPOLOGY_CM_NAME" - EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE" - EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE" - EnvImpersonateIP = "IMPERSONATE_IP" + EnvFakeNode = "FAKE_NODE" + EnvNodeName = "NODE_NAME" + EnvTopologyCmName = "TOPOLOGY_CM_NAME" + EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE" + EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE" + EnvImpersonatePodName = "IMPERSONATE_POD_NAME" + EnvImpersonatePodIP = "IMPERSONATE_IP" + EnvExportPrometheusLabelEnrichments = "EXPORT_PROMETHEUS_LABEL_ENRICHMENTS" ) diff --git a/internal/status-exporter/export/metrics/exporter.go b/internal/status-exporter/export/metrics/exporter.go index a6328e3..c5a48a8 100644 --- a/internal/status-exporter/export/metrics/exporter.go +++ b/internal/status-exporter/export/metrics/exporter.go @@ -17,6 +17,11 @@ import ( "github.com/spf13/viper" ) +const ( + exporterPort = 9400 + exporterContainerName = "nvidia-dcgm-exporter" +) + type MetricsExporter struct { topologyChan <-chan *topology.NodeTopology } @@ -63,6 +68,7 @@ func (e *MetricsExporter) Run(stopCh <-chan struct{}) { func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error { nodeName := viper.GetString(constants.EnvNodeName) + shouldExportPrometheusLabelEnrichments := viper.GetBool(constants.EnvExportPrometheusLabelEnrichments) gpuUtilization.Reset() gpuFbUsed.Reset() @@ -81,6 +87,11 @@ func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error { "container": gpu.Status.AllocatedBy.Container, } + if shouldExportPrometheusLabelEnrichments { + // GuyTodo: Test this + labels = e.enrichWithPrometheusLabels(labels) + } + utilization := gpu.Status.PodGpuUsageStatus.Utilization() fbUsed := gpu.Status.PodGpuUsageStatus.FbUsed(nodeTopology.GpuMemory) @@ -94,7 +105,7 @@ func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error { func setupServer() { http.Handle("/metrics", promhttp.Handler()) - err := http.ListenAndServe(":9400", nil) + err := http.ListenAndServe(fmt.Sprintf(":%d", exporterPort), nil) if err != nil { log.Fatal(err) } @@ -107,3 +118,19 @@ func generateFakeHostname(nodeName string) string { nodeHostname := fmt.Sprintf("%s-%x", "nvidia-dcgm-exporter", nodeNameSHA1[:3]) return nodeHostname } + +func (e *MetricsExporter) enrichWithPrometheusLabels(labels prometheus.Labels) prometheus.Labels { + // If the labels are already set, move the existing values to "exported_" prefixed labels. 
+ for _, label := range []string{"container", "namespace", "pod", "instance"} { + if val, ok := labels[label]; ok { + labels["exported_"+label] = val + } + } + + labels["namespace"] = viper.GetString(constants.EnvFakeGpuOperatorNs) + labels["pod"] = viper.GetString(constants.EnvImpersonatePodName) + labels["container"] = exporterContainerName + labels["instance"] = fmt.Sprintf("%s:%d", viper.GetString(constants.EnvImpersonatePodIP), exporterPort) + + return labels +} diff --git a/internal/status-exporter/export/metrics/metrics.go b/internal/status-exporter/export/metrics/metrics.go index cedf697..b59981a 100644 --- a/internal/status-exporter/export/metrics/metrics.go +++ b/internal/status-exporter/export/metrics/metrics.go @@ -9,13 +9,13 @@ var ( gpuUtilization = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "DCGM_FI_DEV_GPU_UTIL", Help: "GPU Utilization", - }, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"}) + }, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"}) gpuFbUsed = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "DCGM_FI_DEV_FB_USED", Help: "GPU Framebuffer Used", - }, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"}) + }, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"}) gpuFbFree = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "DCGM_FI_DEV_FB_FREE", Help: "GPU Framebuffer Free", - }, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"}) + }, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"}) ) diff --git a/internal/status-updater/handlers/node/fake_node_deployments.go b/internal/status-updater/handlers/node/fake_node_deployments.go index a4cc245..22b127d 100644 --- a/internal/status-updater/handlers/node/fake_node_deployments.go +++ b/internal/status-updater/handlers/node/fake_node_deployments.go @@ -119,8 +119,11 @@ func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.De Name: constants.EnvFakeNode, Value: "true", }, v1.EnvVar{ - Name: constants.EnvImpersonateIP, + Name: constants.EnvImpersonatePodIP, Value: dummyDcgmExporterPod.Status.PodIP, + }, v1.EnvVar{ + Name: constants.EnvImpersonatePodName, + Value: dummyDcgmExporterPod.Name, }) deployment.Spec.Template.Spec.Containers[0].Resources.Limits = v1.ResourceList{ @@ -135,6 +138,7 @@ func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.De return deployment, nil } +// Waits for the dummy dcgm exporter pod to be ready and returns it func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error) { clientset := p.kubeClient // Assuming p.kubeClient is of type kubernetes.Interface @@ -143,7 +147,7 @@ func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error) fieldSelector := fields.OneTermEqualSelector("spec.nodeName", nodeName).String() // Create a context with timeout - ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() // Create a watch for pods with the specified label and field selectors @@ -156,19 +160,21 @@ func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error) } defer watcher.Stop() - // Wait for the pod to be created + // Wait for the pod to be ready for { select { case event := <-watcher.ResultChan(): - if 
event.Type == watch.Added { - pod, ok := event.Object.(*v1.Pod) - if !ok { - return nil, fmt.Errorf("unexpected type") + pod, ok := event.Object.(*v1.Pod) + if !ok { + return nil, fmt.Errorf("unexpected object type: %T", event.Object) + } + if event.Type == watch.Added || event.Type == watch.Modified { + if pod.Status.Phase == v1.PodRunning { + return pod, nil } - return pod, nil } case <-ctx.Done(): - return nil, fmt.Errorf("timeout waiting for pod to be created") + return nil, fmt.Errorf("timeout waiting for pod to be ready") } } }
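
Usage note: a minimal sketch of how the new options could be switched on through a Helm values override. The override file itself is hypothetical, but every key below is taken verbatim from the values.yaml hunk in this patch. With both flags set, the chart renders the nvidia-dcgm-exporter ServiceMonitor and the status-exporter rewrites the namespace, pod, container and instance labels to point at the impersonated dcgm-exporter pod, moving any pre-existing values to exported_-prefixed labels (compare the "From Prometheus" sample in idle-gpu.yaml).

    # values-override.yaml (hypothetical example; keys taken from this patch)
    statusExporter:
      serviceMonitor:
        enabled: true                            # renders templates/status-exporter/servicemonitor.yaml
      config:
        exportPrometheusLabelEnrichments: true   # passed to the exporter as EXPORT_PROMETHEUS_LABEL_ENRICHMENTS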