Skip to content

Commit

Permalink
Enrich with prometheus labels
Browse files Browse the repository at this point in the history
  • Loading branch information
gshaibi committed Nov 27, 2024
1 parent 45ef11e commit 50fafe4
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ containers:
value: "{{ .Release.Namespace }}"
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
- name: EXPORT_PROMETHEUS_LABEL_ENRICHMENTS
value: "{{ .Values.statusExporter.config.exportPrometheusLabelEnrichments }}"
ports:
- containerPort: 9400
name: http
Expand All @@ -53,6 +55,9 @@ tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
{{- if .Values.kwok.tolerations }}
{{ .Values.kwok.tolerations | toYaml | nindent 2 }}
{{- end }}
imagePullSecrets:
- name: gcr-secret
volumes:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{{- if .Values.statusExporter.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: nvidia-dcgm-exporter
labels:
release: {{ .Release.Name }}
spec:
selector:
matchLabels:
app: nvidia-dcgm-exporter
endpoints:
- port: gpu-metrics
interval: 30s
honorLabels: true
{{- end }}
12 changes: 12 additions & 0 deletions deploy/fake-gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ statusExporter:
cpu: "200m"
memory: "200Mi"
topologyMaxExportInterval: 10s
serviceMonitor:
enabled: false
config:
exportPrometheusLabelEnrichments: false

kwokGpuDevicePlugin:
image:
Expand Down Expand Up @@ -86,3 +90,11 @@ topology:
gpuMemory: 11441
nodePoolLabelKey: run.ai/simulated-gpu-node-pool
migStrategy: mixed

# TODO(gshaibi): verify each of these tolerations is honored on kwok fake nodes
kwok:
tolerations:
- key: kwok.x-k8s.io/node
operator: Equal
value: fake
effect: NoSchedule
76 changes: 76 additions & 0 deletions design/samples/gpu-operator/metrics/idle-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
################ Idle GPU ################

###### From DCGM Exporter Directly ######

{
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 300
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 405
# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C).
# TYPE DCGM_FI_DEV_GPU_TEMP gauge
DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 23
# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W).
# TYPE DCGM_FI_DEV_POWER_USAGE gauge
DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 14.573000
# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ).
# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 1232417
# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries.
# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter
DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %).
# TYPE DCGM_FI_DEV_GPU_UTIL gauge
DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %).
# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge
DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %).
# TYPE DCGM_FI_DEV_ENC_UTIL gauge
DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %).
# TYPE DCGM_FI_DEV_DEC_UTIL gauge
DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered.
# TYPE DCGM_FI_DEV_XID_ERRORS gauge
DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14",err_code="0",err_msg="No Error"} 0
# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB).
# TYPE DCGM_FI_DEV_FB_FREE gauge
DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 14914
# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB).
# TYPE DCGM_FI_DEV_FB_USED gauge
DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 2
# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes.
# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status
# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge
DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active.
# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge
DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000000
# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active.
# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000000
# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data.
# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge
DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000002
# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge
DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 340143
# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge
DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 340655
}

###### From Prometheus ######

{
DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-123", UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", gpu="0", instance="10.244.2.16:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-zlj4t", service="nvidia-dcgm-exporter"}
}

14 changes: 8 additions & 6 deletions internal/common/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ const (

GpuResourceName = "nvidia.com/gpu"

EnvFakeNode = "FAKE_NODE"
EnvNodeName = "NODE_NAME"
EnvTopologyCmName = "TOPOLOGY_CM_NAME"
EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE"
EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE"
EnvImpersonateIP = "IMPERSONATE_IP"
EnvFakeNode = "FAKE_NODE"
EnvNodeName = "NODE_NAME"
EnvTopologyCmName = "TOPOLOGY_CM_NAME"
EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE"
EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE"
EnvImpersonatePodName = "IMPERSONATE_POD_NAME"
EnvImpersonatePodIP = "IMPERSONATE_IP"
EnvExportPrometheusLabelEnrichments = "EXPORT_PROMETHEUS_LABEL_ENRICHMENTS"
)
29 changes: 28 additions & 1 deletion internal/status-exporter/export/metrics/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ import (
"github.com/spf13/viper"
)

const (
	// exporterPort is the TCP port the fake exporter's /metrics HTTP
	// server listens on (see setupServer); matches the containerPort
	// exposed by the deployment template.
	exporterPort = 9400
	// exporterContainerName is the value written into the "container"
	// label when enriching metrics, impersonating the real DCGM
	// exporter container.
	exporterContainerName = "nvidia-dcgm-exporter"
)

// MetricsExporter receives node topology snapshots over topologyChan and
// exports per-GPU Prometheus metrics (utilization, framebuffer usage) that
// mimic the real DCGM exporter's output.
type MetricsExporter struct {
	// topologyChan delivers updated NodeTopology objects; each received
	// snapshot triggers a re-export of the GPU metric series.
	topologyChan <-chan *topology.NodeTopology
}
Expand Down Expand Up @@ -63,6 +68,7 @@ func (e *MetricsExporter) Run(stopCh <-chan struct{}) {

func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error {
nodeName := viper.GetString(constants.EnvNodeName)
shouldExportPrometheusLabelEnrichments := viper.GetBool(constants.EnvExportPrometheusLabelEnrichments)

gpuUtilization.Reset()
gpuFbUsed.Reset()
Expand All @@ -81,6 +87,11 @@ func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error {
"container": gpu.Status.AllocatedBy.Container,
}

if shouldExportPrometheusLabelEnrichments {
// TODO(gshaibi): add test coverage for the label-enrichment path
labels = e.enrichWithPrometheusLabels(labels)
}

utilization := gpu.Status.PodGpuUsageStatus.Utilization()
fbUsed := gpu.Status.PodGpuUsageStatus.FbUsed(nodeTopology.GpuMemory)

Expand All @@ -94,7 +105,7 @@ func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error {

func setupServer() {
http.Handle("/metrics", promhttp.Handler())
err := http.ListenAndServe(":9400", nil)
err := http.ListenAndServe(fmt.Sprintf(":%d", exporterPort), nil)
if err != nil {
log.Fatal(err)
}
Expand All @@ -107,3 +118,19 @@ func generateFakeHostname(nodeName string) string {
nodeHostname := fmt.Sprintf("%s-%x", "nvidia-dcgm-exporter", nodeNameSHA1[:3])
return nodeHostname
}

// enrichWithPrometheusLabels rewrites the scrape-owned labels (container,
// namespace, pod, instance) so the series looks as if Prometheus itself had
// scraped the impersonated DCGM exporter pod. Any value already present for
// one of those labels is preserved under an "exported_" prefix — mirroring
// what Prometheus does on a label collision. The map is mutated in place and
// also returned for convenience.
func (e *MetricsExporter) enrichWithPrometheusLabels(labels prometheus.Labels) prometheus.Labels {
	scrapeOwned := []string{"container", "namespace", "pod", "instance"}

	// Preserve caller-provided values before overwriting them below.
	for _, name := range scrapeOwned {
		if existing, present := labels[name]; present {
			labels["exported_"+name] = existing
		}
	}

	// Overwrite with the identity of the impersonated exporter pod, taken
	// from the environment (set by the status-updater on deployment).
	labels["container"] = exporterContainerName
	labels["namespace"] = viper.GetString(constants.EnvFakeGpuOperatorNs)
	labels["pod"] = viper.GetString(constants.EnvImpersonatePodName)
	labels["instance"] = fmt.Sprintf("%s:%d", viper.GetString(constants.EnvImpersonatePodIP), exporterPort)

	return labels
}
6 changes: 3 additions & 3 deletions internal/status-exporter/export/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ var (
gpuUtilization = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "DCGM_FI_DEV_GPU_UTIL",
Help: "GPU Utilization",
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"})
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"})
gpuFbUsed = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "DCGM_FI_DEV_FB_USED",
Help: "GPU Framebuffer Used",
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"})
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"})
gpuFbFree = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "DCGM_FI_DEV_FB_FREE",
Help: "GPU Framebuffer Free",
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"})
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"})
)
24 changes: 15 additions & 9 deletions internal/status-updater/handlers/node/fake_node_deployments.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,11 @@ func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.De
Name: constants.EnvFakeNode,
Value: "true",
}, v1.EnvVar{
Name: constants.EnvImpersonateIP,
Name: constants.EnvImpersonatePodIP,
Value: dummyDcgmExporterPod.Status.PodIP,
}, v1.EnvVar{
Name: constants.EnvImpersonatePodName,
Value: dummyDcgmExporterPod.Name,
})

deployment.Spec.Template.Spec.Containers[0].Resources.Limits = v1.ResourceList{
Expand All @@ -135,6 +138,7 @@ func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.De
return deployment, nil
}

// Waits for the dummy dcgm exporter pod to be ready and returns it
func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error) {
clientset := p.kubeClient // Assuming p.kubeClient is of type kubernetes.Interface

Expand All @@ -143,7 +147,7 @@ func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error)
fieldSelector := fields.OneTermEqualSelector("spec.nodeName", nodeName).String()

// Create a context with timeout
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()

// Create a watch for pods with the specified label and field selectors
Expand All @@ -156,19 +160,21 @@ func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error)
}
defer watcher.Stop()

// Wait for the pod to be created
// Wait for the pod to be ready
for {
select {
case event := <-watcher.ResultChan():
if event.Type == watch.Added {
pod, ok := event.Object.(*v1.Pod)
if !ok {
return nil, fmt.Errorf("unexpected type")
pod, ok := event.Object.(*v1.Pod)
if !ok {
return nil, fmt.Errorf("unexpected object type: %T", event.Object)
}
if event.Type == watch.Added || event.Type == watch.Modified {
if pod.Status.Phase == v1.PodRunning {
return pod, nil
}
return pod, nil
}
case <-ctx.Done():
return nil, fmt.Errorf("timeout waiting for pod to be created")
return nil, fmt.Errorf("timeout waiting for pod to be ready")
}
}
}
Expand Down

0 comments on commit 50fafe4

Please sign in to comment.