Skip to content

Commit

Permalink
Enrich with prometheus labels
Browse files Browse the repository at this point in the history
  • Loading branch information
gshaibi committed Nov 27, 2024
1 parent 45ef11e commit 50fafe4
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ containers:
value: "{{ .Release.Namespace }}"
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
- name: EXPORT_PROMETHEUS_LABEL_ENRICHMENTS
value: "{{ .Values.statusExporter.config.exportPrometheusLabelEnrichments }}"
ports:
- containerPort: 9400
name: http
Expand All @@ -53,6 +55,9 @@ tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
{{- if .Values.kwok.tolerations }}
{{ .Values.kwok.tolerations | toYaml | nindent 2 }}
{{- end }}
imagePullSecrets:
- name: gcr-secret
volumes:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{{- if .Values.statusExporter.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: nvidia-dcgm-exporter
labels:
release: {{ .Release.Name }}
spec:
selector:
matchLabels:
app: nvidia-dcgm-exporter
endpoints:
- port: gpu-metrics
interval: 30s
honorLabels: true
{{- end }}
12 changes: 12 additions & 0 deletions deploy/fake-gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ statusExporter:
cpu: "200m"
memory: "200Mi"
topologyMaxExportInterval: 10s
serviceMonitor:
enabled: false
config:
exportPrometheusLabelEnrichments: false

kwokGpuDevicePlugin:
image:
Expand Down Expand Up @@ -86,3 +90,11 @@ topology:
gpuMemory: 11441
nodePoolLabelKey: run.ai/simulated-gpu-node-pool
migStrategy: mixed

# TODO(gshaibi): verify each of these tolerations is honored on kwok fake nodes
kwok:
tolerations:
- key: kwok.x-k8s.io/node
operator: Equal
value: fake
effect: NoSchedule
76 changes: 76 additions & 0 deletions design/samples/gpu-operator/metrics/idle-gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
################ Idle GPU ################

###### From DCGM Exporter Directly ######

{
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 300
# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz).
# TYPE DCGM_FI_DEV_MEM_CLOCK gauge
DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 405
# HELP DCGM_FI_DEV_MEMORY_TEMP Memory temperature (in C).
# TYPE DCGM_FI_DEV_MEMORY_TEMP gauge
DCGM_FI_DEV_MEMORY_TEMP{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature (in C).
# TYPE DCGM_FI_DEV_GPU_TEMP gauge
DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 23
# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W).
# TYPE DCGM_FI_DEV_POWER_USAGE gauge
DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 14.573000
# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ).
# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 1232417
# HELP DCGM_FI_DEV_PCIE_REPLAY_COUNTER Total number of PCIe retries.
# TYPE DCGM_FI_DEV_PCIE_REPLAY_COUNTER counter
DCGM_FI_DEV_PCIE_REPLAY_COUNTER{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %).
# TYPE DCGM_FI_DEV_GPU_UTIL gauge
DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory utilization (in %).
# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge
DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_ENC_UTIL Encoder utilization (in %).
# TYPE DCGM_FI_DEV_ENC_UTIL gauge
DCGM_FI_DEV_ENC_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_DEC_UTIL Decoder utilization (in %).
# TYPE DCGM_FI_DEV_DEC_UTIL gauge
DCGM_FI_DEV_DEC_UTIL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered.
# TYPE DCGM_FI_DEV_XID_ERRORS gauge
DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14",err_code="0",err_msg="No Error"} 0
# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB).
# TYPE DCGM_FI_DEV_FB_FREE gauge
DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 14914
# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB).
# TYPE DCGM_FI_DEV_FB_USED gauge
DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 2
# HELP DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Total number of NVLink bandwidth counters for all lanes.
# TYPE DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL counter
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_DEV_VGPU_LICENSE_STATUS vGPU License status
# TYPE DCGM_FI_DEV_VGPU_LICENSE_STATUS gauge
DCGM_FI_DEV_VGPU_LICENSE_STATUS{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0
# HELP DCGM_FI_PROF_GR_ENGINE_ACTIVE Ratio of time the graphics engine is active.
# TYPE DCGM_FI_PROF_GR_ENGINE_ACTIVE gauge
DCGM_FI_PROF_GR_ENGINE_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000000
# HELP DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Ratio of cycles the tensor (HMMA) pipe is active.
# TYPE DCGM_FI_PROF_PIPE_TENSOR_ACTIVE gauge
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000000
# HELP DCGM_FI_PROF_DRAM_ACTIVE Ratio of cycles the device memory interface is active sending or receiving data.
# TYPE DCGM_FI_PROF_DRAM_ACTIVE gauge
DCGM_FI_PROF_DRAM_ACTIVE{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 0.000002
# HELP DCGM_FI_PROF_PCIE_TX_BYTES The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_TX_BYTES gauge
DCGM_FI_PROF_PCIE_TX_BYTES{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 340143
# HELP DCGM_FI_PROF_PCIE_RX_BYTES The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
# TYPE DCGM_FI_PROF_PCIE_RX_BYTES gauge
DCGM_FI_PROF_PCIE_RX_BYTES{gpu="0",UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e",pci_bus_id="00000000:00:1E.0",device="nvidia0",modelName="Tesla T4",Hostname="ip-172-20-10-123",DCGM_FI_DRIVER_VERSION="550.54.14"} 340655
}

###### From Prometheus ######

{
DCGM_FI_DEV_GPU_UTIL{DCGM_FI_DRIVER_VERSION="550.54.14", Hostname="ip-172-20-10-123", UUID="GPU-00e54cc1-4465-041c-1075-5f7b1358e51e", container="nvidia-dcgm-exporter", device="nvidia0", endpoint="gpu-metrics", gpu="0", instance="10.244.2.16:9400", job="nvidia-dcgm-exporter", modelName="Tesla T4", namespace="gpu-operator", pci_bus_id="00000000:00:1E.0", pod="nvidia-dcgm-exporter-zlj4t", service="nvidia-dcgm-exporter"}
}

14 changes: 8 additions & 6 deletions internal/common/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ const (

GpuResourceName = "nvidia.com/gpu"

EnvFakeNode = "FAKE_NODE"
EnvNodeName = "NODE_NAME"
EnvTopologyCmName = "TOPOLOGY_CM_NAME"
EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE"
EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE"
EnvImpersonateIP = "IMPERSONATE_IP"
EnvFakeNode = "FAKE_NODE"
EnvNodeName = "NODE_NAME"
EnvTopologyCmName = "TOPOLOGY_CM_NAME"
EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE"
EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE"
EnvImpersonatePodName = "IMPERSONATE_POD_NAME"
EnvImpersonatePodIP = "IMPERSONATE_IP"
EnvExportPrometheusLabelEnrichments = "EXPORT_PROMETHEUS_LABEL_ENRICHMENTS"
)
29 changes: 28 additions & 1 deletion internal/status-exporter/export/metrics/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ import (
"github.com/spf13/viper"
)

const (
	// exporterPort is the TCP port the fake exporter's /metrics HTTP
	// server listens on (see setupServer); matches the containerPort
	// exposed by the deployment template.
	exporterPort = 9400
	// exporterContainerName is the value written into the "container"
	// label when enriching metrics, impersonating the real DCGM
	// exporter container.
	exporterContainerName = "nvidia-dcgm-exporter"
)

// MetricsExporter receives node topology snapshots over topologyChan and
// exports per-GPU Prometheus metrics (utilization, framebuffer usage) that
// mimic the real DCGM exporter's output.
type MetricsExporter struct {
	// topologyChan delivers updated NodeTopology objects; each received
	// snapshot triggers a re-export of the GPU metric series.
	topologyChan <-chan *topology.NodeTopology
}
Expand Down Expand Up @@ -63,6 +68,7 @@ func (e *MetricsExporter) Run(stopCh <-chan struct{}) {

func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error {
nodeName := viper.GetString(constants.EnvNodeName)
shouldExportPrometheusLabelEnrichments := viper.GetBool(constants.EnvExportPrometheusLabelEnrichments)

gpuUtilization.Reset()
gpuFbUsed.Reset()
Expand All @@ -81,6 +87,11 @@ func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error {
"container": gpu.Status.AllocatedBy.Container,
}

if shouldExportPrometheusLabelEnrichments {
// TODO(gshaibi): add test coverage for the label-enrichment path
labels = e.enrichWithPrometheusLabels(labels)
}

utilization := gpu.Status.PodGpuUsageStatus.Utilization()
fbUsed := gpu.Status.PodGpuUsageStatus.FbUsed(nodeTopology.GpuMemory)

Expand All @@ -94,7 +105,7 @@ func (e *MetricsExporter) export(nodeTopology *topology.NodeTopology) error {

func setupServer() {
http.Handle("/metrics", promhttp.Handler())
err := http.ListenAndServe(":9400", nil)
err := http.ListenAndServe(fmt.Sprintf(":%d", exporterPort), nil)
if err != nil {
log.Fatal(err)
}
Expand All @@ -107,3 +118,19 @@ func generateFakeHostname(nodeName string) string {
nodeHostname := fmt.Sprintf("%s-%x", "nvidia-dcgm-exporter", nodeNameSHA1[:3])
return nodeHostname
}

// enrichWithPrometheusLabels rewrites the scrape-owned labels (container,
// namespace, pod, instance) so the series looks as if Prometheus itself had
// scraped the impersonated DCGM exporter pod. Any value already present for
// one of those labels is preserved under an "exported_" prefix — mirroring
// what Prometheus does on a label collision. The map is mutated in place and
// also returned for convenience.
func (e *MetricsExporter) enrichWithPrometheusLabels(labels prometheus.Labels) prometheus.Labels {
	scrapeOwned := []string{"container", "namespace", "pod", "instance"}

	// Preserve caller-provided values before overwriting them below.
	for _, name := range scrapeOwned {
		if existing, present := labels[name]; present {
			labels["exported_"+name] = existing
		}
	}

	// Overwrite with the identity of the impersonated exporter pod, taken
	// from the environment (set by the status-updater on deployment).
	labels["container"] = exporterContainerName
	labels["namespace"] = viper.GetString(constants.EnvFakeGpuOperatorNs)
	labels["pod"] = viper.GetString(constants.EnvImpersonatePodName)
	labels["instance"] = fmt.Sprintf("%s:%d", viper.GetString(constants.EnvImpersonatePodIP), exporterPort)

	return labels
}
6 changes: 3 additions & 3 deletions internal/status-exporter/export/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ var (
gpuUtilization = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "DCGM_FI_DEV_GPU_UTIL",
Help: "GPU Utilization",
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"})
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"})
gpuFbUsed = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "DCGM_FI_DEV_FB_USED",
Help: "GPU Framebuffer Used",
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"})
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"})
gpuFbFree = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "DCGM_FI_DEV_FB_FREE",
Help: "GPU Framebuffer Free",
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod"})
}, []string{"gpu", "UUID", "device", "modelName", "Hostname", "container", "namespace", "pod", "instance"})
)
24 changes: 15 additions & 9 deletions internal/status-updater/handlers/node/fake_node_deployments.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,11 @@ func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.De
Name: constants.EnvFakeNode,
Value: "true",
}, v1.EnvVar{
Name: constants.EnvImpersonateIP,
Name: constants.EnvImpersonatePodIP,
Value: dummyDcgmExporterPod.Status.PodIP,
}, v1.EnvVar{
Name: constants.EnvImpersonatePodName,
Value: dummyDcgmExporterPod.Name,
})

deployment.Spec.Template.Spec.Containers[0].Resources.Limits = v1.ResourceList{
Expand All @@ -135,6 +138,7 @@ func (p *NodeHandler) generateFakeNodeDeploymentFromTemplate(template *appsv1.De
return deployment, nil
}

// Waits for the dummy dcgm exporter pod to be ready and returns it
func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error) {
clientset := p.kubeClient // Assuming p.kubeClient is of type kubernetes.Interface

Expand All @@ -143,7 +147,7 @@ func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error)
fieldSelector := fields.OneTermEqualSelector("spec.nodeName", nodeName).String()

// Create a context with timeout
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()

// Create a watch for pods with the specified label and field selectors
Expand All @@ -156,19 +160,21 @@ func (p *NodeHandler) getDummyDcgmExporterPod(nodeName string) (*v1.Pod, error)
}
defer watcher.Stop()

// Wait for the pod to be created
// Wait for the pod to be ready
for {
select {
case event := <-watcher.ResultChan():
if event.Type == watch.Added {
pod, ok := event.Object.(*v1.Pod)
if !ok {
return nil, fmt.Errorf("unexpected type")
pod, ok := event.Object.(*v1.Pod)
if !ok {
return nil, fmt.Errorf("unexpected object type: %T", event.Object)
}
if event.Type == watch.Added || event.Type == watch.Modified {
if pod.Status.Phase == v1.PodRunning {
return pod, nil
}
return pod, nil
}
case <-ctx.Done():
return nil, fmt.Errorf("timeout waiting for pod to be created")
return nil, fmt.Errorf("timeout waiting for pod to be ready")
}
}
}
Expand Down

0 comments on commit 50fafe4

Please sign in to comment.