diff --git a/dashboards/nvidia-gpu/README.md b/dashboards/nvidia-gpu/README.md index 77cb60cdc2..64e750f842 100644 --- a/dashboards/nvidia-gpu/README.md +++ b/dashboards/nvidia-gpu/README.md @@ -16,3 +16,8 @@ |:------------------| |Filename: [nvidia-dcgm.json](nvidia-dcgm.json)| |Displays Advanced GPU metrics from NVIDIA Datacenter GPU Manager (DCGM). This requires a specific setup (e.g. installing DCGM, installing the Ops Agent, and configuring it to receive DCGM metrics).| + +|NVIDIA GPU Monitoring Advanced DCGM Metrics (GKE Only)| +|:------------------| +|Filename: [nvidia-dcgm-prometheus.json](nvidia-dcgm-prometheus.json)| +|Displays Advanced GPU metrics from NVIDIA Datacenter GPU Manager (DCGM). This requires a specific setup (e.g. installing DCGM and DCGM exporter).| diff --git a/dashboards/nvidia-gpu/metadata.yaml b/dashboards/nvidia-gpu/metadata.yaml index b547f7a0b6..442d7f611e 100644 --- a/dashboards/nvidia-gpu/metadata.yaml +++ b/dashboards/nvidia-gpu/metadata.yaml @@ -14,3 +14,12 @@ sample_dashboards: related_integrations: - id: dcgm platform: GCE + - + category: NVIDIA GPUs + id: nvidia-dcgm-prometheus + display_name: NVIDIA GPU Monitoring Advanced DCGM Metrics (GKE Only) + description: |- + Displays Advanced GPU metrics from NVIDIA Datacenter GPU Manager (DCGM). This requires a specific setup (e.g. installing DCGM and DCGM exporter). + related_integrations: + - id: dcgm + platform: GKE diff --git a/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.01.png b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.01.png new file mode 100644 index 0000000000..527dc8e0bd Binary files /dev/null and b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.01.png differ diff --git a/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.02.png b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.02.png new file mode 100644 index 0000000000..8bb99db939 Binary files /dev/null and b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.02.png differ diff --git a/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.03.png b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.03.png new file mode 100644 index 0000000000..acfcf04aac Binary files /dev/null and b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.03.png differ diff --git a/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.json b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.json new file mode 100644 index 0000000000..9df423b828 --- /dev/null +++ b/dashboards/nvidia-gpu/nvidia-dcgm-prometheus.json @@ -0,0 +1,593 @@ +{ + "displayName": "NVIDIA GPU Monitoring Advanced DCGM Metrics (GKE Only)", + "dashboardFilters": [], + "mosaicLayout": { + "columns": 48, + "tiles": [ + { + "width": 24, + "height": 14, + "widget": { + "title": "GKE - Built-In - Accelerator Duty Cycle", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(kubernetes_io:container_accelerator_duty_cycle{monitored_resource=\"k8s_container\"}[${__interval}])", + "unitOverride": "%" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - Bulit-In - Accelerator Memory Used", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(kubernetes_io:container_accelerator_memory_used{monitored_resource=\"k8s_container\"}[${__interval}])", + "unitOverride": "By" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 14, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - GPU Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_DEV_GPU_UTIL[${__interval}])", + "unitOverride": "%" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 14, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - GPU Memory Used", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_DEV_FB_USED[${__interval}])", + "unitOverride": "MBy" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 29, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - SM Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_SM_ACTIVE[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Ratio", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 29, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - Memory Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL[${__interval}])", + "unitOverride": "%" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 45, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - SM Occupancy", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_SM_OCCUPANCY[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Ratio", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 45, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - Tensor Engine Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Ratio", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 61, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - PCIe Tx Bandwidth", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_PCIE_TX_BYTES[${__interval}])", + "unitOverride": "By" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 61, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - PCIe Rx Bandwidth", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_PCIE_RX_BYTES[${__interval}])", + "unitOverride": "By" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 77, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - NvLink Tx Bandwidth", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_NVLINK_TX_BYTES[${__interval}])", + "unitOverride": "By" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 77, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - NvLink Rx Bandwidth", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_NVLINK_RX_BYTES[${__interval}])", + "unitOverride": "By" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "y1Axis", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 93, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - FP64 Engine Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_PIPE_FP64_ACTIVE[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Ratio", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 93, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - FP32 Engine Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_PIPE_FP32_ACTIVE[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Ratio", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 109, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - FP16 Engine Utilization", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_PROF_PIPE_FP16_ACTIVE[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Ratio", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 109, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - Current temperature in Celsius", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_DEV_GPU_TEMP[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Celsius", + "scale": "LINEAR" + } + } + } + }, + { + "yPos": 125, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - Memory temperature in Celsius", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_DEV_MEMORY_TEMP[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Celsius", + "scale": "LINEAR" + } + } + } + }, + { + "xPos": 24, + "yPos": 125, + "width": 24, + "height": 14, + "widget": { + "title": "GKE - DCGM - Power usage for the device in Watts", + "xyChart": { + "chartOptions": { + "mode": "COLOR" + }, + "dataSets": [ + { + "breakdowns": [], + "dimensions": [], + "measures": [], + "plotType": "LINE", + "targetAxis": "Y1", + "timeSeriesQuery": { + "prometheusQuery": "avg_over_time(DCGM_FI_DEV_POWER_USAGE[${__interval}])", + "unitOverride": "" + } + } + ], + "thresholds": [], + "timeshiftDuration": "0s", + "yAxis": { + "label": "Watts", + "scale": "LINEAR" + } + } + } + } + ] + }, + "labels": {} +} \ No newline at end of file