Skip to content

Commit

Permalink
NIM Dashboard additions (#19154)
Browse files (browse the repository at this point in the history)
* NIM Dashboard additions

* remove in unit dashboard additions
  • Loading branch information
Kyle-Neale authored Nov 28, 2024
1 parent b48116b commit 3e1f734
Showing 1 changed file with 75 additions and 26 deletions.
101 changes: 75 additions & 26 deletions nvidia_nim/assets/dashboards/nvidia_nim_overview.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -205,7 +205,7 @@
"hide_zero_counts": true,
"show_status": true,
"last_triggered_format": "relative",
"query": "tag:(integration:vllm)",
"query": "tag:(integration:nvidia_nim)",
"sort": "status,asc",
"count": 50,
"start": 0,
Expand Down Expand Up @@ -265,20 +265,34 @@
"title": "Requests Waiting",
"title_size": "16",
"title_align": "left",
"show_legend": false,
"legend_layout": "auto",
"legend_columns": [
"avg",
"min",
"max",
"value",
"sum"
],
"time": {},
"type": "timeseries",
"requests": [
{
"formulas": [
{
"number_format": {
"unit": {
"type": "canonical_unit"
}
},
"formula": "query1"
}
],
"queries": [
{
"name": "query1",
"data_source": "metrics",
"query": "avg:nvidia_nim.num_requests.waiting{$model_name} by {model_name}"
"query": "avg:nvidia_nim.num_requests.waiting{$model_name, $host} by {model_name}"
}
],
"response_format": "timeseries",
Expand All @@ -305,7 +319,6 @@
"title": "Requests Waiting",
"title_size": "16",
"title_align": "left",
"time": {},
"type": "query_value",
"requests": [
{
Expand Down Expand Up @@ -370,20 +383,34 @@
"title": "Requests Failed",
"title_size": "16",
"title_align": "left",
"show_legend": false,
"legend_layout": "auto",
"legend_columns": [
"avg",
"min",
"max",
"value",
"sum"
],
"time": {},
"type": "timeseries",
"requests": [
{
"formulas": [
{
"number_format": {
"unit": {
"type": "canonical_unit"
}
},
"formula": "query1"
}
],
"queries": [
{
"data_source": "metrics",
"name": "query1",
"query": "sum:nvidia_nim.request.failure.count{$model_name} by {model_name}.as_count()"
"query": "sum:nvidia_nim.request.failure.count{$model_name, $host} by {model_name}.as_count()"
}
],
"response_format": "timeseries",
Expand Down Expand Up @@ -488,7 +515,7 @@
{
"data_source": "metrics",
"name": "query1",
"query": "sum:nvidia_nim.request.success.count{$model_name} by {model_name}.as_rate()"
"query": "sum:nvidia_nim.request.success.count{$model_name, $host} by {model_name}.as_rate()"
}
],
"response_format": "timeseries",
Expand All @@ -514,7 +541,6 @@
"title": "Requests Running",
"title_size": "16",
"title_align": "left",
"time": {},
"type": "query_value",
"requests": [
{
Expand Down Expand Up @@ -550,9 +576,18 @@
{
"id": 2448557456884510,
"definition": {
"title": "K/V Cache Utilization",
"title": "GPU Cache Utilization",
"title_size": "16",
"title_align": "left",
"show_legend": false,
"legend_layout": "auto",
"legend_columns": [
"avg",
"min",
"max",
"value",
"sum"
],
"time": {},
"type": "timeseries",
"requests": [
Expand All @@ -566,7 +601,7 @@
{
"data_source": "metrics",
"name": "query1",
"query": "avg:nvidia_nim.gpu_cache_usage_percent{$model_name}"
"query": "avg:nvidia_nim.gpu_cache_usage_percent{$model_name, $host} by {model_name}"
}
],
"response_format": "timeseries",
Expand Down Expand Up @@ -705,25 +740,29 @@
"value",
"sum"
],
"time": {},
"type": "timeseries",
"requests": [
{
"formulas": [
{
"number_format": {
"unit": {
"type": "canonical_unit"
}
},
"formula": "query1 / query2"
}
],
"queries": [
{
"data_source": "metrics",
"name": "query1",
"query": "sum:nvidia_nim.time_to_first_token.seconds.sum{$model_name} by {model_name}.as_count()"
"query": "sum:nvidia_nim.time_to_first_token.seconds.sum{$model_name, $host} by {model_name}.as_count()"
},
{
"data_source": "metrics",
"name": "query2",
"query": "sum:nvidia_nim.time_to_first_token.seconds.count{$model_name} by {model_name}.as_count()"
"query": "sum:nvidia_nim.time_to_first_token.seconds.count{$model_name, $host} by {model_name}.as_count()"
}
],
"response_format": "timeseries",
Expand Down Expand Up @@ -759,7 +798,6 @@
"value",
"sum"
],
"time": {},
"type": "timeseries",
"requests": [
{
Expand All @@ -775,12 +813,12 @@
],
"queries": [
{
"query": "avg:nvidia_nim.request.prompt_tokens.sum{$model_name} by {model_name}.as_count()",
"query": "avg:nvidia_nim.request.prompt_tokens.sum{$model_name, $host} by {model_name}.as_count()",
"data_source": "metrics",
"name": "query2"
},
{
"query": "avg:nvidia_nim.request.generation_tokens.sum{$model_name} by {model_name}.as_count()",
"query": "avg:nvidia_nim.request.generation_tokens.sum{$model_name, $host} by {model_name}.as_count()",
"data_source": "metrics",
"name": "query1"
}
Expand Down Expand Up @@ -818,25 +856,29 @@
"value",
"sum"
],
"time": {},
"type": "timeseries",
"requests": [
{
"formulas": [
{
"number_format": {
"unit": {
"type": "canonical_unit"
}
},
"formula": "query1 / query2"
}
],
"queries": [
{
"data_source": "metrics",
"name": "query1",
"query": "sum:nvidia_nim.time_per_output_token.seconds.sum{$model_name} by {model_name}.as_count()"
"query": "sum:nvidia_nim.time_per_output_token.seconds.sum{$model_name, $host} by {model_name}.as_count()"
},
{
"data_source": "metrics",
"name": "query2",
"query": "sum:nvidia_nim.time_per_output_token.seconds.count{$model_name} by {model_name}.as_count()"
"query": "sum:nvidia_nim.time_per_output_token.seconds.count{$model_name, $host} by {model_name}.as_count()"
}
],
"response_format": "timeseries",
Expand All @@ -861,7 +903,7 @@
},
"layout": {
"x": 0,
"y": 20,
"y": 0,
"width": 12,
"height": 10,
"is_column_break": true
Expand Down Expand Up @@ -924,7 +966,7 @@
{
"data_source": "metrics",
"name": "query1",
"query": "avg:nvidia_nim.process.resident_memory_bytes{$model_name}"
"query": "avg:nvidia_nim.process.resident_memory_bytes{$host} by {endpoint}"
}
],
"response_format": "timeseries",
Expand Down Expand Up @@ -971,7 +1013,7 @@
{
"data_source": "metrics",
"name": "query1",
"query": "sum:nvidia_nim.python.gc.collections.count{$model_name} by {generation}.as_count()"
"query": "sum:nvidia_nim.python.gc.collections.count{$host} by {generation,endpoint}.as_count()"
}
],
"response_format": "timeseries",
Expand All @@ -998,7 +1040,15 @@
"title": "Uncollectable Objects",
"title_size": "16",
"title_align": "left",
"time": {},
"show_legend": false,
"legend_layout": "auto",
"legend_columns": [
"avg",
"min",
"max",
"value",
"sum"
],
"type": "timeseries",
"requests": [
{
Expand All @@ -1011,7 +1061,7 @@
{
"data_source": "metrics",
"name": "query1",
"query": "sum:nvidia_nim.python.gc.objects.uncollectable.count{$host}.as_count()"
"query": "sum:nvidia_nim.python.gc.objects.uncollectable.count{$host} by {endpoint,generation}.as_count()"
}
],
"response_format": "timeseries",
Expand Down Expand Up @@ -1047,7 +1097,6 @@
"value",
"sum"
],
"time": {},
"type": "timeseries",
"requests": [
{
Expand All @@ -1060,7 +1109,7 @@
{
"data_source": "metrics",
"name": "query1",
"query": "avg:nvidia_nim.process.virtual_memory_bytes{$host} by {host}"
"query": "avg:nvidia_nim.process.virtual_memory_bytes{$host} by {endpoint}"
}
],
"response_format": "timeseries",
Expand Down Expand Up @@ -1107,7 +1156,7 @@
{
"data_source": "metrics",
"name": "query1",
"query": "sum:nvidia_nim.python.gc.objects.collected.count{$model_name} by {generation}.as_count()"
"query": "sum:nvidia_nim.python.gc.objects.collected.count{$model_name, $host} by {generation}.as_count()"
}
],
"response_format": "timeseries",
Expand All @@ -1131,7 +1180,7 @@
},
"layout": {
"x": 0,
"y": 30,
"y": 10,
"width": 12,
"height": 8
}
Expand Down

0 comments on commit 3e1f734

Please sign in to comment.