@@ -23,83 +23,77 @@ import (
2323const (
2424 GPUMetricIndicator = "_gpu_"
2525
26- ContainerMemTotal = "container_gpu_memory_total"
27- ContainerMemUsed = "container_gpu_memory_used"
28- ContainerPower = "container_gpu_power_draw"
29- ContainerTemp = "container_gpu_temperature"
30- ContainerUtil = "container_gpu_utilization"
31- ContainerMemUtil = "container_gpu_memory_utilization"
32- PodMemTotal = "pod_gpu_memory_total"
33- PodMemUsed = "pod_gpu_memory_used"
34- PodPower = "pod_gpu_power_draw"
35- PodTemp = "pod_gpu_temperature"
36- PodUtil = "pod_gpu_utilization"
37- PodMemUtil = "pod_gpu_memory_utilization"
38- PodLimit = "pod_gpu_limit"
39- PodRequest = "pod_gpu_request"
40- PodCountTotal = "pod_gpu_usage_total"
41- PodReserved = "pod_gpu_reserved_capacity"
42- NodeMemTotal = "node_gpu_memory_total"
43- NodeMemUsed = "node_gpu_memory_used"
44- NodePower = "node_gpu_power_draw"
45- NodeTemp = "node_gpu_temperature"
46- NodeUtil = "node_gpu_utilization"
47- NodeMemUtil = "node_gpu_memory_utilization"
48- NodeCountTotal = "node_gpu_usage_total"
49- NodeCountLimit = "node_gpu_limit"
50- NodeReserved = "node_gpu_reserved_capacity"
26+ ContainerMemTotal = "container_gpu_memory_total"
27+ ContainerMemUsed = "container_gpu_memory_used"
28+ ContainerPower = "container_gpu_power_draw"
29+ ContainerTemp = "container_gpu_temperature"
30+ ContainerUtil = "container_gpu_utilization"
31+ ContainerMemUtil = "container_gpu_memory_utilization"
32+ ContainerTensorUtil = "container_gpu_tensor_core_utilization"
33+ PodMemTotal = "pod_gpu_memory_total"
34+ PodMemUsed = "pod_gpu_memory_used"
35+ PodPower = "pod_gpu_power_draw"
36+ PodTemp = "pod_gpu_temperature"
37+ PodUtil = "pod_gpu_utilization"
38+ PodMemUtil = "pod_gpu_memory_utilization"
39+ PodTensorUtil = "pod_gpu_tensor_core_utilization"
40+ PodLimit = "pod_gpu_limit"
41+ PodRequest = "pod_gpu_request"
42+ PodCountTotal = "pod_gpu_usage_total"
43+ PodReserved = "pod_gpu_reserved_capacity"
44+ NodeMemTotal = "node_gpu_memory_total"
45+ NodeMemUsed = "node_gpu_memory_used"
46+ NodePower = "node_gpu_power_draw"
47+ NodeTemp = "node_gpu_temperature"
48+ NodeUtil = "node_gpu_utilization"
49+ NodeMemUtil = "node_gpu_memory_utilization"
50+ NodeTensorUtil = "node_gpu_tensor_core_utilization"
51+ NodeCountTotal = "node_gpu_usage_total"
52+ NodeCountLimit = "node_gpu_limit"
53+ NodeReserved = "node_gpu_reserved_capacity"
54+ NodeUnreserved = "node_gpu_unreserved_capacity"
55+ NodeAvailable = "node_gpu_available_capacity"
5156)
5257
5358var UseE2EMetrics = flag .Bool ("useE2EMetrics" , false , "Use E2E metrics mapping which uses latest build CWA" )
5459
5560// ExpectedDimsToMetricsIntegTest defines the expected dimensions and metrics for GPU validation
5661var ExpectedDimsToMetricsIntegTest = map [string ][]string {
5762 "ClusterName" : {
58- ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil ,
59- PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil ,
60- NodeMemTotal , NodeMemUsed , NodePower , NodeTemp , NodeUtil , NodeMemUtil ,
61- "container_gpu_tensor_core_utilization" , "pod_gpu_tensor_core_utilization" , "node_gpu_tensor_core_utilization" ,
62- "node_gpu_unreserved_capacity" , "node_gpu_available_capacity" ,
63+ ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil , ContainerTensorUtil ,
64+ PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil , PodTensorUtil ,
65+ NodeMemTotal , NodeMemUsed , NodePower , NodeTemp , NodeUtil , NodeMemUtil , NodeTensorUtil ,
6366 },
6467 "ClusterName-Namespace" : {
65- PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil ,
66- "pod_gpu_tensor_core_utilization" ,
68+ PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil , PodTensorUtil ,
6769 },
6870 //"ClusterName-Namespace-Service": {
6971 // PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
7072 //},
7173 "ClusterName-Namespace-PodName" : {
72- PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil ,
73- "pod_gpu_tensor_core_utilization" ,
74+ PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil , PodTensorUtil ,
7475 },
7576 "ClusterName-ContainerName-Namespace-PodName" : {
76- ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil ,
77- "container_gpu_tensor_core_utilization" ,
77+ ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil , ContainerTensorUtil ,
7878 },
7979 "ClusterName-ContainerName-FullPodName-Namespace-PodName" : {
80- ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil ,
81- "container_gpu_tensor_core_utilization" ,
80+ ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil , ContainerTensorUtil ,
8281 },
8382 "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName" : {
84- ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil ,
85- "container_gpu_tensor_core_utilization" ,
83+ ContainerMemTotal , ContainerMemUsed , ContainerPower , ContainerTemp , ContainerUtil , ContainerMemUtil , ContainerTensorUtil ,
8684 },
8785 "ClusterName-FullPodName-Namespace-PodName" : {
88- PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil ,
89- "pod_gpu_tensor_core_utilization" ,
86+ PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil , PodTensorUtil ,
9087 },
9188 "ClusterName-FullPodName-GpuDevice-Namespace-PodName" : {
92- PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil ,
93- "pod_gpu_tensor_core_utilization" ,
89+ PodMemTotal , PodMemUsed , PodPower , PodTemp , PodUtil , PodMemUtil , PodTensorUtil ,
9490 },
9591 "ClusterName-InstanceId-NodeName" : {
96- NodeMemTotal , NodeMemUsed , NodePower , NodeTemp , NodeUtil , NodeMemUtil ,
97- "node_gpu_tensor_core_utilization" , "node_gpu_unreserved_capacity" , "node_gpu_available_capacity" ,
92+ NodeMemTotal , NodeMemUsed , NodePower , NodeTemp , NodeUtil , NodeMemUtil , NodeTensorUtil ,
9893 //NodeCountTotal, NodeCountRequest, NodeCountLimit,
9994 },
10095 "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName" : {
101- NodeMemTotal , NodeMemUsed , NodePower , NodeTemp , NodeUtil , NodeMemUtil ,
102- "node_gpu_tensor_core_utilization" ,
96+ NodeMemTotal , NodeMemUsed , NodePower , NodeTemp , NodeUtil , NodeMemUtil , NodeTensorUtil ,
10397 },
10498}
10599
@@ -117,7 +111,7 @@ func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult {
117111 if * UseE2EMetrics {
118112 expectedDimsToMetrics ["ClusterName" ] = append (
119113 expectedDimsToMetrics ["ClusterName" ],
120- PodReserved , PodRequest , PodCountTotal , PodLimit , NodeCountTotal , NodeCountLimit , NodeReserved ,
114+ PodReserved , PodRequest , PodCountTotal , PodLimit , NodeCountTotal , NodeCountLimit , NodeReserved , NodeUnreserved , NodeAvailable ,
121115 )
122116 expectedDimsToMetrics ["ClusterName-Namespace-PodName" ] = append (
123117 expectedDimsToMetrics ["ClusterName-Namespace-PodName" ],
@@ -129,7 +123,7 @@ func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult {
129123 )
130124 expectedDimsToMetrics ["ClusterName-InstanceId-NodeName" ] = append (
131125 expectedDimsToMetrics ["ClusterName-InstanceId-NodeName" ],
132- NodeCountLimit , NodeCountTotal , NodeReserved ,
126+ NodeCountLimit , NodeCountTotal , NodeReserved , NodeUnreserved , NodeAvailable ,
133127 )
134128 }
135129
0 commit comments