Skip to content

Commit 00df2fd

Browse files
committed
fix integ test
1 parent 12ad308 commit 00df2fd

File tree

6 files changed, with 50 additions (+50) and 55 deletions (-55).

6 files changed, with 50 additions (+50) and 55 deletions (-55).

generator/test_case_generator.go

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -305,10 +305,6 @@ var testTypeToTestConfig = map[string][]testConfig{
305305
testDir: "../../../../test/gpu",
306306
terraformDir: "terraform/eks/addon/gpu",
307307
},
308-
{
309-
testDir: "../../../../test/gpu_high_frequency_metrics",
310-
terraformDir: "terraform/eks/addon/gpu",
311-
},
312308
},
313309
"eks_daemon": {
314310
{

terraform/eks/daemon/gpu/main.tf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -306,7 +306,7 @@ resource "kubernetes_daemonset" "exporter" {
306306
"-c",
307307
]
308308
args = [
309-
"/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
309+
"/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_PROF_PIPE_TENSOR_ACTIVE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
310310
]
311311
volume_mount {
312312
mount_path = "/etc/amazon-cloudwatch-observability-dcgm-cert"

test/gpu/common/gpu_validation.go

Lines changed: 44 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -23,83 +23,77 @@ import (
2323
const (
2424
GPUMetricIndicator = "_gpu_"
2525

26-
ContainerMemTotal = "container_gpu_memory_total"
27-
ContainerMemUsed = "container_gpu_memory_used"
28-
ContainerPower = "container_gpu_power_draw"
29-
ContainerTemp = "container_gpu_temperature"
30-
ContainerUtil = "container_gpu_utilization"
31-
ContainerMemUtil = "container_gpu_memory_utilization"
32-
PodMemTotal = "pod_gpu_memory_total"
33-
PodMemUsed = "pod_gpu_memory_used"
34-
PodPower = "pod_gpu_power_draw"
35-
PodTemp = "pod_gpu_temperature"
36-
PodUtil = "pod_gpu_utilization"
37-
PodMemUtil = "pod_gpu_memory_utilization"
38-
PodLimit = "pod_gpu_limit"
39-
PodRequest = "pod_gpu_request"
40-
PodCountTotal = "pod_gpu_usage_total"
41-
PodReserved = "pod_gpu_reserved_capacity"
42-
NodeMemTotal = "node_gpu_memory_total"
43-
NodeMemUsed = "node_gpu_memory_used"
44-
NodePower = "node_gpu_power_draw"
45-
NodeTemp = "node_gpu_temperature"
46-
NodeUtil = "node_gpu_utilization"
47-
NodeMemUtil = "node_gpu_memory_utilization"
48-
NodeCountTotal = "node_gpu_usage_total"
49-
NodeCountLimit = "node_gpu_limit"
50-
NodeReserved = "node_gpu_reserved_capacity"
26+
ContainerMemTotal = "container_gpu_memory_total"
27+
ContainerMemUsed = "container_gpu_memory_used"
28+
ContainerPower = "container_gpu_power_draw"
29+
ContainerTemp = "container_gpu_temperature"
30+
ContainerUtil = "container_gpu_utilization"
31+
ContainerMemUtil = "container_gpu_memory_utilization"
32+
ContainerTensorUtil = "container_gpu_tensor_core_utilization"
33+
PodMemTotal = "pod_gpu_memory_total"
34+
PodMemUsed = "pod_gpu_memory_used"
35+
PodPower = "pod_gpu_power_draw"
36+
PodTemp = "pod_gpu_temperature"
37+
PodUtil = "pod_gpu_utilization"
38+
PodMemUtil = "pod_gpu_memory_utilization"
39+
PodTensorUtil = "pod_gpu_tensor_core_utilization"
40+
PodLimit = "pod_gpu_limit"
41+
PodRequest = "pod_gpu_request"
42+
PodCountTotal = "pod_gpu_usage_total"
43+
PodReserved = "pod_gpu_reserved_capacity"
44+
NodeMemTotal = "node_gpu_memory_total"
45+
NodeMemUsed = "node_gpu_memory_used"
46+
NodePower = "node_gpu_power_draw"
47+
NodeTemp = "node_gpu_temperature"
48+
NodeUtil = "node_gpu_utilization"
49+
NodeMemUtil = "node_gpu_memory_utilization"
50+
NodeTensorUtil = "node_gpu_tensor_core_utilization"
51+
NodeCountTotal = "node_gpu_usage_total"
52+
NodeCountLimit = "node_gpu_limit"
53+
NodeReserved = "node_gpu_reserved_capacity"
54+
NodeUnreserved = "node_gpu_unreserved_capacity"
55+
NodeAvailable = "node_gpu_available_capacity"
5156
)
5257

5358
var UseE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")
5459

5560
// ExpectedDimsToMetricsIntegTest defines the expected dimensions and metrics for GPU validation
5661
var ExpectedDimsToMetricsIntegTest = map[string][]string{
5762
"ClusterName": {
58-
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
59-
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
60-
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil,
61-
"container_gpu_tensor_core_utilization", "pod_gpu_tensor_core_utilization", "node_gpu_tensor_core_utilization",
62-
"node_gpu_unreserved_capacity", "node_gpu_available_capacity",
63+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
64+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
65+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
6366
},
6467
"ClusterName-Namespace": {
65-
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
66-
"pod_gpu_tensor_core_utilization",
68+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
6769
},
6870
//"ClusterName-Namespace-Service": {
6971
// PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
7072
//},
7173
"ClusterName-Namespace-PodName": {
72-
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
73-
"pod_gpu_tensor_core_utilization",
74+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
7475
},
7576
"ClusterName-ContainerName-Namespace-PodName": {
76-
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
77-
"container_gpu_tensor_core_utilization",
77+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
7878
},
7979
"ClusterName-ContainerName-FullPodName-Namespace-PodName": {
80-
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
81-
"container_gpu_tensor_core_utilization",
80+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
8281
},
8382
"ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
84-
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil,
85-
"container_gpu_tensor_core_utilization",
83+
ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
8684
},
8785
"ClusterName-FullPodName-Namespace-PodName": {
88-
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
89-
"pod_gpu_tensor_core_utilization",
86+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
9087
},
9188
"ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
92-
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
93-
"pod_gpu_tensor_core_utilization",
89+
PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
9490
},
9591
"ClusterName-InstanceId-NodeName": {
96-
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil,
97-
"node_gpu_tensor_core_utilization", "node_gpu_unreserved_capacity", "node_gpu_available_capacity",
92+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
9893
//NodeCountTotal, NodeCountRequest, NodeCountLimit,
9994
},
10095
"ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
101-
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil,
102-
"node_gpu_tensor_core_utilization",
96+
NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
10397
},
10498
}
10599

@@ -117,7 +111,7 @@ func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult {
117111
if *UseE2EMetrics {
118112
expectedDimsToMetrics["ClusterName"] = append(
119113
expectedDimsToMetrics["ClusterName"],
120-
PodReserved, PodRequest, PodCountTotal, PodLimit, NodeCountTotal, NodeCountLimit, NodeReserved,
114+
PodReserved, PodRequest, PodCountTotal, PodLimit, NodeCountTotal, NodeCountLimit, NodeReserved, NodeUnreserved, NodeAvailable,
121115
)
122116
expectedDimsToMetrics["ClusterName-Namespace-PodName"] = append(
123117
expectedDimsToMetrics["ClusterName-Namespace-PodName"],
@@ -129,7 +123,7 @@ func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult {
129123
)
130124
expectedDimsToMetrics["ClusterName-InstanceId-NodeName"] = append(
131125
expectedDimsToMetrics["ClusterName-InstanceId-NodeName"],
132-
NodeCountLimit, NodeCountTotal, NodeReserved,
126+
NodeCountLimit, NodeCountTotal, NodeReserved, NodeUnreserved, NodeAvailable,
133127
)
134128
}
135129

test/metric_value_benchmark/eks_resources/test_schemas/container_gpu.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
"container_gpu_temperature": {},
2727
"container_gpu_utilization": {},
2828
"container_gpu_memory_utilization": {},
29+
"container_gpu_tensor_core_utilization": {},
2930
"Service":{}
3031
},
3132
"required": [

test/metric_value_benchmark/eks_resources/test_schemas/node_gpu.json

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,9 +26,12 @@
2626
"node_gpu_temperature": {},
2727
"node_gpu_utilization": {},
2828
"node_gpu_memory_utilization": {},
29+
"node_gpu_tensor_core_utilization": {},
2930
"node_gpu_total": {},
3031
"node_gpu_request": {},
3132
"node_gpu_list": {},
33+
"node_gpu_unreserved_capacity": {},
34+
"node_gpu_available_capacity": {},
3235
"Service":{}
3336
},
3437
"required": [

test/metric_value_benchmark/eks_resources/test_schemas/pod_gpu.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
"pod_gpu_temperature": {},
2727
"pod_gpu_utilization": {},
2828
"pod_gpu_memory_utilization": {},
29+
"pod_gpu_tensor_core_utilization": {},
2930
"pod_gpu_total": {},
3031
"pod_gpu_request": {},
3132
"pod_gpu_list": {},

0 commit comments

Comments
 (0)