4 changes: 4 additions & 0 deletions generator/test_case_generator.go
@@ -352,6 +352,10 @@ var testTypeToTestConfig = map[string][]testConfig{
testDir: "./test/gpu", terraformDir: "terraform/eks/daemon/gpu",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
{
testDir: "./test/gpu_high_frequency_metrics", terraformDir: "terraform/eks/daemon/gpu",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
},
{
testDir: "./test/awsneuron", terraformDir: "terraform/eks/daemon/awsneuron",
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
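The new entry reuses the existing GPU terraform stack, so only the test package differs. From the map literals above, the generator's testConfig struct is presumably shaped like the sketch below; the real definition lives elsewhere in test_case_generator.go and is not shown in this diff.

// Sketch inferred from the literals above; the field names match what the
// diff populates, but this is not the actual definition from the repo.
type testConfig struct {
    testDir      string                         // Go test package to run, e.g. "./test/gpu_high_frequency_metrics"
    terraformDir string                         // infrastructure to provision; both GPU suites share "terraform/eks/daemon/gpu"
    targets      map[string]map[string]struct{} // e.g. "arc" -> {"amd64"} restricts the generated matrix to amd64
}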
2 changes: 1 addition & 1 deletion terraform/eks/daemon/gpu/main.tf
@@ -306,7 +306,7 @@ resource "kubernetes_daemonset" "exporter" {
"-c",
]
args = [
"/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
"/bin/echo 'DCGM_FI_DEV_GPU_UTIL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_FREE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_TOTAL{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_FB_USED_PERCENT{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_GPU_TEMP{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_DEV_POWER_USAGE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1\nDCGM_FI_PROF_PIPE_TENSOR_ACTIVE{PodName=\"pod1\",gpu=\"0\",UUID=\"uuid0\",device=\"nvidia0\",modelName=\"Tesla T4\",Hostname=\"hostname1\",container=\"main\",namespace=\"amazon-cloudwatch\",pod=\"pod1-hash\"} 1' >> /usr/local/apache2/htdocs/metrics && sed -i -e \"s/hostname1/$HOST_NAME/g\" /usr/local/apache2/htdocs/metrics && httpd-foreground -k restart"
]
volume_mount {
mount_path = "/etc/amazon-cloudwatch-observability-dcgm-cert"
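Decoded, the only change to the fake DCGM exporter payload is one additional series, DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, which backs the new tensor-core-utilization metrics. The echo writes one Prometheus exposition line per counter into /usr/local/apache2/htdocs/metrics, all with the identical label set, before sed swaps hostname1 for the node's $HOST_NAME. First and last lines, for example:

DCGM_FI_DEV_GPU_UTIL{PodName="pod1",gpu="0",UUID="uuid0",device="nvidia0",modelName="Tesla T4",Hostname="hostname1",container="main",namespace="amazon-cloudwatch",pod="pod1-hash"} 1
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{PodName="pod1",gpu="0",UUID="uuid0",device="nvidia0",modelName="Tesla T4",Hostname="hostname1",container="main",namespace="amazon-cloudwatch",pod="pod1-hash"} 1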
231 changes: 231 additions & 0 deletions test/gpu/common/gpu_validation.go
@@ -0,0 +1,231 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

//go:build !windows

package common

import (
    "encoding/json"
    "flag"
    "fmt"
    "log"
    "strings"
    "time"

    "github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs/types"

    "github.com/aws/amazon-cloudwatch-agent-test/environment"
    "github.com/aws/amazon-cloudwatch-agent-test/test/metric"
    "github.com/aws/amazon-cloudwatch-agent-test/test/status"
    "github.com/aws/amazon-cloudwatch-agent-test/util/awsservice"
)

const (
    GPUMetricIndicator = "_gpu_"

    ContainerMemTotal   = "container_gpu_memory_total"
    ContainerMemUsed    = "container_gpu_memory_used"
    ContainerPower      = "container_gpu_power_draw"
    ContainerTemp       = "container_gpu_temperature"
    ContainerUtil       = "container_gpu_utilization"
    ContainerMemUtil    = "container_gpu_memory_utilization"
    ContainerTensorUtil = "container_gpu_tensor_core_utilization"
    PodMemTotal         = "pod_gpu_memory_total"
    PodMemUsed          = "pod_gpu_memory_used"
    PodPower            = "pod_gpu_power_draw"
    PodTemp             = "pod_gpu_temperature"
    PodUtil             = "pod_gpu_utilization"
    PodMemUtil          = "pod_gpu_memory_utilization"
    PodTensorUtil       = "pod_gpu_tensor_core_utilization"
    PodLimit            = "pod_gpu_limit"
    PodRequest          = "pod_gpu_request"
    PodCountTotal       = "pod_gpu_usage_total"
    PodReserved         = "pod_gpu_reserved_capacity"
    NodeMemTotal        = "node_gpu_memory_total"
    NodeMemUsed         = "node_gpu_memory_used"
    NodePower           = "node_gpu_power_draw"
    NodeTemp            = "node_gpu_temperature"
    NodeUtil            = "node_gpu_utilization"
    NodeMemUtil         = "node_gpu_memory_utilization"
    NodeTensorUtil      = "node_gpu_tensor_core_utilization"
    NodeCountTotal      = "node_gpu_usage_total"
    NodeCountLimit      = "node_gpu_limit"
    NodeReserved        = "node_gpu_reserved_capacity"
    NodeUnreserved      = "node_gpu_unreserved_capacity"
    NodeAvailable       = "node_gpu_available_capacity"
)

var UseE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")

// ExpectedDimsToMetricsIntegTest defines the expected dimensions and metrics for GPU validation
var ExpectedDimsToMetricsIntegTest = map[string][]string{
    "ClusterName": {
        ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
        PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
        NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
    },
    "ClusterName-Namespace": {
        PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
    },
    //"ClusterName-Namespace-Service": {
    //	PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil,
    //},
    "ClusterName-Namespace-PodName": {
        PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
    },
    "ClusterName-ContainerName-Namespace-PodName": {
        ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
    },
    "ClusterName-ContainerName-FullPodName-Namespace-PodName": {
        ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
    },
    "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
        ContainerMemTotal, ContainerMemUsed, ContainerPower, ContainerTemp, ContainerUtil, ContainerMemUtil, ContainerTensorUtil,
    },
    "ClusterName-FullPodName-Namespace-PodName": {
        PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
    },
    "ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
        PodMemTotal, PodMemUsed, PodPower, PodTemp, PodUtil, PodMemUtil, PodTensorUtil,
    },
    "ClusterName-InstanceId-NodeName": {
        NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
        //NodeCountTotal, NodeCountRequest, NodeCountLimit,
    },
    "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
        NodeMemTotal, NodeMemUsed, NodePower, NodeTemp, NodeUtil, NodeMemUtil, NodeTensorUtil,
    },
}

// ValidateGPUMetrics validates GPU metrics using the common validation logic
func ValidateGPUMetrics(env *environment.MetaData) []status.TestResult {
    var testResults []status.TestResult

    // Create a copy of the expected dimensions to metrics map
    expectedDimsToMetrics := make(map[string][]string)
    for k, v := range ExpectedDimsToMetricsIntegTest {
        expectedDimsToMetrics[k] = append([]string{}, v...)
    }

    // Add GPU count metrics if using E2E metrics
    if *UseE2EMetrics {
        expectedDimsToMetrics["ClusterName"] = append(
            expectedDimsToMetrics["ClusterName"],
            PodReserved, PodRequest, PodCountTotal, PodLimit, NodeCountTotal, NodeCountLimit, NodeReserved, NodeUnreserved, NodeAvailable,
        )
        expectedDimsToMetrics["ClusterName-Namespace-PodName"] = append(
            expectedDimsToMetrics["ClusterName-Namespace-PodName"],
            PodCountTotal, PodRequest, PodReserved, PodLimit,
        )
        expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"] = append(
            expectedDimsToMetrics["ClusterName-FullPodName-Namespace-PodName"],
            PodCountTotal, PodRequest, PodReserved, PodLimit,
        )
        expectedDimsToMetrics["ClusterName-InstanceId-NodeName"] = append(
            expectedDimsToMetrics["ClusterName-InstanceId-NodeName"],
            NodeCountLimit, NodeCountTotal, NodeReserved, NodeUnreserved, NodeAvailable,
        )
    }

    // Validate metrics and logs
    testResults = append(testResults, metric.ValidateMetrics(env, GPUMetricIndicator, expectedDimsToMetrics)...)
    testResults = append(testResults, metric.ValidateLogs(env))

    return testResults
}

// ValidateHistogramFormat validates that the logs contain metrics in histogram format
func ValidateHistogramFormat(env *environment.MetaData) status.TestResult {
    testResult := status.TestResult{
        Name:   "histogram-format",
        Status: status.FAILED,
    }

    end := time.Now()
    start := end.Add(time.Duration(-3) * time.Minute)
    group := fmt.Sprintf("/aws/containerinsights/%s/performance", env.EKSClusterName)

    log.Println("Searching for histogram format in log group:", group)

    // Get the instances used for the EKS cluster
    eKSInstances, err := awsservice.GetEKSInstances(env.EKSClusterName)
    if err != nil {
        log.Println("Failed to get EKS instances:", err)
        return testResult
    }

    histogramFound := false
    logCount := 0
    gpuMetricCount := 0

    for _, instance := range eKSInstances {
        stream := *instance.InstanceName

        err = awsservice.ValidateLogs(
            group,
            stream,
            &start,
            &end,
            awsservice.AssertLogsNotEmpty(),
            awsservice.AssertPerLog(
                func(event types.OutputLogEvent) error {
                    logCount++
                    message := *event.Message

                    // Check if the log contains histogram format
                    var logData map[string]interface{}
                    if err := json.Unmarshal([]byte(message), &logData); err != nil {
                        return nil // Skip this log if it's not valid JSON
                    }

                    // Check for GPU metrics with histogram format
                    gpuMetricsInLog := 0
                    for key, value := range logData {
                        if !strings.Contains(key, "_gpu_") {
                            continue
                        }

                        gpuMetricsInLog++
                        gpuMetricCount++

                        // Check if the value is a map with histogram fields
                        valueMap, ok := value.(map[string]interface{})
                        if !ok {
                            continue
                        }

                        // Check for required histogram fields
                        _, hasValues := valueMap["Values"]
                        _, hasCounts := valueMap["Counts"]
                        _, hasMax := valueMap["Max"]
                        _, hasMin := valueMap["Min"]
                        _, hasCount := valueMap["Count"]
                        _, hasSum := valueMap["Sum"]

                        if hasValues && hasCounts && hasMax && hasMin && hasCount && hasSum {
                            histogramFound = true
                            log.Println("Found GPU metric in histogram format:", key)
                            return nil
                        }
                    }

                    return nil // Continue checking other logs
                },
            ),
        )

        if err != nil {
            log.Println("Error validating logs:", err)
        }

        if histogramFound {
            log.Println("Successfully found GPU metric in histogram format")
            testResult.Status = status.SUCCESSFUL
            return testResult
        }
    }

    log.Printf("Processed %d logs, found %d GPU metrics, but none in histogram format", logCount, gpuMetricCount)
    return testResult
}
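As a reference for what ValidateHistogramFormat matches, here is a minimal, self-contained sketch. The JSON event is hand-built for illustration (not captured agent output); it simply exercises the same six-field check the validator applies to every key containing "_gpu_".

package main

import (
    "encoding/json"
    "fmt"
    "strings"
)

// sample mimics one performance-log event whose GPU metric value is an EMF
// histogram carrying the Values/Counts/Max/Min/Count/Sum fields checked above.
const sample = `{"ClusterName":"demo","node_gpu_utilization":{"Values":[1],"Counts":[4],"Max":1,"Min":1,"Count":4,"Sum":4}}`

func main() {
    var logData map[string]interface{}
    if err := json.Unmarshal([]byte(sample), &logData); err != nil {
        panic(err)
    }
    for key, value := range logData {
        if !strings.Contains(key, "_gpu_") {
            continue
        }
        valueMap, ok := value.(map[string]interface{})
        if !ok {
            continue
        }
        // Same six-field check the validator performs.
        complete := true
        for _, field := range []string{"Values", "Counts", "Max", "Min", "Count", "Sum"} {
            if _, has := valueMap[field]; !has {
                complete = false
                break
            }
        }
        if complete {
            fmt.Println("histogram format:", key) // prints: histogram format: node_gpu_utilization
        }
    }
}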
88 changes: 4 additions & 84 deletions test/gpu/nvidia_test.go
@@ -6,86 +6,14 @@
package emf

import (
"flag"
"time"

"github.com/aws/amazon-cloudwatch-agent-test/environment"
"github.com/aws/amazon-cloudwatch-agent-test/test/metric"
"github.com/aws/amazon-cloudwatch-agent-test/test/gpu/common"
"github.com/aws/amazon-cloudwatch-agent-test/test/status"
"github.com/aws/amazon-cloudwatch-agent-test/test/test_runner"
)

-const (
-   gpuMetricIndicator = "_gpu_"

-   containerMemTotal = "container_gpu_memory_total"
-   containerMemUsed  = "container_gpu_memory_used"
-   containerPower    = "container_gpu_power_draw"
-   containerTemp     = "container_gpu_temperature"
-   containerUtil     = "container_gpu_utilization"
-   containerMemUtil  = "container_gpu_memory_utilization"
-   podMemTotal       = "pod_gpu_memory_total"
-   podMemUsed        = "pod_gpu_memory_used"
-   podPower          = "pod_gpu_power_draw"
-   podTemp           = "pod_gpu_temperature"
-   podUtil           = "pod_gpu_utilization"
-   podMemUtil        = "pod_gpu_memory_utilization"
-   podLimit          = "pod_gpu_limit"
-   podRequest        = "pod_gpu_request"
-   podCountTotal     = "pod_gpu_usage_total"
-   podReserved       = "pod_gpu_reserved_capacity"
-   nodeMemTotal      = "node_gpu_memory_total"
-   nodeMemUsed       = "node_gpu_memory_used"
-   nodePower         = "node_gpu_power_draw"
-   nodeTemp          = "node_gpu_temperature"
-   nodeUtil          = "node_gpu_utilization"
-   nodeMemUtil       = "node_gpu_memory_utilization"
-   nodeCountTotal    = "node_gpu_usage_total"
-   nodeCountLimit    = "node_gpu_limit"
-   nodeReserved      = "node_gpu_reserved_capacity"
-)

-var useE2EMetrics = flag.Bool("useE2EMetrics", false, "Use E2E metrics mapping which uses latest build CWA")

-var expectedDimsToMetricsIntegTest = map[string][]string{
-   "ClusterName": {
-       containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
-       podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
-       nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
-   },
-   "ClusterName-Namespace": {
-       podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
-   },
-   //"ClusterName-Namespace-Service": {
-   //	podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
-   //},
-   "ClusterName-Namespace-PodName": {
-       podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
-   },
-   "ClusterName-ContainerName-Namespace-PodName": {
-       containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
-   },
-   "ClusterName-ContainerName-FullPodName-Namespace-PodName": {
-       containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
-   },
-   "ClusterName-ContainerName-FullPodName-GpuDevice-Namespace-PodName": {
-       containerMemTotal, containerMemUsed, containerPower, containerTemp, containerUtil, containerMemUtil,
-   },
-   "ClusterName-FullPodName-Namespace-PodName": {
-       podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
-   },
-   "ClusterName-FullPodName-GpuDevice-Namespace-PodName": {
-       podMemTotal, podMemUsed, podPower, podTemp, podUtil, podMemUtil,
-   },
-   "ClusterName-InstanceId-NodeName": {
-       nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
-       //nodeCountTotal, nodeCountRequest, nodeCountLimit,
-   },
-   "ClusterName-GpuDevice-InstanceId-InstanceType-NodeName": {
-       nodeMemTotal, nodeMemUsed, nodePower, nodeTemp, nodeUtil, nodeMemUtil,
-   },
-}

type NvidiaTestRunner struct {
    test_runner.BaseTestRunner
    testName string
@@ -95,17 +23,9 @@ type NvidiaTestRunner struct {
var _ test_runner.ITestRunner = (*NvidiaTestRunner)(nil)

func (t *NvidiaTestRunner) Validate() status.TestGroupResult {
-   var testResults []status.TestResult
-   expectedDimsToMetrics := expectedDimsToMetricsIntegTest
-   if *useE2EMetrics {
-       // add GPU count metrics
-       expectedDimsToMetricsIntegTest["ClusterName"] = append(expectedDimsToMetricsIntegTest["ClusterName"], podReserved, podRequest, podCountTotal, podLimit, nodeCountTotal, nodeCountLimit, nodeReserved)
-       expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
-       expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"] = append(expectedDimsToMetricsIntegTest["ClusterName-FullPodName-Namespace-PodName"], podCountTotal, podRequest, podReserved, podLimit)
-       expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"] = append(expectedDimsToMetricsIntegTest["ClusterName-InstanceId-NodeName"], nodeCountLimit, nodeCountTotal, nodeReserved)
-   }
-   testResults = append(testResults, metric.ValidateMetrics(t.env, gpuMetricIndicator, expectedDimsToMetrics)...)
-   testResults = append(testResults, metric.ValidateLogs(t.env))
+   // Use the common GPU validation logic
+   testResults := common.ValidateGPUMetrics(t.env)

    return status.TestGroupResult{
        Name:        t.GetTestName(),
        TestResults: testResults,
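The new test/gpu_high_frequency_metrics suite registered in the generator is not included in this diff; presumably its runner composes the two shared helpers from test/gpu/common. The sketch below is hypothetical, with the type name and wiring chosen for illustration only.

// Hypothetical runner for ./test/gpu_high_frequency_metrics; the actual file
// is not part of this diff. It would reuse both shared helpers: the
// dimension/metric validation and the histogram-format check.
type GpuHighFrequencyTestRunner struct {
    test_runner.BaseTestRunner
    env *environment.MetaData
}

func (t *GpuHighFrequencyTestRunner) Validate() status.TestGroupResult {
    testResults := common.ValidateGPUMetrics(t.env)
    // The histogram check is specific to the high-frequency EMF output.
    testResults = append(testResults, common.ValidateHistogramFormat(t.env))

    return status.TestGroupResult{
        Name:        t.GetTestName(),
        TestResults: testResults,
    }
}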