diff --git a/go.mod b/go.mod index 442b9e1de5..dd059b28dc 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/aws/amazon-cloudwatch-agent -go 1.24.7 +go 1.24.9 replace github.com/influxdata/telegraf => github.com/aws/telegraf v0.10.2-0.20250113150713-a2dfaa4cdf6d @@ -9,49 +9,49 @@ replace collectd.org v0.4.0 => github.com/collectd/go-collectd v0.4.0 // Replace with https://github.com/amazon-contributing/opentelemetry-collector-contrib, there are no requirements for all receivers/processors/exporters // to be all replaced since there are some changes that will always be from upstream replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251014190537-ae1729ad22c4 + github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251101051523-f6af9813963a ) replace ( - 
github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware => github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/extension/awsproxy => github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20251014190537-ae1729ad22c4 + github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware => github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/extension/awsproxy => github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20251101051523-f6af9813963a ) replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/awsutil => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/cwlogs => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/k8s => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/metrics => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/metrics v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/proxy => 
github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/xray => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/coreinternal v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/k8sconfig => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/k8sconfig v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/kubelet => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/kubelet v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/metadataproviders => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/metadataproviders v0.0.0-20251014190537-ae1729ad22c4 + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/awsutil => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/cwlogs => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/k8s => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20251101051523-f6af9813963a + 
github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/metrics => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/metrics v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/proxy => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/xray => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/coreinternal v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/k8sconfig => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/k8sconfig v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/kubelet => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/kubelet v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/internal/metadataproviders => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/metadataproviders v0.0.0-20251101051523-f6af9813963a ) replace ( // For clear resource attributes after copy functionality https://github.com/amazon-contributing/opentelemetry-collector-contrib/pull/148 - github.com/open-telemetry/opentelemetry-collector-contrib/pkg/resourcetotelemetry => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/resourcetotelemetry v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20251014190537-ae1729ad22c4 + 
github.com/open-telemetry/opentelemetry-collector-contrib/pkg/resourcetotelemetry => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/resourcetotelemetry v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20251101051523-f6af9813963a // Replace with contrib to revert upstream change https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/20519 - github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20251014190537-ae1729ad22c4 + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20251101051523-f6af9813963a ) replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor => github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourcedetectionprocessor => github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251014190537-ae1729ad22c4 + github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor => github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourcedetectionprocessor => github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251101051523-f6af9813963a ) replace ( - 
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awsxrayreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/jmxreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver v0.0.0-20251014190537-ae1729ad22c4 - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20251014190537-ae1729ad22c4 + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awsxrayreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/jmxreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver 
v0.0.0-20251101051523-f6af9813963a + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20251101051523-f6af9813963a ) // Temporary fix, pending PR https://github.com/shirou/gopsutil/pull/957 @@ -145,6 +145,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor v0.124.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1 + github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.124.1 github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricsgenerationprocessor v0.124.1 @@ -261,7 +262,7 @@ require ( github.com/alecthomas/participle v0.4.1 // indirect github.com/alecthomas/participle/v2 v2.1.4 // indirect github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b // indirect - github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20251014190537-ae1729ad22c4 // indirect + github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20251101051523-f6af9813963a // indirect github.com/antchfx/jsonquery v1.1.5 // indirect github.com/antchfx/xmlquery v1.4.4 // indirect github.com/antchfx/xpath v1.3.4 // indirect diff --git a/go.sum b/go.sum index 19c70558cd..c10adebf26 100644 --- a/go.sum +++ b/go.sum @@ -185,60 +185,60 @@ github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vS github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs= 
github.com/aliyun/alibaba-cloud-sdk-go v1.61.1483 h1:J8HaD+Zpfi1gcel3HCKpoHHEsrcuRrZlSnx7R9SCf5I= github.com/aliyun/alibaba-cloud-sdk-go v1.61.1483/go.mod h1:RcDobYh8k5VP6TNybz9m++gL3ijVI5wueVr0EM10VsU= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251014190537-ae1729ad22c4 h1:0gvkXM8HrRo9fu+34OYLVngN87FtstrYjhIjAcOyfTA= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:8dL1mhunGsDXn59xUlnNn1ydT5wp6Fh5KTvlBEaN2Po= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014190537-ae1729ad22c4 h1:XLDJlsz7glQ0PcWYZ9S664H2Hyy/xaRHIT5gGlQFtEk= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:dSu6d3FZqrAECatXDhvYsQIEAaL1iF+fokrPwCjxhC8= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251014190537-ae1729ad22c4 h1:jn9YrkY2ZLbpT4n6q/EkfSwGH0cx/diKBmlQcYvLTJ8= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:b9TxHHL62ladWlbU6klYIvDjCN3Ee31oWrKlF50fQns= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20251014190537-ae1729ad22c4 h1:YQkkr7FxnF52KvG4CIM8a0A/7m6CIQMmtFHqfd8D/PA= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:R66U9I7kIXbVRsxM2gzH9bbWf7YzVvsa1um7GT+6loA= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20251014190537-ae1729ad22c4 h1:TugvEECBEMzbtRq8lvaSpI3LXohMvFeYwua+R6jRi/U= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:7q+XtzB3gsAqzQtQW4BmvufOJg4QfC7D3iuA4Qmublg= 
-github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20251014190537-ae1729ad22c4 h1:pk1XtdbbpsiUcpdIOGFQUiknwhCbhz1MxEqkbPSwfXM= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:JLn0paY8Ig72gsWixOBeycpgUO96jl/GYisvxfYoFiA= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20251014190537-ae1729ad22c4 h1:NLJ568FXCs5tKlCs3t9qqU/oInSxjt3L7i1Tt4yxHAk= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:RfV8eTcty9NU2j06sjXDD452gQt/Ug06go70E18cvOo= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20251014190537-ae1729ad22c4 h1:1cf+GEG/X8Nnij2LOSr+PFG3JXUEvNfnHg+8pR7HbXk= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:+EUth6yb4uP18ks0zbNZCOeuQfBxpn/NBHCTmnPP9oU= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20251014190537-ae1729ad22c4 h1:JEw+nvfjnw+8MK05tOrc6IO8XCjEIG7Ek+sr6q/V5CQ= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:X9PMDC+hXLzLRF+LLuUXAvmH9lXxWOQKxRAKGdXsNDg= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/metrics v0.0.0-20251014190537-ae1729ad22c4 h1:99sAeYiZXuz9Wqxxv6uCjFhDHTCtlFNV1gEOcsg8Icc= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/metrics v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:Fo5mcaEY9UUKa+3AXuDbDvjjNzwIKkNNMj9/d4snL7A= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20251014190537-ae1729ad22c4 h1:AIyeR2QROEOJWo9f4JGAsXguepFcXHm7l8362pf0SZI= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy 
v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:2Z6Z99nepgSqtFcnSL8gHmKCBXf8Vbs/uYz07V1RpSM= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20251014190537-ae1729ad22c4 h1:Fkg4BmFmBSmU+sMTMIpPYcz4Iv3ExKyS+3zur3nlj5w= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:HzL3aeDed+w/KitqvVPL9YDb9gkoIv1DoIAoVo8Cs2I= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/coreinternal v0.0.0-20251014190537-ae1729ad22c4 h1:90+t3Ykwnsr1uA4R2Hb6s4XR1rFGqIfpAfjnrBMIg0s= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/coreinternal v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:VdBcw8pO4JmyfIW5TMVDnS4wwceqgKrAnOOlxo7/8E0= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/k8sconfig v0.0.0-20251014190537-ae1729ad22c4 h1:goufNQb4FITGBVP0sT1j9QkBT3E8963KSo19w+GzV6Q= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/k8sconfig v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:pNG09N96GY97XzEGTCRRxx0fIJjIir6VsP9e3+oRrJA= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/kubelet v0.0.0-20251014190537-ae1729ad22c4 h1:fn68VhTiMyyWOjFAi4iMi2USr/ZjazZjfYtIK5WyT7k= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/kubelet v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:9mCJrmY6COj5H2zHArFzEIJbVMUYYTjNBekzOh6B6oU= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/metadataproviders v0.0.0-20251014190537-ae1729ad22c4 h1:zOkAOSHqdZbg0BJn2YPsiuopGw2Mkts1733jiNziBkA= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/metadataproviders v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:FLIGEFE1+c9FBMXiUMd1tRgfb3SbTQfBt5N+h+FVFO0= -github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20251014190537-ae1729ad22c4 h1:2c/Tx4hgwxgm+hI9MYB7d7DWCZsHwfNyoDrWL0YAdTE= 
-github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:ez+NlSjxHUXdnWWfilSIXMTPlVGaMrxWYkAtzvHUX7o= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/resourcetotelemetry v0.0.0-20251014190537-ae1729ad22c4 h1:G4kFV95L0j76ZqON6klmCo4rFOcsmMgSaGEdnftO8tg= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/resourcetotelemetry v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:ywrseeE8Ymrr+iRRjj9VMTPdE3ru8fdM2CEOdVihQ/0= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20251014190537-ae1729ad22c4 h1:MCXbvu8UqjIpBetU/Nb/1xbfO2far9wVYLu3zl3Sen4= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:wrovQ7lNniDkqAEo7vRNgKkTrWCSr7Kh1NAGMpjPUEU= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20251014190537-ae1729ad22c4 h1:hnmkqTGIaF57iSCH/w11b/Ea0mLjjdiTrnu5DAqlsOM= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:brdsCvvWCSGXvPUQjC80J5GET69/OxYVOrglj/ZsQjw= -github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20251014190537-ae1729ad22c4 h1:3QNi9PB6LxGZRc52TvxyHmHbfEaIxe82cg2ts5k4a3I= -github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:Wvs2QPuB4ngUiOjrJpYWLqfU8X0Z27s33uMKP4YHQmE= -github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251014190537-ae1729ad22c4 h1:AbFzE4JNvrwGoyDCgmg3WiTaA1bxPc6xzJCwgHEtzNw= -github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:idk0SX/ZWccyRfPyAKPu1uVvd+KBMT0pE75HHFogitY= 
-github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251014190537-ae1729ad22c4 h1:kMAB6h54Q/OJDQEP+XCxjcGnyOHq/KuyGiUliGsUFJI= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:2/Yuy1ePzxKLoHdJIS/BBdWfMD+wpkudvxe1HXZuMAM= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251014190537-ae1729ad22c4 h1:2YfnGwsnhAqUNrOsye+w2xI9rya3KT7RucYFwDDAs30= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:431fc3JgruV7R3yhAzu7w0fdPaBp1Tbn4RV+8R4Dtdw= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251014190537-ae1729ad22c4 h1:81WCB7g7vOUGs5HSNpuHrzMCXXH+XyOJHju/NkygKms= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:y2Z8N/XkGpClvZdkFXt3l7AYRd8Il6CRJBqrGeIbeKs= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver v0.0.0-20251014190537-ae1729ad22c4 h1:tcJGE27iOBa7aabOfWeR4FMNyhMrBZ+0rfGnVogcISc= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:/89w1XLkfgN/ViSi11FL/2HJ6Zg8zYEKHKuNrTZkQD0= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20251014190537-ae1729ad22c4 h1:DhVjwA1aWTdPHsXJ310n62Ivje2wrWYy0zZyTwWd1SQ= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20251014190537-ae1729ad22c4/go.mod h1:l6groa6VurnKlq2dhmujTy7ZgwVZ2oKCAXzvMxRMhME= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251101051523-f6af9813963a h1:bNJ0huJ4/4V2kUHDFTXkPuH2G+s3gWETQ/HWj7hmfdY= 
+github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20251101051523-f6af9813963a/go.mod h1:h/y8k16N0t89KjloCpZS8ow/H3sd4iU4w7U7anygxcc= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251101051523-f6af9813963a h1:yxpsgZvN96GAv4SKIe3K7aD/mK6KSJW2rV9Xjqqb1Kk= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251101051523-f6af9813963a/go.mod h1:+cEFxvejxnFAPRum2td5Ai/MbtPlMzqcMfitbNbAfCo= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251101051523-f6af9813963a h1:OOdoGNkzWtyS4/MblbiOvz5vXbvoZzUyL1kI6P29F4Q= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20251101051523-f6af9813963a/go.mod h1:UAZddgM5NKCnsVmx4msXHV3jTaJLD9LqbPmV5TctK4k= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20251101051523-f6af9813963a h1:3g0r6M4gxNCJ6cxTo1pNBdc7kfedqzODqgpnvD5jtvE= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20251101051523-f6af9813963a/go.mod h1:SpBtq2f0PclYNg6612tjc5MIZFUykEYyND0Tq7l0mzM= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20251101051523-f6af9813963a h1:hKC+h9U0dk3kHOos+Cg/+vp1zEDtqu8N9FkSHQ1KNGA= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20251101051523-f6af9813963a/go.mod h1:orUknJg46Fszwz8LI733CI3uyku80NBdWqzzSgyzWKc= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20251101051523-f6af9813963a h1:A/Y/2CVN0kasB2YYOMyqsgdgpRl1xFMQZdpIYMCRdzo= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20251101051523-f6af9813963a/go.mod h1:uQv03B/EZ5CVsYWrKqaJbiIMOY2DK6Ox2xjCved+YSM= 
+github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20251101051523-f6af9813963a h1:SPJIMnLLx/UL0JwPlhPyGOlVDd5fpgr4W0eExHSuOC4= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20251101051523-f6af9813963a/go.mod h1:UhjJJwNhJ2cQ1tQbFXgw3KVE4FsSp/C7KgJMr5c7Nvo= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20251101051523-f6af9813963a h1:vGG+/J7t54KmBcEercXrf83raATLEqdEJYINjMTHO5o= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20251101051523-f6af9813963a/go.mod h1:CdK/SDdJRDrk4G3YvlOWjbMR7BGAUUWEquHqVDLTzA4= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20251101051523-f6af9813963a h1:eBmF5mmpPsGNJq8PK03qw2kPz+RcvbeiZr3t13pzfoY= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20251101051523-f6af9813963a/go.mod h1:Naqv+7HWtDgPbeT2CzUHStDFy3nQ55Y0WrtBPOvKp8Q= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/metrics v0.0.0-20251101051523-f6af9813963a h1:7fA/9Rko8m9LE2mlZwT7uoeu6SILCHlgOgVlyMu5Oqc= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/metrics v0.0.0-20251101051523-f6af9813963a/go.mod h1:+HNk4dUzu9Fy7FqxdfvxNOoMzvYgVTzUfzVh1ApTKmI= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20251101051523-f6af9813963a h1:uYd+2vQbNYJWk8mr5ksDcS+YD9mQW/9vlIpfL77QNJ8= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20251101051523-f6af9813963a/go.mod h1:cU6+bc9gLvH3AATXFf6dE5nEQZKeHXIlLiwTxDq3g7A= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20251101051523-f6af9813963a h1:2YPbpTUWVrIULrcHqjVRonoz+Uc14v6JgcL0zxF9B30= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray 
v0.0.0-20251101051523-f6af9813963a/go.mod h1:9VGjgdnP8thnbUWzTzOKwv0Ae8Rv6X5rzFY5LqRqNQQ= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/coreinternal v0.0.0-20251101051523-f6af9813963a h1:FbbBbTJQ7DcnblwnDhSsKig3dUduhSMo7m9Ps45Amcs= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/coreinternal v0.0.0-20251101051523-f6af9813963a/go.mod h1:KB9LZl4Tk6EyP1cRmH8tubLIlpS2nF70qOWSY+b3vHw= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/k8sconfig v0.0.0-20251101051523-f6af9813963a h1:p2ylzNI/wvPP1jREC/YMCU9a6747KexxXiuBoLs08sY= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/k8sconfig v0.0.0-20251101051523-f6af9813963a/go.mod h1:DU5DozRx1wR9D4Vxwe/CRSNMwz15rub6r6TEo7rA2Fo= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/kubelet v0.0.0-20251101051523-f6af9813963a h1:oDpxdgpsEKl0UZz8C0pNwrmcn9pKLv6w/la8lOi31ds= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/kubelet v0.0.0-20251101051523-f6af9813963a/go.mod h1:VAIY5q8ntx1MZREDXmVsQp/qIyqx3vq6UaPeWldNuRk= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/metadataproviders v0.0.0-20251101051523-f6af9813963a h1:x0+5/4+ts02XgXNhwiKJFW6cxkljz/nF2w9mae4CJSs= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/metadataproviders v0.0.0-20251101051523-f6af9813963a/go.mod h1:aYprF4me3eU4vuCY4RcMF2+9JIbHX6+qpCs2NSKKke8= +github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20251101051523-f6af9813963a h1:DoO6IrcFDXygZVbA28xlekKffynns3L4lt8LT4OhNb8= +github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20251101051523-f6af9813963a/go.mod h1:TuE5ogQgw7qLh+Y+eKj3utx98fjnVZBNFb5P2C9pc+M= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/resourcetotelemetry v0.0.0-20251101051523-f6af9813963a h1:tZw9X6j3g62NgyCQvC7Lb5qIUCFM946qFrtXt+B4Cfg= 
+github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/resourcetotelemetry v0.0.0-20251101051523-f6af9813963a/go.mod h1:ugjDletBT4ZGEWiDfMxYl0iP9ALOx4xawNXYOTxzyB8= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20251101051523-f6af9813963a h1:Bk0wjlcv5IXHYYmy1b4d4Z95x02XqVFkt/CAWPbqMKc= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20251101051523-f6af9813963a/go.mod h1:pvsUAvdMnMMRXSLA3vdqM4CiGOF1xSMsiVstV80hXr8= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20251101051523-f6af9813963a h1:geNduTeY+6m/bSrzvDV6Ar8VS67UnunKzkjvRb311Rg= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20251101051523-f6af9813963a/go.mod h1:41eDZVHh5A1BLJ0Dy3oQIrS+du1Zu+rpKEijUazlnCI= +github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20251101051523-f6af9813963a h1:3HBSq2askj4wLYfVZvMYsww3Fjz9jlup5EJ64nMyZvk= +github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20251101051523-f6af9813963a/go.mod h1:YTEg21QFJQ1HP9c+w1qnORKd0cLjFqlNONO4GE4RJz4= +github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251101051523-f6af9813963a h1:8ly/LJ3jxuRsznJ0F2A5Cl2GoHkGFjwssJWlb+1szg8= +github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20251101051523-f6af9813963a/go.mod h1:I41hi6XJwoAAauMtkaCQBtn0Fom+HE2KbIZjxauVavU= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251101051523-f6af9813963a h1:d2OVQ/qXBfX3X+D5dOhh96hnuhiYH5R1+cC7KDYHnMg= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251101051523-f6af9813963a/go.mod h1:Yjm9Vh3sLSFKA7o28PAidnlioTWOyHvpQ9rsDR4lwNM= 
+github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251101051523-f6af9813963a h1:AV/ibYXLVswtT78ygIuCn+GxpIP3YiipEGIKumOyepc= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20251101051523-f6af9813963a/go.mod h1:dnhJ7NI0b85XA4yBAHgkTqnC7x4LDeAIjg6I435n0Jc= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251101051523-f6af9813963a h1:dN8Y+YjjXNoED9B+Ut+KK9lv/L1BwC6SPp+nL/qA9DA= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20251101051523-f6af9813963a/go.mod h1:CEwLaA2QDABq95X2IgQkt2cB+vvlJbMrB64LnJwSKHA= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver v0.0.0-20251101051523-f6af9813963a h1:aUc0uAD2G5zVhIPbfkzuqgO6v+OPHya2Gzn9CBkjU+c= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver v0.0.0-20251101051523-f6af9813963a/go.mod h1:SGDGG+izRRsBNyC3ws1v1ZPIHIPgry6O03UrtF7akgg= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20251101051523-f6af9813963a h1:sqUVg6nZRXQL5UV0eQ2h9sjsAks0XgaLlG8KcvqpD58= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20251101051523-f6af9813963a/go.mod h1:Qjc9QUa9UuVy+diYfC+hGcOZNt+4JPFqpqAzXlnuPQI= github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ= github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9/go.mod h1:eliMa/PW+RDr2QLWRmLH1R1ZA4RInpmvOzDDXtaIZkc= github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= @@ -1278,6 +1278,8 @@ github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatoratep github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.124.1/go.mod h1:bp3Y5GT4dkGWRGEZqKgfanyk6ZSsVGNY5aNDvX4c8WE= 
github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1 h1:qkqiqLwfg7hj+oDYvpmMD64p+poaxXwo654ZE44uPm4= github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1/go.mod h1:B/GP3l4Y1qNsNtWVIzpwS8jWB1Nn/vx0sFBlVDkWt9E= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1 h1:gYlUUIR+lzLQCpj5phh+Ogmk5BRaOrEuKGjIixCk89I= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1/go.mod h1:bkAXtBtShDOA8SuF8IpYbhx1BYWUEE1rW10HXXEXW/4= github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1 h1:6MAKxLXfQWHEadn9AgY1jWdKFTJkLYVBa+/h3Rk23lE= github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1/go.mod h1:6q2oIAtCuX9HklnqGPO8sWPoTAjhZX1x23O0aTR/zd0= github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.124.1 h1:esSFJIhlZaZslW9EYY/Ss5zUnfkuN2qiS+7ujk73/gU= diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 6602971a7f..acee94c653 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -19,6 +19,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricsgenerationprocessor" @@ -107,6 +108,7 @@ func Factories() (otelcol.Factories, 
error) { gpuattributes.NewFactory(), kueueattributes.NewFactory(), groupbytraceprocessor.NewFactory(), + groupbyattrsprocessor.NewFactory(), k8sattributesprocessor.NewFactory(), memorylimiterprocessor.NewFactory(), metricsgenerationprocessor.NewFactory(), diff --git a/service/defaultcomponents/components_test.go b/service/defaultcomponents/components_test.go index 98e789ebd4..c77d7e43da 100644 --- a/service/defaultcomponents/components_test.go +++ b/service/defaultcomponents/components_test.go @@ -54,6 +54,7 @@ func TestComponents(t *testing.T) { "gpuattributes", "kueueattributes", "groupbytrace", + "groupbyattrs", "k8sattributes", "memory_limiter", "metricstransform", diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml index e879d58968..6cfea39dde 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml @@ -1476,6 +1476,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml index 5d7236220e..039ea34a49 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml @@ -1477,6 +1477,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml 
b/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml index e879d58968..6cfea39dde 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_fallback_and_eks_config.yaml @@ -1476,6 +1476,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml index e879d58968..6cfea39dde 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_over_fallback_config.yaml @@ -1476,6 +1476,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml index 3074a5d230..9a7fbc2b0f 100644 --- a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml +++ b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml @@ -196,6 +196,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml b/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml index abc6f62001..a044866ec0 100644 --- 
a/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml +++ b/translator/tocwconfig/sampleConfig/container_insights_jmx.yaml @@ -509,6 +509,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: true + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index 6f63f5cebe..620bc80d7d 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -650,6 +650,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 2777967e14..6bce1ffd50 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -1409,6 +1409,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: true + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.conf b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.conf new file mode 100644 index 0000000000..007bb60efb --- /dev/null +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.conf 
@@ -0,0 +1,27 @@ +[agent] + collection_jitter = "0s" + debug = false + flush_interval = "1s" + flush_jitter = "0s" + hostname = "host_name_from_env" + interval = "60s" + logfile = "" + logtarget = "lumberjack" + metric_batch_size = 1000 + metric_buffer_limit = 10000 + omit_hostname = false + precision = "" + quiet = false + round_interval = false + +[inputs] + +[outputs] + + [[outputs.cloudwatchlogs]] + endpoint_override = "https://fake_endpoint" + force_flush_interval = "5s" + log_stream_name = "host_name_from_env" + region = "us-east-1" + +[processors] diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.json b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.json new file mode 100644 index 0000000000..a408ad5f2b --- /dev/null +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.json @@ -0,0 +1,21 @@ +{ + "agent": { + "region": "us-east-1" + }, + "logs": { + "metrics_collected": { + "emf": { + }, + "kubernetes": { + "cluster_name": "TestCluster", + "metrics_collection_interval": 30, + "disable_metric_extraction": true, + "enhanced_container_insights": true, + "kueue_container_insights": false, + "accelerated_compute_gpu_metrics_collection_interval": 1 + } + }, + "force_flush_interval": 5, + "endpoint_override":"https://fake_endpoint" + } +} diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.yaml new file mode 100644 index 0000000000..7de9ef3b46 --- /dev/null +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_high_frequency_config.yaml @@ -0,0 +1,1532 @@ +exporters: + awscloudwatchlogs/emf_logs: + certificate_file_path: "" + emf_only: true + endpoint: https://fake_endpoint + external_id: "" + imds_retries: 2 + local_mode: true + log_group_name: emf/logs/default + log_retention: 0 + 
log_stream_name: host_name_from_env + max_retries: 2 + middleware: agenthealth/logs + no_verify_ssl: false + num_workers: 8 + profile: default + proxy_address: "" + raw_log: true + region: us-east-1 + request_timeout_seconds: 30 + resource_arn: "" + retry_on_failure: + enabled: true + initial_interval: 5s + max_elapsed_time: 5m0s + max_interval: 30s + multiplier: 1.5 + randomization_factor: 0.5 + role_arn: "" + sending_queue: + block_on_overflow: false + blocking: false + enabled: true + num_consumers: 1 + queue_size: 1000 + sizer: {} + wait_for_result: false + shared_credentials_file: + - /root/.aws/credentials + awsemf/containerinsights: + add_entity: true + certificate_file_path: "" + detailed_metrics: false + dimension_rollup_option: NoDimensionRollup + disable_metric_extraction: true + eks_fargate_container_insights_enabled: false + endpoint: https://fake_endpoint + enhanced_container_insights: true + external_id: "" + imds_retries: 2 + local_mode: true + log_group_name: /aws/containerinsights/{ClusterName}/performance + log_retention: 0 + log_stream_name: '{NodeName}' + max_retries: 2 + metric_as_distribution: + - container_gpu_utilization + - container_gpu_memory_utilization + - container_gpu_memory_total + - container_gpu_memory_used + - container_gpu_power_draw + - container_gpu_temperature + - container_gpu_tensor_core_utilization + - pod_gpu_utilization + - pod_gpu_memory_utilization + - pod_gpu_memory_total + - pod_gpu_memory_used + - pod_gpu_power_draw + - pod_gpu_temperature + - pod_gpu_tensor_core_utilization + - node_gpu_utilization + - node_gpu_memory_utilization + - node_gpu_memory_total + - node_gpu_memory_used + - node_gpu_power_draw + - node_gpu_temperature + - node_gpu_tensor_core_utilization + metric_declarations: + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - Namespace + - PodName + metric_name_selectors: + - container_cpu_utilization + - 
container_cpu_utilization_over_container_limit + - container_cpu_limit + - container_cpu_request + - container_memory_utilization + - container_memory_utilization_over_container_limit + - container_memory_failures_total + - container_memory_limit + - container_memory_request + - container_filesystem_usage + - container_filesystem_available + - container_filesystem_utilization + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - - ClusterName + - FullPodName + - Namespace + - PodName + metric_name_selectors: + - pod_cpu_utilization + - pod_memory_utilization + - pod_network_rx_bytes + - pod_network_tx_bytes + - pod_cpu_utilization_over_pod_limit + - pod_memory_utilization_over_pod_limit + - dimensions: + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - PodName + - - ClusterName + - Namespace + - - ClusterName + metric_name_selectors: + - pod_interface_network_rx_dropped + - pod_interface_network_tx_dropped + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - Service + metric_name_selectors: + - pod_cpu_reserved_capacity + - pod_memory_reserved_capacity + - pod_number_of_container_restarts + - pod_number_of_containers + - pod_number_of_running_containers + - pod_status_ready + - pod_status_scheduled + - pod_status_running + - pod_status_pending + - pod_status_failed + - pod_status_unknown + - pod_status_succeeded + - pod_memory_request + - pod_memory_limit + - pod_cpu_limit + - pod_cpu_request + - pod_cpu_usage_total + - pod_memory_working_set + - pod_container_status_running + - pod_container_status_terminated + - pod_container_status_waiting + - pod_container_status_waiting_reason_crash_loop_back_off + - pod_container_status_waiting_reason_image_pull_error + - 
pod_container_status_waiting_reason_start_error + - pod_container_status_waiting_reason_create_container_error + - pod_container_status_waiting_reason_create_container_config_error + - pod_container_status_terminated_reason_oom_killed + - pod_gpu_request + - pod_gpu_limit + - pod_gpu_usage_total + - pod_gpu_reserved_capacity + - pod_neuroncore_request + - pod_neuroncore_limit + - pod_neuroncore_usage_total + - pod_neuroncore_reserved_capacity + - pod_efa_request + - pod_efa_limit + - pod_efa_usage_total + - pod_efa_reserved_capacity + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + metric_name_selectors: + - node_cpu_utilization + - node_memory_utilization + - node_network_total_bytes + - node_cpu_reserved_capacity + - node_memory_reserved_capacity + - node_number_of_running_pods + - node_number_of_running_containers + - node_cpu_usage_total + - node_cpu_limit + - node_memory_working_set + - node_memory_limit + - node_status_condition_ready + - node_status_condition_disk_pressure + - node_status_condition_memory_pressure + - node_status_condition_pid_pressure + - node_status_condition_network_unavailable + - node_status_condition_unknown + - node_status_capacity_pods + - node_status_allocatable_pods + - node_gpu_limit + - node_gpu_usage_total + - node_gpu_reserved_capacity + - node_gpu_unreserved_capacity + - node_gpu_available_capacity + - node_neuroncore_limit + - node_neuroncore_usage_total + - node_neuroncore_reserved_capacity + - node_neuroncore_unreserved_capacity + - node_neuroncore_available_capacity + - node_efa_limit + - node_efa_usage_total + - node_efa_reserved_capacity + - node_efa_unreserved_capacity + - node_efa_available_capacity + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + metric_name_selectors: + - node_interface_network_rx_dropped + - node_interface_network_tx_dropped + - node_diskio_io_service_bytes_total + - node_diskio_io_serviced_total + - 
hyperpod_node_health_status_schedulable + - hyperpod_node_health_status_unschedulable_pending_replacement + - hyperpod_node_health_status_unschedulable_pending_reboot + - hyperpod_node_health_status_unschedulable + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + metric_name_selectors: + - node_filesystem_utilization + - node_filesystem_inodes + - node_filesystem_inodes_free + - dimensions: + - - ClusterName + - Namespace + - Service + - - ClusterName + metric_name_selectors: + - service_number_of_running_pods + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + metric_name_selectors: + - replicas_desired + - replicas_ready + - status_replicas_available + - status_replicas_unavailable + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + metric_name_selectors: + - daemonset_status_number_available + - daemonset_status_number_unavailable + - dimensions: + - - ClusterName + - Namespace + - - ClusterName + metric_name_selectors: + - namespace_number_of_running_pods + - namespace_ingress_count + - dimensions: + - - ClusterName + metric_name_selectors: + - cluster_node_count + - cluster_failed_node_count + - cluster_number_of_running_pods + - dimensions: + - - ClusterName + - endpoint + - - ClusterName + metric_name_selectors: + - apiserver_storage_size_bytes + - apiserver_storage_db_total_size_in_bytes + - etcd_db_total_size_in_bytes + - dimensions: + - - ClusterName + - resource + - - ClusterName + metric_name_selectors: + - apiserver_storage_list_duration_seconds + - apiserver_longrunning_requests + - apiserver_storage_objects + - dimensions: + - - ClusterName + - verb + - - ClusterName + metric_name_selectors: + - apiserver_request_duration_seconds + - rest_client_request_duration_seconds + - dimensions: + - - ClusterName + - code + - verb + - - ClusterName + metric_name_selectors: + - apiserver_request_total + - apiserver_request_total_5xx + - dimensions: + - - ClusterName + - 
operation + - - ClusterName + metric_name_selectors: + - apiserver_admission_controller_admission_duration_seconds + - apiserver_admission_step_admission_duration_seconds + - etcd_request_duration_seconds + - dimensions: + - - ClusterName + - code + - method + - - ClusterName + metric_name_selectors: + - rest_client_requests_total + - dimensions: + - - ClusterName + - request_kind + - - ClusterName + metric_name_selectors: + - apiserver_current_inflight_requests + - apiserver_current_inqueue_requests + - dimensions: + - - ClusterName + - name + - - ClusterName + metric_name_selectors: + - apiserver_admission_webhook_admission_duration_seconds + - dimensions: + - - ClusterName + - group + - - ClusterName + metric_name_selectors: + - apiserver_requested_deprecated_apis + - dimensions: + - - ClusterName + - reason + - - ClusterName + metric_name_selectors: + - apiserver_flowcontrol_rejected_requests_total + - dimensions: + - - ClusterName + - priority_level + - - ClusterName + metric_name_selectors: + - apiserver_flowcontrol_request_concurrency_limit + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - GpuDevice + - Namespace + - PodName + metric_name_selectors: + - container_gpu_utilization + - container_gpu_memory_utilization + - container_gpu_memory_total + - container_gpu_memory_used + - container_gpu_power_draw + - container_gpu_temperature + - container_gpu_tensor_core_utilization + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - FullPodName + - GpuDevice + - Namespace + - PodName + metric_name_selectors: + - pod_gpu_utilization + - pod_gpu_memory_utilization + - pod_gpu_memory_total + - 
pod_gpu_memory_used + - pod_gpu_power_draw + - pod_gpu_temperature + - pod_gpu_tensor_core_utilization + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - GpuDevice + - InstanceId + - InstanceType + - NodeName + metric_name_selectors: + - node_gpu_utilization + - node_gpu_memory_utilization + - node_gpu_memory_total + - node_gpu_memory_used + - node_gpu_power_draw + - node_gpu_temperature + - node_gpu_tensor_core_utilization + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - NeuronCore + - NeuronDevice + - PodName + metric_name_selectors: + - container_neuroncore_utilization + - container_neuroncore_memory_usage_total + - container_neuroncore_memory_usage_constants + - container_neuroncore_memory_usage_model_code + - container_neuroncore_memory_usage_model_shared_scratchpad + - container_neuroncore_memory_usage_runtime_memory + - container_neuroncore_memory_usage_tensors + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - NeuronDevice + - PodName + metric_name_selectors: + - container_neurondevice_hw_ecc_events_total + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - NeuronCore + - NeuronDevice + - PodName + metric_name_selectors: + - pod_neuroncore_utilization + - pod_neuroncore_memory_usage_total + - pod_neuroncore_memory_usage_constants + - pod_neuroncore_memory_usage_model_code + - 
pod_neuroncore_memory_usage_model_shared_scratchpad + - pod_neuroncore_memory_usage_runtime_memory + - pod_neuroncore_memory_usage_tensors + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - NeuronDevice + - PodName + metric_name_selectors: + - pod_neurondevice_hw_ecc_events_total + - dimensions: + - - ClusterName + - - ClusterName + - UltraServer + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - InstanceId + - InstanceType + - NeuronCore + - NeuronDevice + - NodeName + metric_name_selectors: + - node_neuroncore_utilization + - node_neuroncore_memory_usage_total + - node_neuroncore_memory_usage_constants + - node_neuroncore_memory_usage_model_code + - node_neuroncore_memory_usage_model_shared_scratchpad + - node_neuroncore_memory_usage_runtime_memory + - node_neuroncore_memory_usage_tensors + - dimensions: + - - ClusterName + - - ClusterName + - UltraServer + - - ClusterName + - InstanceId + - NodeName + metric_name_selectors: + - node_neuron_execution_errors_total + - node_neurondevice_runtime_memory_used_bytes + - node_neuron_execution_latency + - dimensions: + - - ClusterName + - - ClusterName + - UltraServer + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - InstanceId + - NeuronDevice + - NodeName + metric_name_selectors: + - node_neurondevice_hw_ecc_events_total + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - NetworkInterfaceId + - PodName + metric_name_selectors: + - container_efa_rx_bytes + - container_efa_tx_bytes + - container_efa_rx_dropped + - container_efa_rdma_read_bytes + - container_efa_rdma_write_bytes + - 
container_efa_rdma_write_recv_bytes + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - NetworkInterfaceId + - PodName + metric_name_selectors: + - pod_efa_rx_bytes + - pod_efa_tx_bytes + - pod_efa_rx_dropped + - pod_efa_rdma_read_bytes + - pod_efa_rdma_write_bytes + - pod_efa_rdma_write_recv_bytes + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - InstanceId + - InstanceType + - NetworkInterfaceId + - NodeName + metric_name_selectors: + - node_efa_rx_bytes + - node_efa_tx_bytes + - node_efa_rx_dropped + - node_efa_rdma_read_bytes + - node_efa_rdma_write_bytes + - node_efa_rdma_write_recv_bytes + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - InstanceId + - NodeName + - VolumeId + metric_name_selectors: + - node_diskio_ebs_total_read_ops + - node_diskio_ebs_total_write_ops + - node_diskio_ebs_total_read_bytes + - node_diskio_ebs_total_write_bytes + - node_diskio_ebs_total_read_time + - node_diskio_ebs_total_write_time + - node_diskio_ebs_volume_performance_exceeded_iops + - node_diskio_ebs_volume_performance_exceeded_tp + - node_diskio_ebs_ec2_instance_performance_exceeded_iops + - node_diskio_ebs_ec2_instance_performance_exceeded_tp + - node_diskio_ebs_volume_queue_length + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - PersistentVolumeClaimName + metric_name_selectors: + - persistent_volume_claim_status_bound + - persistent_volume_claim_status_lost + - persistent_volume_claim_status_pending + - persistent_volume_claim_count + - dimensions: + - - ClusterName + metric_name_selectors: + - persistent_volume_count + metric_descriptors: + - metric_name: 
apiserver_admission_controller_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_step_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_webhook_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_current_inflight_requests + overwrite: true + unit: Count + - metric_name: apiserver_current_inqueue_requests + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_rejected_requests_total + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_request_concurrency_limit + overwrite: true + unit: Count + - metric_name: apiserver_longrunning_requests + overwrite: true + unit: Count + - metric_name: apiserver_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_request_total + overwrite: true + unit: Count + - metric_name: apiserver_request_total_5xx + overwrite: true + unit: Count + - metric_name: apiserver_requested_deprecated_apis + overwrite: true + unit: Count + - metric_name: apiserver_storage_objects + overwrite: true + unit: Count + - metric_name: etcd_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_list_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_db_total_size_in_bytes + overwrite: true + unit: Bytes + - metric_name: apiserver_storage_size_bytes + overwrite: true + unit: Bytes + - metric_name: etcd_db_total_size_in_bytes + overwrite: true + unit: Bytes + - metric_name: rest_client_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: rest_client_requests_total + overwrite: true + unit: Count + middleware: agenthealth/logs + namespace: ContainerInsights + no_verify_ssl: false + num_workers: 8 + output_destination: cloudwatch + parse_json_encoded_attr_values: + - Sources + - kubernetes + profile: default + proxy_address: "" + region: us-east-1 + 
request_timeout_seconds: 30 + resource_arn: "" + resource_to_telemetry_conversion: + enabled: true + retain_initial_value_of_delta_metric: false + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + version: "0" +extensions: + agenthealth/logs: + is_usage_data_enabled: true + stats: + operations: + - PutLogEvents + usage_flags: + mode: OP + region_type: ACJ + agenthealth/statuscode: + is_status_code_enabled: true + is_usage_data_enabled: true + stats: + usage_flags: + mode: OP + region_type: ACJ + entitystore: + mode: onPremise + profile: default + region: us-east-1 + shared_credential_file: /root/.aws/credentials +processors: + awsentity/resource/containerinsights: + entity_type: Resource + platform: onPremise + batch/containerinsights: + metadata_cardinality_limit: 1000 + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 1m0s + batch/emf_logs: + metadata_cardinality_limit: 1000 + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 5s + filter/containerinsights: + error_mode: propagate + logs: {} + metrics: + exclude: + match_type: strict + metric_names: + - up + - scrape_duration_seconds + - scrape_samples_scraped + - scrape_series_added + - scrape_samples_post_metric_relabeling + spans: {} + traces: {} + gpuattributes/containerinsights: {} + groupbyattrs/containerinsights: + keys: [] + metricstransform/containerinsights: + transforms: + - action: insert + aggregation_type: "" + experimental_match_labels: + code: ^5.* + include: apiserver_request_total + match_type: regexp + new_name: apiserver_request_total_5xx + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_ec2_exceeded_tp_seconds_total + match_type: "" + new_name: node_diskio_ebs_ec2_instance_performance_exceeded_tp + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: 
aws_ebs_csi_volume_queue_length + match_type: "" + new_name: node_diskio_ebs_volume_queue_length + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_write_ops_total + match_type: "" + new_name: node_diskio_ebs_total_write_ops + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_read_bytes_total + match_type: "" + new_name: node_diskio_ebs_total_read_bytes + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_write_bytes_total + match_type: "" + new_name: node_diskio_ebs_total_write_bytes + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_read_seconds_total + match_type: "" + new_name: node_diskio_ebs_total_read_time + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_exceeded_tp_seconds_total + match_type: "" + new_name: node_diskio_ebs_volume_performance_exceeded_tp + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_read_ops_total + match_type: "" + new_name: 
node_diskio_ebs_total_read_ops + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_write_seconds_total + match_type: "" + new_name: node_diskio_ebs_total_write_time + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_exceeded_iops_seconds_total + match_type: "" + new_name: node_diskio_ebs_volume_performance_exceeded_iops + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: update + aggregation_type: "" + include: aws_ebs_csi_ec2_exceeded_iops_seconds_total + match_type: "" + new_name: node_diskio_ebs_ec2_instance_performance_exceeded_iops + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeEBS + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: container_gpu_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: pod_gpu_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: node_gpu_utilization + operations: + - action: add_label + aggregation_type: 
"" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: container_gpu_memory_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: pod_gpu_memory_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: node_gpu_memory_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: container_gpu_memory_used + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: 
"" + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: pod_gpu_memory_used + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: node_gpu_memory_used + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: container_gpu_memory_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: pod_gpu_memory_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: node_gpu_memory_total + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" 
+ label_value: "" + new_label: Type + new_value: NodeGPU + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: container_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: pod_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: node_gpu_temperature + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: container_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: pod_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: node_gpu_power_draw + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + 
new_value: NodeGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + match_type: "" + new_name: container_gpu_tensor_core_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: ContainerGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + match_type: "" + new_name: pod_gpu_tensor_core_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: PodGPU + submatch_case: "" + - action: insert + aggregation_type: "" + include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + match_type: "" + new_name: node_gpu_tensor_core_utilization + operations: + - action: add_label + aggregation_type: "" + experimental_scale: 0 + label: "" + label_value: "" + new_label: Type + new_value: NodeGPU + submatch_case: "" + - action: update + aggregation_type: "" + include: execution_status_total + match_type: "" + new_name: neuron_execution_status + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_constants + match_type: "" + new_name: neuroncore_memory_usage_constants + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_model_code + match_type: "" + new_name: neuroncore_memory_usage_model_code + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_model_shared_scratchpad + match_type: "" + new_name: neuroncore_memory_usage_model_shared_scratchpad + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_memory_usage_runtime_memory + match_type: "" + new_name: neuroncore_memory_usage_runtime_memory + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" 
+ include: neuroncore_memory_usage_tensors + match_type: "" + new_name: neuroncore_memory_usage_tensors + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: instance_info + match_type: "" + new_name: instance_info + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuron_hardware + match_type: "" + new_name: neuron_hardware + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: execution_errors_total + match_type: "" + new_name: neuron_execution_errors + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuron_runtime_memory_used_bytes + match_type: "" + new_name: neurondevice_runtime_memory_used_bytes + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: neuroncore_utilization_ratio + match_type: "" + new_name: neuroncore_utilization + operations: + - action: experimental_scale_value + aggregation_type: "" + experimental_scale: 100 + label: "" + label_value: "" + new_label: "" + new_value: "" + submatch_case: "" + - action: update + aggregation_type: "" + include: hardware_ecc_events_total + match_type: "" + new_name: neurondevice_hw_ecc_events + operations: [] + submatch_case: "" + - action: update + aggregation_type: "" + include: execution_latency_seconds + match_type: "" + new_name: neuron_execution_latency + operations: [] + submatch_case: "" +receivers: + awscontainerinsightreceiver: + accelerated_compute_gpu_metrics_collection_interval: 1s + accelerated_compute_metrics: true + add_container_name_metric_label: true + add_full_pod_name_metric_label: true + add_service_as_attribute: true + certificate_file_path: "" + cluster_name: TestCluster + collection_interval: 30s + collection_role: ALL + container_orchestrator: eks + enable_control_plane_metrics: true + endpoint: "" + external_id: "" + host_ip: "" + host_name: "" + imds_retries: 2 + kube_config_path: "" + leader_lock_name: 
cwagent-clusterleader + leader_lock_using_config_map_only: true + local_mode: true + max_retries: 0 + middleware: agenthealth/statuscode + no_verify_ssl: false + num_workers: 0 + prefer_full_pod_name: true + profile: default + proxy_address: "" + region: us-east-1 + request_timeout_seconds: 0 + resource_arn: "" + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + tcplog/emf_logs: + encoding: utf-8 + id: tcp_input + listen_address: 0.0.0.0:25888 + operators: [] + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + type: tcp_input + udplog/emf_logs: + encoding: utf-8 + id: udp_input + listen_address: 0.0.0.0:25888 + multiline: + line_end_pattern: .^ + line_start_pattern: "" + omit_pattern: false + operators: [] + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + type: udp_input +service: + extensions: + - agenthealth/logs + - agenthealth/statuscode + - entitystore + pipelines: + logs/emf_logs: + exporters: + - awscloudwatchlogs/emf_logs + processors: + - batch/emf_logs + receivers: + - tcplog/emf_logs + - udplog/emf_logs + metrics/containerinsights: + exporters: + - awsemf/containerinsights + processors: + - batch/containerinsights + - filter/containerinsights + - groupbyattrs/containerinsights + - awsentity/resource/containerinsights + - metricstransform/containerinsights + - gpuattributes/containerinsights + receivers: + - awscontainerinsightreceiver + telemetry: + logs: + encoding: console + level: info + sampling: + enabled: true + initial: 2 + thereafter: 500 + tick: 10s + metrics: + level: None + traces: + level: None diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml index 9ac6191348..b4ba7af0bc 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml +++ 
b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_kueue_config.yaml @@ -752,6 +752,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index 986ecc1f08..5521dd14ae 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -607,6 +607,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml b/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml index 6db5f681ce..d2a2107e29 100644 --- a/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml +++ b/translator/tocwconfig/sampleConfig/kueue_container_insights_config.yaml @@ -296,6 +296,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml index 50248a3159..79377d6781 100644 --- a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml +++ b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml @@ -139,6 +139,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: true + 
accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: false add_full_pod_name_metric_label: false add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index 9fed73f560..cd9c3688be 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -644,6 +644,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/sampleConfig/otlp_metrics_cloudwatchlogs_eks_config.yaml b/translator/tocwconfig/sampleConfig/otlp_metrics_cloudwatchlogs_eks_config.yaml index 6536e81434..d19058410d 100644 --- a/translator/tocwconfig/sampleConfig/otlp_metrics_cloudwatchlogs_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/otlp_metrics_cloudwatchlogs_eks_config.yaml @@ -661,6 +661,7 @@ processors: submatch_case: "" receivers: awscontainerinsightreceiver: + accelerated_compute_gpu_metrics_collection_interval: 1m0s accelerated_compute_metrics: false add_container_name_metric_label: true add_full_pod_name_metric_label: true diff --git a/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml index aac57b548a..89cc44cb6f 100644 --- a/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/prometheus_and_kubernetes_config.yaml @@ -649,6 +649,7 @@ processors: receivers: awscontainerinsightreceiver: accelerated_compute_metrics: false + accelerated_compute_gpu_metrics_collection_interval: 1m0s add_container_name_metric_label: true 
add_full_pod_name_metric_label: true add_service_as_attribute: true diff --git a/translator/tocwconfig/tocwconfig_test.go b/translator/tocwconfig/tocwconfig_test.go index ec737a7fae..ac5cef7c73 100644 --- a/translator/tocwconfig/tocwconfig_test.go +++ b/translator/tocwconfig/tocwconfig_test.go @@ -240,6 +240,18 @@ func TestEmfAndKubernetesWithGpuConfig(t *testing.T) { checkTranslation(t, "emf_and_kubernetes_with_gpu_config", "darwin", nil, "") } +func TestEmfAndKubernetesWithGpuHighFrequencyConfig(t *testing.T) { + resetContext(t) + readCommonConfig(t, "./sampleConfig/commonConfig/withCredentials.toml") + context.CurrentContext().SetRunInContainer(true) + context.CurrentContext().SetMode(config.ModeOnPremise) + t.Setenv(config.HOST_NAME, "host_name_from_env") + t.Setenv(config.HOST_IP, "127.0.0.1") + expectedEnvVars := map[string]string{} + checkTranslation(t, "emf_and_kubernetes_with_gpu_high_frequency_config", "linux", expectedEnvVars, "") + checkTranslation(t, "emf_and_kubernetes_with_gpu_high_frequency_config", "darwin", nil, "") +} + func TestEmfAndKubernetesWithKueueConfig(t *testing.T) { resetContext(t) readCommonConfig(t, "./sampleConfig/commonConfig/withCredentials.toml") diff --git a/translator/translate/otel/common/common.go b/translator/translate/otel/common/common.go index 3b2eba8dda..50af9792b8 100644 --- a/translator/translate/otel/common/common.go +++ b/translator/translate/otel/common/common.go @@ -21,67 +21,69 @@ import ( ) const ( - AgentKey = "agent" - DebugKey = "debug" - MetricsKey = "metrics" - LogsKey = "logs" - TracesKey = "traces" - MetricsCollectedKey = "metrics_collected" - LogsCollectedKey = "logs_collected" - TracesCollectedKey = "traces_collected" - MetricsDestinationsKey = "metrics_destinations" - ECSKey = "ecs" - KubernetesKey = "kubernetes" - CloudWatchKey = "cloudwatch" - CloudWatchLogsKey = "cloudwatchlogs" - PrometheusKey = "prometheus" - PrometheusConfigPathKey = "prometheus_config_path" - AMPKey = "amp" - WorkspaceIDKey = 
"workspace_id" - EMFProcessorKey = "emf_processor" - DisableMetricExtraction = "disable_metric_extraction" - XrayKey = "xray" - OtlpKey = "otlp" - JmxKey = "jmx" - TLSKey = "tls" - Endpoint = "endpoint" - EndpointOverrideKey = "endpoint_override" - RegionOverrideKey = "region_override" - ProxyOverrideKey = "proxy_override" - InsecureKey = "insecure" - LocalModeKey = "local_mode" - CredentialsKey = "credentials" - RoleARNKey = "role_arn" - SigV4Auth = "sigv4auth" - MetricsCollectionIntervalKey = "metrics_collection_interval" - AggregationDimensionsKey = "aggregation_dimensions" - MeasurementKey = "measurement" - DropOriginalMetricsKey = "drop_original_metrics" - ForceFlushIntervalKey = "force_flush_interval" - ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights - EnhancedContainerInsights = "enhanced_container_insights" - ResourcesKey = "resources" - PreferFullPodName = "prefer_full_pod_name" - EnableAcceleratedComputeMetric = "accelerated_compute_metrics" - EnableKueueContainerInsights = "kueue_container_insights" - AppendDimensionsKey = "append_dimensions" - Console = "console" - DiskKey = "disk" - DiskIOKey = "diskio" - NetKey = "net" - Emf = "emf" - StructuredLog = "structuredlog" - ServiceAddress = "service_address" - Udp = "udp" - Tcp = "tcp" - TlsKey = "tls" - Tags = "tags" - Region = "region" - LogGroupName = "log_group_name" - LogStreamName = "log_stream_name" - NameKey = "name" - RenameKey = "rename" - UnitKey = "unit" + AgentKey = "agent" + DebugKey = "debug" + MetricsKey = "metrics" + LogsKey = "logs" + TracesKey = "traces" + MetricsCollectedKey = "metrics_collected" + LogsCollectedKey = "logs_collected" + TracesCollectedKey = "traces_collected" + MetricsDestinationsKey = "metrics_destinations" + ECSKey = "ecs" + KubernetesKey = "kubernetes" + CloudWatchKey = "cloudwatch" + CloudWatchLogsKey = "cloudwatchlogs" + PrometheusKey = "prometheus" + PrometheusConfigPathKey = "prometheus_config_path" + AMPKey = 
"amp" + WorkspaceIDKey = "workspace_id" + EMFProcessorKey = "emf_processor" + DisableMetricExtraction = "disable_metric_extraction" + XrayKey = "xray" + OtlpKey = "otlp" + JmxKey = "jmx" + TLSKey = "tls" + Endpoint = "endpoint" + EndpointOverrideKey = "endpoint_override" + RegionOverrideKey = "region_override" + ProxyOverrideKey = "proxy_override" + InsecureKey = "insecure" + LocalModeKey = "local_mode" + CredentialsKey = "credentials" + RoleARNKey = "role_arn" + SigV4Auth = "sigv4auth" + MetricsCollectionIntervalKey = "metrics_collection_interval" + AggregationDimensionsKey = "aggregation_dimensions" + MeasurementKey = "measurement" + DropOriginalMetricsKey = "drop_original_metrics" + ForceFlushIntervalKey = "force_flush_interval" + ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights + EnhancedContainerInsights = "enhanced_container_insights" + ResourcesKey = "resources" + PreferFullPodName = "prefer_full_pod_name" + EnableAcceleratedComputeMetric = "accelerated_compute_metrics" + AcceleratedComputeGPUMetricsCollectionInterval = "accelerated_compute_gpu_metrics_collection_interval" + HighFrequencyGpuMetrics = "high_frequency_gpu_metrics" + EnableKueueContainerInsights = "kueue_container_insights" + AppendDimensionsKey = "append_dimensions" + Console = "console" + DiskKey = "disk" + DiskIOKey = "diskio" + NetKey = "net" + Emf = "emf" + StructuredLog = "structuredlog" + ServiceAddress = "service_address" + UDP = "udp" + TCP = "tcp" + TlsKey = "tls" //nolint:revive + Tags = "tags" + Region = "region" + LogGroupName = "log_group_name" + LogStreamName = "log_stream_name" + NameKey = "name" + RenameKey = "rename" + UnitKey = "unit" ) const ( diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index d74597bd6f..cb4c3cbf5c 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ 
b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -7,6 +7,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter" "go.opentelemetry.io/collector/confmap" + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) @@ -60,7 +61,7 @@ func setKubernetesMetricDeclaration(conf *confmap.Conf, cfg *awsemfexporter.Conf cfg.MetricDeclarations = kubernetesMetricDeclarations cfg.MetricDescriptors = getControlPlaneMetricDescriptors(conf) - + cfg.MetricAsDistribution = getMetricAsDistribution(conf) return nil } @@ -722,3 +723,33 @@ func getVolumesMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDe } return metricDeclarations } + +func getMetricAsDistribution(conf *confmap.Conf) []string { + var metricsAsDistribution []string + if awscontainerinsight.IsHighFrequencyGPUMetricsEnabled(conf) { + var gpuMetricTypes = []string{ + containerinsightscommon.TypeGpuContainer, + containerinsightscommon.TypeGpuPod, + containerinsightscommon.TypeGpuNode, + } + + // GPU metrics to be compacted to distribution representation in values and counts + gpuMetrics := []string{ + containerinsightscommon.GpuUtilization, + containerinsightscommon.GpuMemUtilization, + containerinsightscommon.GpuMemTotal, + containerinsightscommon.GpuMemUsed, + containerinsightscommon.GpuPowerDraw, + containerinsightscommon.GpuTemperature, + containerinsightscommon.GpuTensorCoreUtilization, + } + + // Generate metric names by looping through types and metrics + for _, t := range gpuMetricTypes { + for _, m := range gpuMetrics { + metricsAsDistribution = append(metricsAsDistribution, containerinsightscommon.MetricName(t, m)) + } + } + } + return metricsAsDistribution +} diff --git a/translator/translate/otel/exporter/awsemf/translator_test.go b/translator/translate/otel/exporter/awsemf/translator_test.go index dad41112f1..282749391e 100644 ---
a/translator/translate/otel/exporter/awsemf/translator_test.go +++ b/translator/translate/otel/exporter/awsemf/translator_test.go @@ -905,6 +905,398 @@ func TestTranslator(t *testing.T) { "add_entity": true, }, }, + "GenerateAwsEmfExporterConfigKubernetesWithHighFrequencyGPUMetrics": { + translator: NewTranslatorWithName(common.PipelineNameContainerInsights), + input: map[string]any{ + "logs": map[string]any{ + "metrics_collected": map[string]any{ + "kubernetes": map[string]any{ + "enhanced_container_insights": true, + "enable_accelerated_compute_metric": true, + "accelerated_compute_gpu_metrics_collection_interval": "1s", + }, + }, + }, + }, + want: map[string]any{ + "namespace": "ContainerInsights", + "log_group_name": "/aws/containerinsights/{ClusterName}/performance", + "log_stream_name": "{NodeName}", + "dimension_rollup_option": "NoDimensionRollup", + "disable_metric_extraction": false, + "enhanced_container_insights": true, + "parse_json_encoded_attr_values": []string{"Sources", "kubernetes"}, + "output_destination": "cloudwatch", + "eks_fargate_container_insights_enabled": false, + "resource_to_telemetry_conversion": resourcetotelemetry.Settings{ + Enabled: true, + }, + "metric_as_distribution": []string{ + "container_gpu_utilization", "container_gpu_memory_utilization", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature", "container_gpu_tensor_core_utilization", + "pod_gpu_utilization", "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature", "pod_gpu_tensor_core_utilization", + "node_gpu_utilization", "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", "node_gpu_tensor_core_utilization", + }, + "metric_declarations": []*awsemfexporter.MetricDeclaration{ + { + Dimensions: [][]string{{"ClusterName"}, {"ContainerName", "FullPodName", "PodName", "Namespace", 
"ClusterName"}, {"ContainerName", "PodName", "Namespace", "ClusterName"}}, + MetricNameSelectors: []string{ + "container_cpu_utilization", "container_cpu_utilization_over_container_limit", "container_cpu_limit", "container_cpu_request", + "container_memory_utilization", "container_memory_utilization_over_container_limit", "container_memory_failures_total", "container_memory_limit", "container_memory_request", + "container_filesystem_usage", "container_filesystem_available", "container_filesystem_utilization", + }, + }, + { + Dimensions: [][]string{{"PodName", "Namespace", "ClusterName"}, {"ClusterName"}, {"Service", "Namespace", "ClusterName"}, {"ClusterName", "Namespace"}, {"FullPodName", "PodName", "Namespace", "ClusterName"}}, + MetricNameSelectors: []string{"pod_cpu_utilization", "pod_memory_utilization", + "pod_network_rx_bytes", "pod_network_tx_bytes", "pod_cpu_utilization_over_pod_limit", + "pod_memory_utilization_over_pod_limit"}, + }, + { + Dimensions: [][]string{ + {"FullPodName", "PodName", "Namespace", "ClusterName"}, + {"PodName", "Namespace", "ClusterName"}, + {"Namespace", "ClusterName"}, + {"ClusterName"}, + }, + MetricNameSelectors: []string{"pod_interface_network_rx_dropped", "pod_interface_network_tx_dropped"}, + }, + { + Dimensions: [][]string{{"PodName", "Namespace", "ClusterName"}, {"ClusterName"}, {"FullPodName", "PodName", "Namespace", "ClusterName"}, {"Service", "Namespace", "ClusterName"}}, + MetricNameSelectors: []string{"pod_cpu_reserved_capacity", "pod_memory_reserved_capacity", "pod_number_of_container_restarts", "pod_number_of_containers", "pod_number_of_running_containers", + "pod_status_ready", "pod_status_scheduled", "pod_status_running", "pod_status_pending", "pod_status_failed", "pod_status_unknown", + "pod_status_succeeded", "pod_memory_request", "pod_memory_limit", "pod_cpu_limit", "pod_cpu_request", "pod_cpu_usage_total", "pod_memory_working_set", + "pod_container_status_running", "pod_container_status_terminated", 
"pod_container_status_waiting", "pod_container_status_waiting_reason_crash_loop_back_off", + "pod_container_status_waiting_reason_image_pull_error", "pod_container_status_waiting_reason_start_error", "pod_container_status_waiting_reason_create_container_error", + "pod_container_status_waiting_reason_create_container_config_error", "pod_container_status_terminated_reason_oom_killed", + "pod_gpu_request", "pod_gpu_limit", "pod_gpu_usage_total", "pod_gpu_reserved_capacity", + "pod_neuroncore_request", "pod_neuroncore_limit", "pod_neuroncore_usage_total", "pod_neuroncore_reserved_capacity", + "pod_efa_request", "pod_efa_limit", "pod_efa_usage_total", "pod_efa_reserved_capacity", + }, + }, + { + Dimensions: [][]string{{"NodeName", "InstanceId", "ClusterName"}, {"ClusterName"}}, + MetricNameSelectors: []string{"node_cpu_utilization", "node_memory_utilization", "node_network_total_bytes", "node_cpu_reserved_capacity", + "node_memory_reserved_capacity", "node_number_of_running_pods", "node_number_of_running_containers", + "node_cpu_usage_total", "node_cpu_limit", "node_memory_working_set", "node_memory_limit", + "node_status_condition_ready", "node_status_condition_disk_pressure", "node_status_condition_memory_pressure", + "node_status_condition_pid_pressure", "node_status_condition_network_unavailable", "node_status_condition_unknown", "node_status_capacity_pods", "node_status_allocatable_pods", + "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity", + "node_neuroncore_limit", "node_neuroncore_usage_total", "node_neuroncore_reserved_capacity", "node_neuroncore_unreserved_capacity", "node_neuroncore_available_capacity", + "node_efa_limit", "node_efa_usage_total", "node_efa_reserved_capacity", "node_efa_unreserved_capacity", "node_efa_available_capacity"}, + }, + { + Dimensions: [][]string{ + {"NodeName", "InstanceId", "ClusterName"}, + {"ClusterName"}, + }, + MetricNameSelectors: []string{ + 
"node_interface_network_rx_dropped", "node_interface_network_tx_dropped", + "node_diskio_io_service_bytes_total", "node_diskio_io_serviced_total", + "hyperpod_node_health_status_schedulable", "hyperpod_node_health_status_unschedulable_pending_replacement", + "hyperpod_node_health_status_unschedulable_pending_reboot", + "hyperpod_node_health_status_unschedulable"}, + }, + { + Dimensions: [][]string{{"NodeName", "InstanceId", "ClusterName"}, {"ClusterName"}}, + MetricNameSelectors: []string{"node_filesystem_utilization", "node_filesystem_inodes", "node_filesystem_inodes_free"}, + }, + { + Dimensions: [][]string{{"Service", "Namespace", "ClusterName"}, {"ClusterName"}}, + MetricNameSelectors: []string{"service_number_of_running_pods"}, + }, + { + Dimensions: [][]string{{"PodName", "Namespace", "ClusterName"}, {"ClusterName"}}, + MetricNameSelectors: []string{"replicas_desired", "replicas_ready", "status_replicas_available", "status_replicas_unavailable"}, + }, + { + Dimensions: [][]string{{"PodName", "Namespace", "ClusterName"}, {"ClusterName"}}, + MetricNameSelectors: []string{"daemonset_status_number_available", "daemonset_status_number_unavailable"}, + }, + { + Dimensions: [][]string{{"Namespace", "ClusterName"}, {"ClusterName"}}, + MetricNameSelectors: []string{"namespace_number_of_running_pods", "namespace_ingress_count"}, + }, + { + Dimensions: [][]string{{"ClusterName"}}, + MetricNameSelectors: []string{"cluster_node_count", "cluster_failed_node_count", "cluster_number_of_running_pods"}, + }, + { + Dimensions: [][]string{{"ClusterName", "endpoint"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_storage_size_bytes", "apiserver_storage_db_total_size_in_bytes", "etcd_db_total_size_in_bytes"}, + }, + { + Dimensions: [][]string{{"ClusterName", "resource"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_storage_list_duration_seconds", "apiserver_longrunning_requests", "apiserver_storage_objects"}, + }, + { + Dimensions: 
[][]string{{"ClusterName", "verb"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_request_duration_seconds", "rest_client_request_duration_seconds"}, + }, + { + Dimensions: [][]string{{"ClusterName", "code", "verb"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_request_total", "apiserver_request_total_5xx"}, + }, + { + Dimensions: [][]string{{"ClusterName", "operation"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_admission_controller_admission_duration_seconds", "apiserver_admission_step_admission_duration_seconds", "etcd_request_duration_seconds"}, + }, + { + Dimensions: [][]string{{"ClusterName", "code", "method"}, {"ClusterName"}}, + MetricNameSelectors: []string{"rest_client_requests_total"}, + }, + { + Dimensions: [][]string{{"ClusterName", "request_kind"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_current_inflight_requests", "apiserver_current_inqueue_requests"}, + }, + { + Dimensions: [][]string{{"ClusterName", "name"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_admission_webhook_admission_duration_seconds"}, + }, + { + Dimensions: [][]string{{"ClusterName", "group"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_requested_deprecated_apis"}, + }, + { + Dimensions: [][]string{{"ClusterName", "reason"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_flowcontrol_rejected_requests_total"}, + }, + { + Dimensions: [][]string{{"ClusterName", "priority_level"}, {"ClusterName"}}, + MetricNameSelectors: []string{"apiserver_flowcontrol_request_concurrency_limit"}, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, + MetricNameSelectors: []string{ + "container_gpu_utilization", "container_gpu_memory_utilization", "container_gpu_memory_total", 
"container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature", "container_gpu_tensor_core_utilization", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}}, + MetricNameSelectors: []string{ + "pod_gpu_utilization", "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature", "pod_gpu_tensor_core_utilization", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType", "GpuDevice"}}, + MetricNameSelectors: []string{ + "node_gpu_utilization", "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", "node_gpu_tensor_core_utilization", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "NeuronDevice", "NeuronCore"}}, + MetricNameSelectors: []string{ + "container_neuroncore_utilization", + "container_neuroncore_memory_usage_total", + "container_neuroncore_memory_usage_constants", + "container_neuroncore_memory_usage_model_code", + "container_neuroncore_memory_usage_model_shared_scratchpad", + "container_neuroncore_memory_usage_runtime_memory", + "container_neuroncore_memory_usage_tensors", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "NeuronDevice"}}, + MetricNameSelectors: 
[]string{ + "container_neurondevice_hw_ecc_events_total", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "NeuronDevice", "NeuronCore"}}, + MetricNameSelectors: []string{ + "pod_neuroncore_utilization", + "pod_neuroncore_memory_usage_total", + "pod_neuroncore_memory_usage_constants", + "pod_neuroncore_memory_usage_model_code", + "pod_neuroncore_memory_usage_model_shared_scratchpad", + "pod_neuroncore_memory_usage_runtime_memory", + "pod_neuroncore_memory_usage_tensors", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "NeuronDevice"}}, + MetricNameSelectors: []string{ + "pod_neurondevice_hw_ecc_events_total", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceType", "InstanceId", "NodeName", "NeuronDevice", "NeuronCore"}}, + MetricNameSelectors: []string{ + "node_neuroncore_utilization", + "node_neuroncore_memory_usage_total", + "node_neuroncore_memory_usage_constants", + "node_neuroncore_memory_usage_model_code", + "node_neuroncore_memory_usage_model_shared_scratchpad", + "node_neuroncore_memory_usage_runtime_memory", + "node_neuroncore_memory_usage_tensors", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}}, + MetricNameSelectors: []string{ + "node_neuron_execution_errors_total", + "node_neurondevice_runtime_memory_used_bytes", + "node_neuron_execution_latency", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, 
{"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceId", "NodeName", "NeuronDevice"}}, + MetricNameSelectors: []string{ + "node_neurondevice_hw_ecc_events_total", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "NetworkInterfaceId"}}, + MetricNameSelectors: []string{ + "container_efa_rx_bytes", "container_efa_tx_bytes", "container_efa_rx_dropped", "container_efa_rdma_read_bytes", "container_efa_rdma_write_bytes", "container_efa_rdma_write_recv_bytes", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "NetworkInterfaceId"}}, + MetricNameSelectors: []string{ + "pod_efa_rx_bytes", "pod_efa_tx_bytes", "pod_efa_rx_dropped", "pod_efa_rdma_read_bytes", "pod_efa_rdma_write_bytes", "pod_efa_rdma_write_recv_bytes", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType", "NetworkInterfaceId"}}, + MetricNameSelectors: []string{ + "node_efa_rx_bytes", "node_efa_tx_bytes", "node_efa_rx_dropped", "node_efa_rdma_read_bytes", "node_efa_rdma_write_bytes", "node_efa_rdma_write_recv_bytes", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "VolumeId"}}, + MetricNameSelectors: []string{ + "node_diskio_ebs_total_read_ops", "node_diskio_ebs_total_write_ops", "node_diskio_ebs_total_read_bytes", "node_diskio_ebs_total_write_bytes", + "node_diskio_ebs_total_read_time", "node_diskio_ebs_total_write_time", 
"node_diskio_ebs_volume_performance_exceeded_iops", "node_diskio_ebs_volume_performance_exceeded_tp", + "node_diskio_ebs_ec2_instance_performance_exceeded_iops", "node_diskio_ebs_ec2_instance_performance_exceeded_tp", "node_diskio_ebs_volume_queue_length", + }, + }, + { + Dimensions: [][]string{ + {"ClusterName"}, + {"ClusterName", "Namespace"}, + {"ClusterName", "Namespace", "PersistentVolumeClaimName"}, + }, + MetricNameSelectors: []string{ + "persistent_volume_claim_status_bound", + "persistent_volume_claim_status_lost", + "persistent_volume_claim_status_pending", + "persistent_volume_claim_count", + }, + }, + { + Dimensions: [][]string{ + {"ClusterName"}, + }, + MetricNameSelectors: []string{ + "persistent_volume_count", + }, + }, + }, + "metric_descriptors": []awsemfexporter.MetricDescriptor{ + { + MetricName: "apiserver_admission_controller_admission_duration_seconds", + Unit: "Seconds", + Overwrite: true, + }, + { + MetricName: "apiserver_admission_step_admission_duration_seconds", + Unit: "Seconds", + Overwrite: true, + }, + { + MetricName: "apiserver_admission_webhook_admission_duration_seconds", + Unit: "Seconds", + Overwrite: true, + }, + { + MetricName: "apiserver_current_inflight_requests", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_current_inqueue_requests", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_flowcontrol_rejected_requests_total", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_flowcontrol_request_concurrency_limit", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_longrunning_requests", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_request_duration_seconds", + Unit: "Seconds", + Overwrite: true, + }, + { + MetricName: "apiserver_request_total", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_request_total_5xx", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: 
"apiserver_requested_deprecated_apis", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_storage_objects", + Unit: "Count", + Overwrite: true, + }, + { + MetricName: "apiserver_storage_list_duration_seconds", + Unit: "Seconds", + Overwrite: true, + }, + { + MetricName: "apiserver_storage_db_total_size_in_bytes", + Unit: "Bytes", + Overwrite: true, + }, + { + MetricName: "apiserver_storage_size_bytes", + Unit: "Bytes", + Overwrite: true, + }, + { + MetricName: "etcd_db_total_size_in_bytes", + Unit: "Bytes", + Overwrite: true, + }, + { + MetricName: "etcd_request_duration_seconds", + Unit: "Seconds", + Overwrite: true, + }, + { + MetricName: "rest_client_request_duration_seconds", + Unit: "Seconds", + Overwrite: true, + }, + { + MetricName: "rest_client_requests_total", + Unit: "Count", + Overwrite: true, + }, + }, + "local_mode": false, + }, + }, } for name, testCase := range testCases { t.Run(name, func(t *testing.T) { @@ -940,6 +1332,9 @@ func TestTranslator(t *testing.T) { if addEntity, exists := testCase.want["add_entity"]; exists { assert.Equal(t, addEntity, gotCfg.AddEntity) } + if metricAsDistribution, exists := testCase.want["metric_as_distribution"]; exists { + assert.ElementsMatch(t, metricAsDistribution, gotCfg.MetricAsDistribution) + } assert.Equal(t, "/ca/bundle", gotCfg.CertificateFilePath) assert.Equal(t, "global_arn", gotCfg.RoleARN) assert.Equal(t, "us-east-1", gotCfg.Region) diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index 91a6fd6b10..9d9c58f272 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -5,6 +5,7 @@ package containerinsights import ( "fmt" + "time" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" @@ -17,6 +18,7 @@ import ( 
"github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/batchprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/filterprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/groupbyattrsprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/kueue" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/metricstransformprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" @@ -58,13 +60,29 @@ func (t *translator) Translate(conf *confmap.Conf) (*common.ComponentTranslators return nil, &common.MissingKeyError{ID: t.ID(), JsonKey: fmt.Sprint(ecsKey, " or ", eksKey)} } + highFrequencyGPUMetricsEnabled := t.pipelineName == ciPipelineName && awscontainerinsight.IsHighFrequencyGPUMetricsEnabled(conf) + // create processor map with - // - default batch processor + // - batch processor (with timeout override if high-frequency GPU metrics are enabled) // - filter processor to drop prometheus metadata + var batchTranslator common.ComponentTranslator + if highFrequencyGPUMetricsEnabled { + // Use 1 minute timeout directly for high-frequency GPU metrics + batchTranslator = batchprocessor.NewTranslator(common.WithName(t.pipelineName), batchprocessor.WithTelemetrySection(common.LogsKey), batchprocessor.WithTimeout(time.Minute)) + } else { + // Use default timeout based on telemetry section + batchTranslator = batchprocessor.NewTranslator(common.WithName(t.pipelineName), batchprocessor.WithTelemetrySection(common.LogsKey)) + } + processors := common.NewTranslatorMap( - batchprocessor.NewTranslatorWithNameAndSection(t.pipelineName, common.LogsKey), + batchTranslator, filterprocessor.NewTranslator(common.WithName(t.pipelineName)), ) + + if highFrequencyGPUMetricsEnabled { + 
processors.Set(groupbyattrsprocessor.NewTranslatorWithName(t.pipelineName)) + } + // create exporter map with default emf exporter based on pipeline name exporters := common.NewTranslatorMap(awsemf.NewTranslatorWithName(t.pipelineName)) // create extensions map based on pipeline name diff --git a/translator/translate/otel/pipeline/containerinsights/translators_test.go b/translator/translate/otel/pipeline/containerinsights/translators_test.go index 78720fd415..e615fb0305 100644 --- a/translator/translate/otel/pipeline/containerinsights/translators_test.go +++ b/translator/translate/otel/pipeline/containerinsights/translators_test.go @@ -18,8 +18,9 @@ import ( func TestTranslators(t *testing.T) { type want struct { - receivers []string - exporters []string + receivers []string + processors []string + exporters []string } testCases := map[string]struct { input map[string]any @@ -37,8 +38,9 @@ func TestTranslators(t *testing.T) { }, want: map[string]want{ "metrics/containerinsights": { - receivers: []string{"awscontainerinsightreceiver"}, - exporters: []string{"awsemf/containerinsights"}, + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "awsentity/resource/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, }, }, @@ -55,8 +57,9 @@ func TestTranslators(t *testing.T) { }, want: map[string]want{ "metrics/containerinsights": { - receivers: []string{"awscontainerinsightreceiver"}, - exporters: []string{"awsemf/containerinsights"}, + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "awsentity/resource/containerinsights", "metricstransform/containerinsights", "gpuattributes/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, }, }, @@ -73,12 +76,35 @@ func TestTranslators(t *testing.T) { }, want: map[string]want{ "metrics/containerinsights": { - receivers: 
[]string{"awscontainerinsightreceiver"}, - exporters: []string{"awsemf/containerinsights"}, + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "awsentity/resource/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, "metrics/kueueContainerInsights": { - receivers: []string{"awscontainerinsightskueuereceiver"}, - exporters: []string{"awsemf/kueueContainerInsights"}, + receivers: []string{"awscontainerinsightskueuereceiver"}, + processors: []string{"batch/kueueContainerInsights", "filter/kueueContainerInsights", "kueueattributes/kueueContainerInsights"}, + exporters: []string{"awsemf/kueueContainerInsights"}, + }, + }, + }, + "WithEnhancedContainerInsightsAndHighFrequencyGPUMetrics": { + input: map[string]interface{}{ + "logs": map[string]interface{}{ + "metrics_collected": map[string]interface{}{ + "kubernetes": map[string]interface{}{ + "enhanced_container_insights": true, + "accelerated_compute_metrics": true, + "accelerated_compute_gpu_metrics_collection_interval": 30, // 30 seconds, less than default 60s + "cluster_name": "TestCluster", + }, + }, + }, + }, + want: map[string]want{ + "metrics/containerinsights": { + receivers: []string{"awscontainerinsightreceiver"}, + processors: []string{"batch/containerinsights", "filter/containerinsights", "groupbyattrs/containerinsights", "awsentity/resource/containerinsights", "metricstransform/containerinsights", "gpuattributes/containerinsights"}, + exporters: []string{"awsemf/containerinsights"}, }, }, }, @@ -98,6 +124,7 @@ func TestTranslators(t *testing.T) { g, err := tr.Translate(conf) assert.NoError(t, err) assert.Equal(t, w.receivers, collections.MapSlice(g.Receivers.Keys(), component.ID.String)) + assert.Equal(t, w.processors, collections.MapSlice(g.Processors.Keys(), component.ID.String)) assert.Equal(t, w.exporters, collections.MapSlice(g.Exporters.Keys(), component.ID.String)) }) } diff --git 
a/translator/translate/otel/pipeline/emf_logs/translator.go b/translator/translate/otel/pipeline/emf_logs/translator.go index 221b52fc9b..c7c0bf9b6a 100644 --- a/translator/translate/otel/pipeline/emf_logs/translator.go +++ b/translator/translate/otel/pipeline/emf_logs/translator.go @@ -54,13 +54,13 @@ func (t *translator) Translate(conf *confmap.Conf) (*common.ComponentTranslators ), } if serviceAddress, ok := common.GetString(conf, serviceAddressEMFKey); ok { - if strings.Contains(serviceAddress, common.Udp) { + if strings.Contains(serviceAddress, common.UDP) { translators.Receivers.Set(udplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) } else { translators.Receivers.Set(tcplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) } } else if serviceAddress, ok = common.GetString(conf, serviceAddressStructuredLogKey); ok { - if strings.Contains(serviceAddress, common.Udp) { + if strings.Contains(serviceAddress, common.UDP) { translators.Receivers.Set(udplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) } else { translators.Receivers.Set(tcplog.NewTranslatorWithName(common.PipelineNameEmfLogs)) diff --git a/translator/translate/otel/processor/batchprocessor/translator.go b/translator/translate/otel/processor/batchprocessor/translator.go index af9bef3aa6..1e23e1adcb 100644 --- a/translator/translate/otel/processor/batchprocessor/translator.go +++ b/translator/translate/otel/processor/batchprocessor/translator.go @@ -20,25 +20,59 @@ var defaultForceFlushInterval = map[string]time.Duration{ common.LogsKey: 5 * time.Second, } +// WithTelemetrySection sets the telemetry section key for the translator +func WithTelemetrySection(section string) common.TranslatorOption { + return func(target any) { + if t, ok := target.(*translator); ok { + t.telemetrySectionKey = section + } + } +} + +// WithTimeout sets a timeout override for the translator +func WithTimeout(timeout time.Duration) common.TranslatorOption { + return func(target any) { + if t, ok := 
target.(*translator); ok { + t.timeout = &timeout + } + } +} + type translator struct { - name string + factory processor.Factory + common.NameProvider telemetrySectionKey string - factory processor.Factory + timeout *time.Duration } var _ common.ComponentTranslator = (*translator)(nil) +var _ common.NameSetter = (*translator)(nil) +// NewTranslator creates a new batch processor translator with options +func NewTranslator(opts ...common.TranslatorOption) common.ComponentTranslator { + t := &translator{factory: batchprocessor.NewFactory()} + for _, opt := range opts { + opt(t) + } + return t +} + +// Use NewTranslator with WithName and WithTelemetrySection options func NewTranslatorWithNameAndSection(name string, telemetrySectionKey string) common.ComponentTranslator { - return &translator{name, telemetrySectionKey, batchprocessor.NewFactory()} + return NewTranslator(common.WithName(name), WithTelemetrySection(telemetrySectionKey)) } func (t *translator) ID() component.ID { - return component.NewIDWithName(t.factory.Type(), t.name) + return component.NewIDWithName(t.factory.Type(), t.Name()) } func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { cfg := t.factory.CreateDefaultConfig().(*batchprocessor.Config) - if duration, ok := common.GetDuration(conf, common.ConfigKey(t.telemetrySectionKey, common.ForceFlushIntervalKey)); ok { + + // First check if we have a timeout override + if t.timeout != nil { + cfg.Timeout = *t.timeout + } else if duration, ok := common.GetDuration(conf, common.ConfigKey(t.telemetrySectionKey, common.ForceFlushIntervalKey)); ok { cfg.Timeout = duration } else if defaultDuration, ok := defaultForceFlushInterval[t.telemetrySectionKey]; ok { cfg.Timeout = defaultDuration diff --git a/translator/translate/otel/processor/batchprocessor/translator_test.go b/translator/translate/otel/processor/batchprocessor/translator_test.go index 4de64a35b6..84b45bc282 100644 --- 
a/translator/translate/otel/processor/batchprocessor/translator_test.go +++ b/translator/translate/otel/processor/batchprocessor/translator_test.go @@ -77,6 +77,41 @@ func TestTranslator(t *testing.T) { SendBatchMaxSize: 0, }, }, + "TimeoutOverrideMetricsSection": { + translator: NewTranslator(common.WithName("test"), WithTelemetrySection(common.MetricsKey), WithTimeout(45*time.Second)), + input: map[string]interface{}{ + "metrics": map[string]interface{}{ + "force_flush_interval": 30, + }, + }, + want: &batchprocessor.Config{ + Timeout: 45 * time.Second, + SendBatchSize: 8192, + SendBatchMaxSize: 0, + }, + }, + "TimeoutOverrideLogsSection": { + translator: NewTranslator(common.WithName("test"), WithTelemetrySection(common.LogsKey), WithTimeout(15*time.Second)), + input: map[string]interface{}{ + "logs": map[string]interface{}{}, + }, + want: &batchprocessor.Config{ + Timeout: 15 * time.Second, + SendBatchSize: 8192, + SendBatchMaxSize: 0, + }, + }, + "TimeoutOverrideNotConfiguredSection": { + translator: NewTranslator(common.WithName("test"), WithTelemetrySection(common.TracesKey), WithTimeout(25*time.Second)), + input: map[string]interface{}{ + "traces": map[string]interface{}{}, + }, + want: &batchprocessor.Config{ + Timeout: 25 * time.Second, + SendBatchSize: 8192, + SendBatchMaxSize: 0, + }, + }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { diff --git a/translator/translate/otel/processor/groupbyattrsprocessor/translator.go b/translator/translate/otel/processor/groupbyattrsprocessor/translator.go new file mode 100644 index 0000000000..54a986491d --- /dev/null +++ b/translator/translate/otel/processor/groupbyattrsprocessor/translator.go @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package groupbyattrsprocessor + +import ( + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/confmap" + "go.opentelemetry.io/collector/processor" + + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" +) + +type translator struct { + name string + factory processor.Factory +} + +var _ common.ComponentTranslator = (*translator)(nil) + +func NewTranslatorWithName(name string) common.ComponentTranslator { + return &translator{name, groupbyattrsprocessor.NewFactory()} +} + +func (t *translator) ID() component.ID { + return component.NewIDWithName(t.factory.Type(), t.name) +} + +func (t *translator) Translate(_ *confmap.Conf) (component.Config, error) { + cfg := t.factory.CreateDefaultConfig().(*groupbyattrsprocessor.Config) + return cfg, nil +} diff --git a/translator/translate/otel/receiver/awscontainerinsight/translator.go b/translator/translate/otel/receiver/awscontainerinsight/translator.go index ba3566c801..f83dc32fe4 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/translator.go +++ b/translator/translate/otel/receiver/awscontainerinsight/translator.go @@ -8,7 +8,6 @@ import ( "fmt" "os" "strings" - "time" "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver" "go.opentelemetry.io/collector/component" @@ -32,8 +31,7 @@ const ( ecs = "ecs" eks = "eks" - defaultMetricsCollectionInterval = time.Minute - defaultLeaderLockName = "cwagent-clusterleader" // To maintain backwards compatability with https://github.com/aws/amazon-cloudwatch-agent/blob/2dd89abaab4590cffbbc31ef89319b62809b09d1/plugins/inputs/k8sapiserver/k8sapiserver.go#L30 + defaultLeaderLockName = "cwagent-clusterleader" // To maintain backwards compatibility with 
https://github.com/aws/amazon-cloudwatch-agent/blob/2dd89abaab4590cffbbc31ef89319b62809b09d1/plugins/inputs/k8sapiserver/k8sapiserver.go#L30 ) type translator struct { @@ -82,10 +80,12 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { common.ConfigKey(configuredService.Key, common.MetricsCollectionIntervalKey), common.ConfigKey(common.AgentKey, common.MetricsCollectionIntervalKey), } - cfg.CollectionInterval = common.GetOrDefaultDuration(conf, intervalKeyChain, defaultMetricsCollectionInterval) + cfg.CollectionInterval = common.GetOrDefaultDuration(conf, intervalKeyChain, DefaultMetricsCollectionInterval) cfg.CollectionRole = getCollectionRole() cfg.ContainerOrchestrator = configuredService.Value cfg.AWSSessionSettings.Region = agent.Global_Config.Region + cfg.AcceleratedComputeGPUMetricsCollectionInterval = GetAcceleratedComputeGPUMetricsCollectionInterval(conf) + if profileKey, ok := agent.Global_Config.Credentials[agent.Profile_Key]; ok { cfg.AWSSessionSettings.Profile = fmt.Sprintf("%v", profileKey) } diff --git a/translator/translate/otel/receiver/awscontainerinsight/translator_test.go b/translator/translate/otel/receiver/awscontainerinsight/translator_test.go index 09570dcf46..76b578d2f7 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/translator_test.go +++ b/translator/translate/otel/receiver/awscontainerinsight/translator_test.go @@ -46,6 +46,7 @@ func TestTranslator(t *testing.T) { CollectionInterval: time.Minute, LeaderLockName: "otel-container-insight-clusterleader", TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithECS/WithAgentInterval": { @@ -64,6 +65,7 @@ func TestTranslator(t *testing.T) { CollectionInterval: 20 * time.Second, LeaderLockName: "otel-container-insight-clusterleader", TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithECS/WithSectionInterval": { @@ -84,6 +86,7 
@@ func TestTranslator(t *testing.T) { CollectionInterval: 10 * time.Second, LeaderLockName: "otel-container-insight-clusterleader", TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes": { @@ -105,6 +108,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, TagService: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithoutClusterName": { @@ -136,6 +140,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, ClusterName: "TestCluster", KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithCollectionRoleLeader": { @@ -160,6 +165,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, ClusterName: "TestCluster", KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithCollectionRoleNode": { @@ -184,6 +190,7 @@ func TestTranslator(t *testing.T) { LeaderLockUsingConfigMapOnly: true, ClusterName: "TestCluster", KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithEnhancedContainerInsights": { @@ -209,6 +216,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: true, AddContainerNameMetricLabel: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithLevel1Granularity": { @@ -233,6 +241,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: false, AddContainerNameMetricLabel: false, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithLevel2Granularity": { @@ -258,6 +267,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: true, 
AddContainerNameMetricLabel: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithKubernetes/WithLevel3Granularity": { @@ -283,6 +293,7 @@ func TestTranslator(t *testing.T) { AddFullPodNameMetricLabel: true, AddContainerNameMetricLabel: true, KubeConfigPath: "", + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithECSAndKubernetes": { @@ -304,6 +315,7 @@ func TestTranslator(t *testing.T) { LeaderLockName: "otel-container-insight-clusterleader", LeaderLockUsingConfigMapOnly: false, TagService: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value }, }, "WithEKSAndCustomKubeConfigPathHostDetails": { @@ -332,6 +344,36 @@ func TestTranslator(t *testing.T) { HostName: "test-hostname", HostIP: "1.2.3.4", RunOnSystemd: true, + AcceleratedComputeGPUMetricsCollectionInterval: 60 * time.Second, // Default value + }, + }, + "WithKubernetes/WithEnhancedContainerInsights/WithHighFrequencyGPUMetrics": { + input: map[string]interface{}{ + "logs": map[string]interface{}{ + "metrics_collected": map[string]interface{}{ + "kubernetes": map[string]interface{}{ + "enhanced_container_insights": true, + "accelerated_compute_metrics": true, + "accelerated_compute_gpu_metrics_collection_interval": 30, // 30 seconds, less than default 60s + "cluster_name": "TestCluster", + }, + }, + }, + }, + want: &awscontainerinsightreceiver.Config{ + ContainerOrchestrator: eks, + CollectionInterval: 60 * time.Second, + PrefFullPodName: true, + LeaderLockName: defaultLeaderLockName, + LeaderLockUsingConfigMapOnly: true, + ClusterName: "TestCluster", + TagService: true, + EnableControlPlaneMetrics: true, + AddFullPodNameMetricLabel: true, + AddContainerNameMetricLabel: true, + KubeConfigPath: "", + EnableAcceleratedComputeMetrics: true, + AcceleratedComputeGPUMetricsCollectionInterval: 30 * time.Second, // Custom value }, }, } @@ -364,6 +406,7 @@ func 
TestTranslator(t *testing.T) { require.Equal(t, testCase.want.HostName, gotCfg.HostName) require.Equal(t, testCase.want.HostIP, gotCfg.HostIP) require.Equal(t, testCase.want.RunOnSystemd, gotCfg.RunOnSystemd) + require.Equal(t, testCase.want.AcceleratedComputeGPUMetricsCollectionInterval, gotCfg.AcceleratedComputeGPUMetricsCollectionInterval) } }) } diff --git a/translator/translate/otel/receiver/awscontainerinsight/utils.go b/translator/translate/otel/receiver/awscontainerinsight/utils.go index 721951b056..37daf25e66 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/utils.go +++ b/translator/translate/otel/receiver/awscontainerinsight/utils.go @@ -4,13 +4,16 @@ package awscontainerinsight import ( + "time" + "go.opentelemetry.io/collector/confmap" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) const ( - BaseContainerInsights = iota + 1 + BaseContainerInsights = iota + 1 + DefaultMetricsCollectionInterval = time.Minute ) func EnhancedContainerInsightsEnabled(conf *confmap.Conf) bool { @@ -27,3 +30,15 @@ func EnhancedContainerInsightsEnabled(conf *confmap.Conf) bool { func AcceleratedComputeMetricsEnabled(conf *confmap.Conf) bool { return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableAcceleratedComputeMetric), true) } + +func GetAcceleratedComputeGPUMetricsCollectionInterval(conf *confmap.Conf) time.Duration { + return common.GetOrDefaultDuration(conf, []string{ + common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.AcceleratedComputeGPUMetricsCollectionInterval), + }, DefaultMetricsCollectionInterval) +} + +func IsHighFrequencyGPUMetricsEnabled(conf *confmap.Conf) bool { + return EnhancedContainerInsightsEnabled(conf) && + AcceleratedComputeMetricsEnabled(conf) && + GetAcceleratedComputeGPUMetricsCollectionInterval(conf) < DefaultMetricsCollectionInterval +} diff --git 
a/translator/translate/otel/receiver/tcplog/translator.go b/translator/translate/otel/receiver/tcplog/translator.go index 3db0906c74..2863802d4f 100644 --- a/translator/translate/otel/receiver/tcplog/translator.go +++ b/translator/translate/otel/receiver/tcplog/translator.go @@ -61,7 +61,7 @@ func (t *translator) ID() component.ID { // tcp:localhost:25888 func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { if !conf.IsSet(baseKey) || - (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.Tcp)) { + (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.TCP)) { return nil, &common.MissingKeyError{ID: t.ID(), JsonKey: fmt.Sprintf("missing %s or tcp service address", baseKey)} } cfg := t.factory.CreateDefaultConfig().(*tcplogreceiver.TCPLogConfig) diff --git a/translator/translate/otel/receiver/udplog/translator.go b/translator/translate/otel/receiver/udplog/translator.go index c315bf9818..7b163c0e99 100644 --- a/translator/translate/otel/receiver/udplog/translator.go +++ b/translator/translate/otel/receiver/udplog/translator.go @@ -61,7 +61,7 @@ func (t *translator) ID() component.ID { // udp:localhost:25888 func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { if !conf.IsSet(baseKey) || - (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.Udp)) { + (conf.IsSet(common.ConfigKey(serviceAddressKey)) && !strings.Contains(fmt.Sprintf("%v", conf.Get(serviceAddressKey)), common.UDP)) { return nil, &common.MissingKeyError{ID: t.ID(), JsonKey: fmt.Sprintf("missing %s or udp service address", baseKey)} } cfg := t.factory.CreateDefaultConfig().(*udplogreceiver.UDPLogConfig)