Skip to content

Commit 69a8d55

Browse files
committed
Enable opt-in for high frequency GPU metrics
1 parent aadbc30 commit 69a8d55

File tree

17 files changed

+260
-88
lines changed

17 files changed

+260
-88
lines changed

go.mod

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ replace collectd.org v0.4.0 => github.com/collectd/go-collectd v0.4.0
1010
// to be all replaced since there are some changes that will always be from upstream
1111
replace (
1212
github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20250916235509-415059d978ca
13-
github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20250916235509-415059d978ca
13+
//TODO replace with official repo after dependent PR merged
14+
github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/yanhaoluo666/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014130217-db2a1ad255e0
1415
github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20250916235509-415059d978ca
1516
)
1617

@@ -47,7 +48,8 @@ replace (
4748
)
4849

4950
replace (
50-
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20250916235509-415059d978ca
51+
//TODO replace with official repo after dependent PR merged
52+
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/yanhaoluo666/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251013162607-385c069b9f1f
5153
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20250916235509-415059d978ca
5254
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awsxrayreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20250916235509-415059d978ca
5355
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/jmxreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/jmxreceiver v0.0.0-20250916235509-415059d978ca
@@ -145,6 +147,7 @@ require (
145147
github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor v0.124.0
146148
github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.124.1
147149
github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1
150+
github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1
148151
github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1
149152
github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.124.1
150153
github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricsgenerationprocessor v0.124.1

go.sum

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,6 @@ github.com/aliyun/alibaba-cloud-sdk-go v1.61.1483 h1:J8HaD+Zpfi1gcel3HCKpoHHEsrc
187187
github.com/aliyun/alibaba-cloud-sdk-go v1.61.1483/go.mod h1:RcDobYh8k5VP6TNybz9m++gL3ijVI5wueVr0EM10VsU=
188188
github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20250916235509-415059d978ca h1:Q5N9Pk9Ll+vxtekedhIHTzkodFY+eeCFY5qm8fUt0V4=
189189
github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20250916235509-415059d978ca/go.mod h1:8dL1mhunGsDXn59xUlnNn1ydT5wp6Fh5KTvlBEaN2Po=
190-
github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20250916235509-415059d978ca h1:3rWH4r1+GL0GLkSvXCxeOce/QpwWXiO3MEye9QibuEk=
191-
github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20250916235509-415059d978ca/go.mod h1:dSu6d3FZqrAECatXDhvYsQIEAaL1iF+fokrPwCjxhC8=
192190
github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20250916235509-415059d978ca h1:jFtqaXiCa0eJAkcAwe0ZDfc0h8ZLH6qBbxCsF5/6u8c=
193191
github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20250916235509-415059d978ca/go.mod h1:b9TxHHL62ladWlbU6klYIvDjCN3Ee31oWrKlF50fQns=
194192
github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20250916235509-415059d978ca h1:0eUGH0ApRikIg6+9dDa5afBJFxrpelSg9H2EhVofbrY=
@@ -229,8 +227,6 @@ github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulat
229227
github.com/amazon-contributing/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.0.0-20250916235509-415059d978ca/go.mod h1:Wvs2QPuB4ngUiOjrJpYWLqfU8X0Z27s33uMKP4YHQmE=
230228
github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20250916235509-415059d978ca h1:V/9Rtbg0/3I2EG/m2PTNphZMfHVEvvfwR/Ju5g0S7yg=
231229
github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20250916235509-415059d978ca/go.mod h1:idk0SX/ZWccyRfPyAKPu1uVvd+KBMT0pE75HHFogitY=
232-
github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20250916235509-415059d978ca h1:xgz1XZXLv3suBBAqcIptP18aOUmXRNjHqdUUjxs/vE0=
233-
github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20250916235509-415059d978ca/go.mod h1:2/Yuy1ePzxKLoHdJIS/BBdWfMD+wpkudvxe1HXZuMAM=
234230
github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20250916235509-415059d978ca h1:AxsN9F05JXYnht9EZ6rJsPa1LQoLxd3QN65FAugl26Q=
235231
github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightskueuereceiver v0.0.0-20250916235509-415059d978ca/go.mod h1:431fc3JgruV7R3yhAzu7w0fdPaBp1Tbn4RV+8R4Dtdw=
236232
github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20250916235509-415059d978ca h1:A8rtTWYIckZa/vpWsBbkjgIqOfPscg1z1nv+VnP7CEI=
@@ -1278,6 +1274,8 @@ github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatoratep
12781274
github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.124.1/go.mod h1:bp3Y5GT4dkGWRGEZqKgfanyk6ZSsVGNY5aNDvX4c8WE=
12791275
github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1 h1:qkqiqLwfg7hj+oDYvpmMD64p+poaxXwo654ZE44uPm4=
12801276
github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.124.1/go.mod h1:B/GP3l4Y1qNsNtWVIzpwS8jWB1Nn/vx0sFBlVDkWt9E=
1277+
github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1 h1:gYlUUIR+lzLQCpj5phh+Ogmk5BRaOrEuKGjIixCk89I=
1278+
github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.124.1/go.mod h1:bkAXtBtShDOA8SuF8IpYbhx1BYWUEE1rW10HXXEXW/4=
12811279
github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1 h1:6MAKxLXfQWHEadn9AgY1jWdKFTJkLYVBa+/h3Rk23lE=
12821280
github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor v0.124.1/go.mod h1:6q2oIAtCuX9HklnqGPO8sWPoTAjhZX1x23O0aTR/zd0=
12831281
github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.124.1 h1:esSFJIhlZaZslW9EYY/Ss5zUnfkuN2qiS+7ujk73/gU=
@@ -1590,6 +1588,10 @@ github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHo
15901588
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ=
15911589
github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74=
15921590
github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
1591+
github.com/yanhaoluo666/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014130217-db2a1ad255e0 h1:3LO5T302XgJL4wkBnaFPgtomAqxpAN8c6ujuF5QjWvg=
1592+
github.com/yanhaoluo666/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20251014130217-db2a1ad255e0/go.mod h1:dSu6d3FZqrAECatXDhvYsQIEAaL1iF+fokrPwCjxhC8=
1593+
github.com/yanhaoluo666/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251013162607-385c069b9f1f h1:heA619m3+WxatwdpSaBqCpnlA719p2nvjmSpJDhDy44=
1594+
github.com/yanhaoluo666/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20251013162607-385c069b9f1f/go.mod h1:2/Yuy1ePzxKLoHdJIS/BBdWfMD+wpkudvxe1HXZuMAM=
15931595
github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a h1:fZHgsYlfvtyqToslyjUt3VOPF4J7aK/3MPcK7xp3PDk=
15941596
github.com/youmark/pkcs8 v0.0.0-20201027041543-1326539a0a0a/go.mod h1:ul22v+Nro/R083muKhosV54bj5niojjWZvU8xrevuH4=
15951597
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=

service/defaultcomponents/components.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor"
2020
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor"
2121
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor"
22+
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor"
2223
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbytraceprocessor"
2324
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor"
2425
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricsgenerationprocessor"
@@ -107,6 +108,7 @@ func Factories() (otelcol.Factories, error) {
107108
gpuattributes.NewFactory(),
108109
kueueattributes.NewFactory(),
109110
groupbytraceprocessor.NewFactory(),
111+
groupbyattrsprocessor.NewFactory(),
110112
k8sattributesprocessor.NewFactory(),
111113
memorylimiterprocessor.NewFactory(),
112114
metricsgenerationprocessor.NewFactory(),

service/defaultcomponents/components_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ func TestComponents(t *testing.T) {
5454
"gpuattributes",
5555
"kueueattributes",
5656
"groupbytrace",
57+
"groupbyattrs",
5758
"k8sattributes",
5859
"memory_limiter",
5960
"metricstransform",

translator/tocwconfig/sampleConfig/base_container_insights_config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ processors:
196196
receivers:
197197
awscontainerinsightreceiver:
198198
accelerated_compute_metrics: false
199+
accelerated_compute_gpu_metrics_collection_interval: 1m0s
199200
add_container_name_metric_label: false
200201
add_full_pod_name_metric_label: false
201202
add_service_as_attribute: true

translator/translate/otel/common/common.go

Lines changed: 63 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -21,67 +21,69 @@ import (
2121
)
2222

2323
const (
24-
AgentKey = "agent"
25-
DebugKey = "debug"
26-
MetricsKey = "metrics"
27-
LogsKey = "logs"
28-
TracesKey = "traces"
29-
MetricsCollectedKey = "metrics_collected"
30-
LogsCollectedKey = "logs_collected"
31-
TracesCollectedKey = "traces_collected"
32-
MetricsDestinationsKey = "metrics_destinations"
33-
ECSKey = "ecs"
34-
KubernetesKey = "kubernetes"
35-
CloudWatchKey = "cloudwatch"
36-
CloudWatchLogsKey = "cloudwatchlogs"
37-
PrometheusKey = "prometheus"
38-
PrometheusConfigPathKey = "prometheus_config_path"
39-
AMPKey = "amp"
40-
WorkspaceIDKey = "workspace_id"
41-
EMFProcessorKey = "emf_processor"
42-
DisableMetricExtraction = "disable_metric_extraction"
43-
XrayKey = "xray"
44-
OtlpKey = "otlp"
45-
JmxKey = "jmx"
46-
TLSKey = "tls"
47-
Endpoint = "endpoint"
48-
EndpointOverrideKey = "endpoint_override"
49-
RegionOverrideKey = "region_override"
50-
ProxyOverrideKey = "proxy_override"
51-
InsecureKey = "insecure"
52-
LocalModeKey = "local_mode"
53-
CredentialsKey = "credentials"
54-
RoleARNKey = "role_arn"
55-
SigV4Auth = "sigv4auth"
56-
MetricsCollectionIntervalKey = "metrics_collection_interval"
57-
AggregationDimensionsKey = "aggregation_dimensions"
58-
MeasurementKey = "measurement"
59-
DropOriginalMetricsKey = "drop_original_metrics"
60-
ForceFlushIntervalKey = "force_flush_interval"
61-
ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights
62-
EnhancedContainerInsights = "enhanced_container_insights"
63-
ResourcesKey = "resources"
64-
PreferFullPodName = "prefer_full_pod_name"
65-
EnableAcceleratedComputeMetric = "accelerated_compute_metrics"
66-
EnableKueueContainerInsights = "kueue_container_insights"
67-
AppendDimensionsKey = "append_dimensions"
68-
Console = "console"
69-
DiskKey = "disk"
70-
DiskIOKey = "diskio"
71-
NetKey = "net"
72-
Emf = "emf"
73-
StructuredLog = "structuredlog"
74-
ServiceAddress = "service_address"
75-
Udp = "udp"
76-
Tcp = "tcp"
77-
TlsKey = "tls"
78-
Tags = "tags"
79-
Region = "region"
80-
LogGroupName = "log_group_name"
81-
LogStreamName = "log_stream_name"
82-
NameKey = "name"
83-
RenameKey = "rename"
84-
UnitKey = "unit"
24+
AgentKey = "agent"
25+
DebugKey = "debug"
26+
MetricsKey = "metrics"
27+
LogsKey = "logs"
28+
TracesKey = "traces"
29+
MetricsCollectedKey = "metrics_collected"
30+
LogsCollectedKey = "logs_collected"
31+
TracesCollectedKey = "traces_collected"
32+
MetricsDestinationsKey = "metrics_destinations"
33+
ECSKey = "ecs"
34+
KubernetesKey = "kubernetes"
35+
CloudWatchKey = "cloudwatch"
36+
CloudWatchLogsKey = "cloudwatchlogs"
37+
PrometheusKey = "prometheus"
38+
PrometheusConfigPathKey = "prometheus_config_path"
39+
AMPKey = "amp"
40+
WorkspaceIDKey = "workspace_id"
41+
EMFProcessorKey = "emf_processor"
42+
DisableMetricExtraction = "disable_metric_extraction"
43+
XrayKey = "xray"
44+
OtlpKey = "otlp"
45+
JmxKey = "jmx"
46+
TLSKey = "tls"
47+
Endpoint = "endpoint"
48+
EndpointOverrideKey = "endpoint_override"
49+
RegionOverrideKey = "region_override"
50+
ProxyOverrideKey = "proxy_override"
51+
InsecureKey = "insecure"
52+
LocalModeKey = "local_mode"
53+
CredentialsKey = "credentials"
54+
RoleARNKey = "role_arn"
55+
SigV4Auth = "sigv4auth"
56+
MetricsCollectionIntervalKey = "metrics_collection_interval"
57+
AggregationDimensionsKey = "aggregation_dimensions"
58+
MeasurementKey = "measurement"
59+
DropOriginalMetricsKey = "drop_original_metrics"
60+
ForceFlushIntervalKey = "force_flush_interval"
61+
ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights
62+
EnhancedContainerInsights = "enhanced_container_insights"
63+
ResourcesKey = "resources"
64+
PreferFullPodName = "prefer_full_pod_name"
65+
EnableAcceleratedComputeMetric = "accelerated_compute_metrics"
66+
AcceleratedComputeGPUMetricsCollectionInterval = "accelerated_compute_gpu_metrics_collection_interval"
67+
HighFrequencyGpuMetrics = "high_frequency_gpu_metrics"
68+
EnableKueueContainerInsights = "kueue_container_insights"
69+
AppendDimensionsKey = "append_dimensions"
70+
Console = "console"
71+
DiskKey = "disk"
72+
DiskIOKey = "diskio"
73+
NetKey = "net"
74+
Emf = "emf"
75+
StructuredLog = "structuredlog"
76+
ServiceAddress = "service_address"
77+
UDP = "udp"
78+
TCP = "tcp"
79+
TlsKey = "tls" //nolint:revive
80+
Tags = "tags"
81+
Region = "region"
82+
LogGroupName = "log_group_name"
83+
LogStreamName = "log_stream_name"
84+
NameKey = "name"
85+
RenameKey = "rename"
86+
UnitKey = "unit"
8587
)
8688

8789
const (

translator/translate/otel/common/common_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ func (t testTranslator) ID() component.ID {
3131

3232
func TestConfigKeys(t *testing.T) {
3333
require.Equal(t, "1::2", ConfigKey("1", "2"))
34+
require.Equal(t, "logs::metrics_collected::kubernetes::accelerated_compute_gpu_metrics_collection_interval",
35+
ConfigKey(LogsKey, MetricsCollectedKey, KubernetesKey, AcceleratedComputeGPUMetricsCollectionInterval))
3436
}
3537

3638
func TestGetString(t *testing.T) {

translator/translate/otel/exporter/awsemf/kubernetes.go

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func setKubernetesMetricDeclaration(conf *confmap.Conf, cfg *awsemfexporter.Conf
6060

6161
cfg.MetricDeclarations = kubernetesMetricDeclarations
6262
cfg.MetricDescriptors = getControlPlaneMetricDescriptors(conf)
63-
63+
cfg.GaugageMetricsToHistogram = getGaugageMetricsToHistogram(conf)
6464
return nil
6565
}
6666

@@ -722,3 +722,32 @@ func getVolumesMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDe
722722
}
723723
return metricDeclarations
724724
}
725+
726+
// getGaugageMetricsToHistogram returns the names of the container-, pod-, and
// node-level GPU metrics (utilization, memory utilization, memory total,
// memory used, power draw, temperature) when the accelerated-compute-metrics,
// enhanced-container-insights, and high-frequency-GPU-metrics checks on conf
// all report true. In every other case it returns a nil slice.
//
// setKubernetesMetricDeclaration assigns the result to
// cfg.GaugageMetricsToHistogram.
// NOTE(review): presumably the EMF exporter emits the listed gauges as
// histograms; confirm against the awsemfexporter fork pinned in go.mod.
func getGaugageMetricsToHistogram(conf *confmap.Conf) []string {
727+
var metricsToHistogram []string
728+
enhancedContainerInsightsEnabled := awscontainerinsight.EnhancedContainerInsightsEnabled(conf)
729+
730+
// All three checks must pass; otherwise the nil slice is returned unchanged.
if awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) && enhancedContainerInsightsEnabled && awscontainerinsight.IsHighFrequencyGPUMetricsEnabled(conf) {
731+
metricsToHistogram = append(metricsToHistogram, []string{
732+
"container_gpu_utilization",
733+
"container_gpu_memory_utilization",
734+
"container_gpu_memory_total",
735+
"container_gpu_memory_used",
736+
"container_gpu_power_draw",
737+
"container_gpu_temperature",
738+
"pod_gpu_utilization",
739+
"pod_gpu_memory_utilization",
740+
"pod_gpu_memory_total",
741+
"pod_gpu_memory_used",
742+
"pod_gpu_power_draw",
743+
"pod_gpu_temperature",
744+
"node_gpu_utilization",
745+
"node_gpu_memory_utilization",
746+
"node_gpu_memory_total",
747+
"node_gpu_memory_used",
748+
"node_gpu_power_draw",
749+
"node_gpu_temperature",
750+
}...)
751+
}
752+
return metricsToHistogram
753+
}

0 commit comments

Comments
 (0)