From c65b8600f2db7427a9a938417751789319389da1 Mon Sep 17 00:00:00 2001 From: "minquan.chen" Date: Fri, 29 Mar 2024 14:57:31 +0800 Subject: [PATCH] fix servicegraph bug(orgin #32019) --- connector/servicegraphconnector/connector.go | 10 +- .../servicegraphconnector/connector_test.go | 93 ++++-- connector/servicegraphconnector/go.mod | 9 + .../failed-label-not-work-expect-metrics.yaml | 272 ++++++++++++++++++ .../failed-label-not-work-simple-trace.yaml | 66 +++++ 5 files changed, 419 insertions(+), 31 deletions(-) create mode 100644 connector/servicegraphconnector/testdata/failed-label-not-work-expect-metrics.yaml create mode 100644 connector/servicegraphconnector/testdata/failed-label-not-work-simple-trace.yaml diff --git a/connector/servicegraphconnector/connector.go b/connector/servicegraphconnector/connector.go index bd8a22a431be..a98cc3ee8523 100644 --- a/connector/servicegraphconnector/connector.go +++ b/connector/servicegraphconnector/connector.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "sort" + "strconv" "strings" "sync" "time" @@ -395,15 +396,16 @@ func (p *serviceGraphConnector) onExpire(e *store.Edge) { } func (p *serviceGraphConnector) aggregateMetricsForEdge(e *store.Edge) { - metricKey := p.buildMetricKey(e.ClientService, e.ServerService, string(e.ConnectionType), e.Dimensions) + metricKey := p.buildMetricKey(e.ClientService, e.ServerService, string(e.ConnectionType), strconv.FormatBool(e.Failed), e.Dimensions) dimensions := buildDimensions(e) p.seriesMutex.Lock() defer p.seriesMutex.Unlock() p.updateSeries(metricKey, dimensions) - p.updateCountMetrics(metricKey) if e.Failed { p.updateErrorMetrics(metricKey) + } else { + p.updateCountMetrics(metricKey) } p.updateDurationMetrics(metricKey, e.ServerLatencySec, e.ClientLatencySec) } @@ -602,9 +604,9 @@ func (p *serviceGraphConnector) collectServerLatencyMetrics(ilm pmetric.ScopeMet return nil } -func (p *serviceGraphConnector) buildMetricKey(clientName, serverName, connectionType string, edgeDimensions map[string]string) string { +func (p *serviceGraphConnector) buildMetricKey(clientName, serverName, connectionType string, failed string, edgeDimensions map[string]string) string { var metricKey strings.Builder - metricKey.WriteString(clientName + metricKeySeparator + serverName + metricKeySeparator + connectionType) + metricKey.WriteString(clientName + metricKeySeparator + serverName + metricKeySeparator + connectionType + metricKeySeparator + failed) for _, dimName := range p.config.Dimensions { dim, ok := edgeDimensions[dimName] diff --git a/connector/servicegraphconnector/connector_test.go b/connector/servicegraphconnector/connector_test.go index 9537104ced61..c48b49369c57 100644 --- a/connector/servicegraphconnector/connector_test.go +++ b/connector/servicegraphconnector/connector_test.go @@ -27,6 +27,9 @@ import ( "go.opentelemetry.io/otel/sdk/metric/metricdata" "go.opentelemetry.io/otel/sdk/metric/metricdata/metricdatatest" "go.uber.org/zap/zaptest" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden" + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest/pmetrictest" ) func TestConnectorStart(t *testing.T) { @@ -65,32 +68,68 @@ func TestConnectorShutdown(t *testing.T) { } func TestConnectorConsume(t *testing.T) { - // Prepare - cfg := &Config{ - Dimensions: []string{"some-attribute", "non-existing-attribute"}, - Store: StoreConfig{MaxItems: 10}, - } - - set := componenttest.NewNopTelemetrySettings() - set.Logger = zaptest.NewLogger(t) - conn := newConnector(set, cfg) - conn.metricsConsumer = newMockMetricsExporter() - - assert.NoError(t, conn.Start(context.Background(), componenttest.NewNopHost())) - - // Test & verify - td := buildSampleTrace(t, "val") - // The assertion is part of verifyHappyCaseMetrics func. - assert.NoError(t, conn.ConsumeTraces(context.Background(), td)) - - // Force collection - conn.store.Expire() - md, err := conn.buildMetrics() - assert.NoError(t, err) - verifyHappyCaseMetrics(t, md) - - // Shutdown the connector - assert.NoError(t, conn.Shutdown(context.Background())) + t.Run("test common case", func(t *testing.T) { + // Prepare + cfg := &Config{ + Dimensions: []string{"some-attribute", "non-existing-attribute"}, + Store: StoreConfig{MaxItems: 10}, + } + + set := componenttest.NewNopTelemetrySettings() + set.Logger = zaptest.NewLogger(t) + conn := newConnector(set, cfg) + conn.metricsConsumer = newMockMetricsExporter() + + assert.NoError(t, conn.Start(context.Background(), componenttest.NewNopHost())) + + // Test & verify + td := buildSampleTrace(t, "val") + // The assertion is part of verifyHappyCaseMetrics func. + assert.NoError(t, conn.ConsumeTraces(context.Background(), td)) + + // Force collection + conn.store.Expire() + md, err := conn.buildMetrics() + assert.NoError(t, err) + verifyHappyCaseMetrics(t, md) + + // Shutdown the connector + assert.NoError(t, conn.Shutdown(context.Background())) + }) + t.Run("test fix failed label not work", func(t *testing.T) { + cfg := &Config{ + Store: StoreConfig{MaxItems: 10}, + } + set := componenttest.NewNopTelemetrySettings() + set.Logger = zaptest.NewLogger(t) + conn := newConnector(set, cfg) + conn.metricsConsumer = newMockMetricsExporter() + + assert.NoError(t, conn.Start(context.Background(), componenttest.NewNopHost())) + defer require.NoError(t, conn.Shutdown(context.Background())) + + // this trace simulate two services' trace: foo, bar + // foo called bar three times, two success, one failed + td, err := golden.ReadTraces("testdata/failed-label-not-work-simple-trace.yaml") + assert.NoError(t, err) + assert.NoError(t, conn.ConsumeTraces(context.Background(), td)) + + // Force collection + conn.store.Expire() + actualMetrics, err := conn.buildMetrics() + assert.NoError(t, err) + + // Verify + expectedMetrics, err := golden.ReadMetrics("testdata/failed-label-not-work-expect-metrics.yaml") + assert.NoError(t, err) + + err = pmetrictest.CompareMetrics(expectedMetrics, actualMetrics, + pmetrictest.IgnoreMetricDataPointsOrder(), + pmetrictest.IgnoreResourceMetricsOrder(), + pmetrictest.IgnoreResourceMetricsOrder(), pmetrictest.IgnoreStartTimestamp(), + pmetrictest.IgnoreTimestamp()) + require.NoError(t, err) + }) } func verifyHappyCaseMetrics(t *testing.T, md pmetric.Metrics) { @@ -267,7 +306,7 @@ func TestUpdateDurationMetrics(t *testing.T) { Dimensions: []string{}, }, } - metricKey := p.buildMetricKey("foo", "bar", "", map[string]string{}) + metricKey := p.buildMetricKey("foo", "bar", "", "false", map[string]string{}) testCases := []struct { caseStr string diff --git a/connector/servicegraphconnector/go.mod b/connector/servicegraphconnector/go.mod index f5f66d15a690..630ec0532385 100644 --- a/connector/servicegraphconnector/go.mod +++ b/connector/servicegraphconnector/go.mod @@ -3,6 +3,8 @@ module github.com/open-telemetry/opentelemetry-collector-contrib/connector/servi go 1.21 require ( + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden v0.97.0 + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest v0.97.0 github.com/stretchr/testify v1.9.0 go.opentelemetry.io/collector/component v0.97.1-0.20240327181407-1038b67c85a0 go.opentelemetry.io/collector/config/configtelemetry v0.97.1-0.20240327181407-1038b67c85a0 @@ -47,6 +49,7 @@ require ( github.com/mitchellh/reflectwalk v1.0.2 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.97.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/prometheus/client_golang v1.19.0 // indirect @@ -102,3 +105,9 @@ retract ( v0.76.1 v0.65.0 ) + +replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden => ../../pkg/golden + +replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest => ../../pkg/pdatatest + +replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil => ../../pkg/pdatautil diff --git a/connector/servicegraphconnector/testdata/failed-label-not-work-expect-metrics.yaml b/connector/servicegraphconnector/testdata/failed-label-not-work-expect-metrics.yaml new file mode 100644 index 000000000000..49d74851b8a5 --- /dev/null +++ b/connector/servicegraphconnector/testdata/failed-label-not-work-expect-metrics.yaml @@ -0,0 +1,272 @@ +resourceMetrics: + - resource: {} + scopeMetrics: + - metrics: + - name: traces_service_graph_request_total + sum: + aggregationTemporality: 2 + dataPoints: + - asInt: "2" + attributes: + - key: client + value: + stringValue: foo + - key: connection_type + value: + stringValue: "" + - key: failed + value: + boolValue: false + - key: server + value: + stringValue: bar + startTimeUnixNano: "1000000" + timeUnixNano: "2000000" + isMonotonic: true + - name: traces_service_graph_request_failed_total + sum: + aggregationTemporality: 2 + dataPoints: + - asInt: "1" + attributes: + - key: client + value: + stringValue: foo + - key: connection_type + value: + stringValue: "" + - key: failed + value: + boolValue: true + - key: server + value: + stringValue: bar + startTimeUnixNano: "1000000" + timeUnixNano: "2000000" + isMonotonic: true + - histogram: + aggregationTemporality: 2 + dataPoints: + - attributes: + - key: client + value: + stringValue: foo + - key: connection_type + value: + stringValue: "" + - key: failed + value: + boolValue: false + - key: server + value: + stringValue: bar + bucketCounts: + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "2" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + count: "2" + explicitBounds: + - 0.002 + - 0.004 + - 0.006 + - 0.008 + - 0.01 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.8 + - 1 + - 1.4 + - 2 + - 5 + - 10 + - 15 + startTimeUnixNano: "1000000" + sum: 2 + timeUnixNano: "2000000" + name: traces_service_graph_request_server_seconds + - histogram: + aggregationTemporality: 2 + dataPoints: + - attributes: + - key: client + value: + stringValue: foo + - key: connection_type + value: + stringValue: "" + - key: failed + value: + boolValue: true + - key: server + value: + stringValue: bar + bucketCounts: + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "1" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + count: "1" + explicitBounds: + - 0.002 + - 0.004 + - 0.006 + - 0.008 + - 0.01 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.8 + - 1 + - 1.4 + - 2 + - 5 + - 10 + - 15 + startTimeUnixNano: "1000000" + sum: 1 + timeUnixNano: "2000000" + name: traces_service_graph_request_server_seconds + - histogram: + aggregationTemporality: 2 + dataPoints: + - attributes: + - key: client + value: + stringValue: foo + - key: connection_type + value: + stringValue: "" + - key: failed + value: + boolValue: false + - key: server + value: + stringValue: bar + bucketCounts: + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "2" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + count: "2" + explicitBounds: + - 0.002 + - 0.004 + - 0.006 + - 0.008 + - 0.01 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.8 + - 1 + - 1.4 + - 2 + - 5 + - 10 + - 15 + startTimeUnixNano: "1000000" + sum: 2 + timeUnixNano: "2000000" + name: traces_service_graph_request_client_seconds + - histogram: + aggregationTemporality: 2 + dataPoints: + - attributes: + - key: client + value: + stringValue: foo + - key: connection_type + value: + stringValue: "" + - key: failed + value: + boolValue: true + - key: server + value: + stringValue: bar + bucketCounts: + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + - "1" + - "0" + - "0" + - "0" + - "0" + - "0" + - "0" + count: "1" + explicitBounds: + - 0.002 + - 0.004 + - 0.006 + - 0.008 + - 0.01 + - 0.05 + - 0.1 + - 0.2 + - 0.4 + - 0.8 + - 1 + - 1.4 + - 2 + - 5 + - 10 + - 15 + startTimeUnixNano: "1000000" + sum: 1 + timeUnixNano: "2000000" + name: traces_service_graph_request_client_seconds + scope: + name: traces_service_graph diff --git a/connector/servicegraphconnector/testdata/failed-label-not-work-simple-trace.yaml b/connector/servicegraphconnector/testdata/failed-label-not-work-simple-trace.yaml new file mode 100644 index 000000000000..63ee176afa57 --- /dev/null +++ b/connector/servicegraphconnector/testdata/failed-label-not-work-simple-trace.yaml @@ -0,0 +1,66 @@ +resourceSpans: + - resource: + attributes: + - key: service.name + value: + stringValue: foo + scopeSpans: + - scope: {} + spans: + - endTimeUnixNano: "1641092646000000006" + kind: 3 + name: client span + parentSpanId: "" + spanId: 83c55286956529e1 + startTimeUnixNano: "1641092645000000006" + status: {} + traceId: b3a53e9c2ac533afc48b511c2f368dc1 + - endTimeUnixNano: "1641092646000000006" + kind: 3 + name: client span + parentSpanId: "" + spanId: bd57fe885dbc995d + startTimeUnixNano: "1641092645000000006" + status: {} + traceId: c6aafb2dcf4d71c81ae52e50e8f805ee + - endTimeUnixNano: "1641092646000000006" + kind: 3 + name: client span + parentSpanId: "" + spanId: 9ceb481d4fb78ea7 + startTimeUnixNano: "1641092645000000006" + status: {} + traceId: 7af657de319d6a57967bc241882ea94f + - resource: + attributes: + - key: service.name + value: + stringValue: bar + scopeSpans: + - scope: {} + spans: + - endTimeUnixNano: "1641092646000000006" + kind: 2 + name: server span + parentSpanId: 83c55286956529e1 + spanId: 4ec10b829444ffc8 + startTimeUnixNano: "1641092645000000006" + status: {} + traceId: b3a53e9c2ac533afc48b511c2f368dc1 + - endTimeUnixNano: "1641092646000000006" + kind: 2 + name: server span + parentSpanId: bd57fe885dbc995d + spanId: "" + startTimeUnixNano: "1641092645000000006" + status: + code: 2 + traceId: c6aafb2dcf4d71c81ae52e50e8f805ee + - endTimeUnixNano: "1641092646000000006" + kind: 2 + name: server span + parentSpanId: 9ceb481d4fb78ea7 + spanId: 328a31133be32867 + startTimeUnixNano: "1641092645000000006" + status: {} + traceId: 7af657de319d6a57967bc241882ea94f