From 85f0103d0c061fb882b607b9ca66c4b13b139b25 Mon Sep 17 00:00:00 2001 From: Baptiste Girard-Carrabin Date: Thu, 10 Aug 2023 17:29:45 +0200 Subject: [PATCH] Add otel gRPC tracing support The feature is disabled by default and hidden behind a the flag --enable-otel-tracing. When enabled, all the gRPC calls made by the driver will be instrumented and can be forwarded to an opentelemetry- compatible collector. The configuration of opentelemetry tracing can be done in the chart with values `controller.otelTracing` and `node.otelTracing`. The service name and the endpoint that will be used by the sdk can be configured with `otelServiceName` and `otelExporterEndpoint` for both components. --- .../templates/controller.yaml | 9 +++ .../templates/node-windows.yaml | 9 +++ charts/aws-ebs-csi-driver/templates/node.yaml | 17 ++++-- charts/aws-ebs-csi-driver/values.yaml | 8 +++ cmd/main.go | 20 +++++++ cmd/options/server_options.go | 3 + cmd/options_test.go | 7 +++ docs/options.md | 2 + go.mod | 10 ++-- pkg/driver/driver.go | 16 +++++- pkg/driver/driver_test.go | 9 +++ pkg/driver/trace.go | 56 +++++++++++++++++++ 12 files changed, 156 insertions(+), 10 deletions(-) create mode 100644 pkg/driver/trace.go diff --git a/charts/aws-ebs-csi-driver/templates/controller.yaml b/charts/aws-ebs-csi-driver/templates/controller.yaml index dda851b1ff..6d15e32fb1 100644 --- a/charts/aws-ebs-csi-driver/templates/controller.yaml +++ b/charts/aws-ebs-csi-driver/templates/controller.yaml @@ -93,6 +93,9 @@ spec: {{- with .Values.controller.userAgentExtra }} - --user-agent-extra={{ . }} {{- end }} + {{- if .Values.controller.otelTracing }} + - --enable-otel-tracing=true + {{- end}} - --v={{ .Values.controller.logLevel }} {{- range .Values.controller.additionalArgs }} - {{ . }} @@ -134,6 +137,12 @@ spec: {{- with .Values.controller.env }} {{- . | toYaml | nindent 12 }} {{- end }} + {{- with .Values.controller.otelTracing }} + - name: OTEL_SERVICE_NAME + value: {{ .otelServiceName }} + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: {{ .otelExporterEndpoint }} + {{- end }} {{- with .Values.controller.envFrom }} envFrom: {{- . | toYaml | nindent 12 }} diff --git a/charts/aws-ebs-csi-driver/templates/node-windows.yaml b/charts/aws-ebs-csi-driver/templates/node-windows.yaml index 3f460563d3..4baa2832ac 100644 --- a/charts/aws-ebs-csi-driver/templates/node-windows.yaml +++ b/charts/aws-ebs-csi-driver/templates/node-windows.yaml @@ -58,6 +58,9 @@ spec: - --logging-format={{ . }} {{- end }} - --v={{ .Values.node.logLevel }} + {{- if .Values.node.otelTracing }} + - --enable-otel-tracing=true + {{- end}} env: - name: CSI_ENDPOINT value: unix:/csi/csi.sock @@ -68,6 +71,12 @@ spec: {{- if .Values.proxy.http_proxy }} {{- include "aws-ebs-csi-driver.http-proxy" . | nindent 12 }} {{- end }} + {{- with .Values.node.otelTracing }} + - name: OTEL_SERVICE_NAME + value: {{ .otelServiceName }} + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: {{ .otelExporterEndpoint }} + {{- end }} {{- with .Values.node.env }} {{- . | toYaml | nindent 12 }} {{- end }} diff --git a/charts/aws-ebs-csi-driver/templates/node.yaml b/charts/aws-ebs-csi-driver/templates/node.yaml index 9e24c102af..511d31a557 100644 --- a/charts/aws-ebs-csi-driver/templates/node.yaml +++ b/charts/aws-ebs-csi-driver/templates/node.yaml @@ -47,7 +47,7 @@ spec: operator: "Exists" {{- end }} {{- with .Values.node.securityContext }} - securityContext: + securityContext: {{- toYaml . | nindent 8 }} {{- end }} containers: @@ -64,6 +64,9 @@ spec: - --logging-format={{ . }} {{- end }} - --v={{ .Values.node.logLevel }} + {{- if .Values.node.otelTracing }} + - --enable-otel-tracing=true + {{- end}} env: - name: CSI_ENDPOINT value: unix:/csi/csi.sock @@ -74,6 +77,12 @@ spec: {{- if .Values.proxy.http_proxy }} {{- include "aws-ebs-csi-driver.http-proxy" . | nindent 12 }} {{- end }} + {{- with .Values.node.otelTracing }} + - name: OTEL_SERVICE_NAME + value: {{ .otelServiceName }} + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: {{ .otelExporterEndpoint }} + {{- end }} {{- with .Values.node.env }} {{- . | toYaml | nindent 12 }} {{- end }} @@ -109,7 +118,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} {{- with .Values.node.containerSecurityContext }} - securityContext: + securityContext: {{- toYaml . | nindent 12 }} {{- end }} - name: node-driver-registrar @@ -158,7 +167,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} {{- with .Values.sidecars.nodeDriverRegistrar.securityContext }} - securityContext: + securityContext: {{- toYaml . | nindent 12 }} {{- end }} - name: liveness-probe @@ -181,7 +190,7 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} {{- with .Values.sidecars.livenessProbe.securityContext }} - securityContext: + securityContext: {{- toYaml . | nindent 12 }} {{- end }} {{- if .Values.imagePullSecrets }} diff --git a/charts/aws-ebs-csi-driver/values.yaml b/charts/aws-ebs-csi-driver/values.yaml index e20f57e3e8..14533a131f 100644 --- a/charts/aws-ebs-csi-driver/values.yaml +++ b/charts/aws-ebs-csi-driver/values.yaml @@ -288,6 +288,10 @@ controller: # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] + # Enable opentelemetry tracing for the plugin running on the daemonset + otelTracing: {} + # otelServiceName: ebs-csi-controller + # otelExporterEndpoint: "http://localhost:4317" node: env: [] @@ -354,6 +358,10 @@ node: containerSecurityContext: readOnlyRootFilesystem: true privileged: true + # Enable opentelemetry tracing for the plugin running on the daemonset + otelTracing: {} + # otelServiceName: ebs-csi-node + # otelExporterEndpoint: "http://localhost:4317" storageClasses: [] # Add StorageClass resources like: diff --git a/cmd/main.go b/cmd/main.go index d8cf6fe843..2dc8e84037 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,7 +17,9 @@ limitations under the License. package main import ( + "context" "net/http" + "time" flag "github.com/spf13/pflag" @@ -39,6 +41,23 @@ func main() { options := GetOptions(fs) + // Start tracing as soon as possible + if options.ServerOptions.EnableOtelTracing { + exporter, err := driver.InitOtelTracing() + if err != nil { + klog.ErrorS(err, "failed to initialize otel tracing") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) + } + // Exporter will flush traces on shutdown + defer func() { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := exporter.Shutdown(ctx); err != nil { + klog.ErrorS(err, "could not shutdown otel exporter") + } + }() + } + cloud.RegisterMetrics() if options.ServerOptions.HttpEndpoint != "" { mux := http.NewServeMux() @@ -62,6 +81,7 @@ func main() { driver.WithAwsSdkDebugLog(options.ControllerOptions.AwsSdkDebugLog), driver.WithWarnOnInvalidTag(options.ControllerOptions.WarnOnInvalidTag), driver.WithUserAgentExtra(options.ControllerOptions.UserAgentExtra), + driver.WithOtelTracing(options.ServerOptions.EnableOtelTracing), ) if err != nil { klog.ErrorS(err, "failed to create driver") diff --git a/cmd/options/server_options.go b/cmd/options/server_options.go index ddfbdaec72..52d6d7b49e 100644 --- a/cmd/options/server_options.go +++ b/cmd/options/server_options.go @@ -28,9 +28,12 @@ type ServerOptions struct { Endpoint string // HttpEndpoint is the endpoint that the HTTP server for metrics should listen on. HttpEndpoint string + // EnableOtelTracing enables opentelemetry tracing. + EnableOtelTracing bool } func (s *ServerOptions) AddFlags(fs *flag.FlagSet) { fs.StringVar(&s.Endpoint, "endpoint", driver.DefaultCSIEndpoint, "Endpoint for the CSI driver server") fs.StringVar(&s.HttpEndpoint, "http-endpoint", "", "The TCP network address where the HTTP server for metrics will listen (example: `:8080`). The default is empty string, which means the server is disabled.") + fs.BoolVar(&s.EnableOtelTracing, "enable-otel-tracing", false, "To enable opentelemetry tracing for the driver. The tracing is disabled by default. Configure the exporter endpoint with OTEL_EXPORTER_OTLP_ENDPOINT and other env variables, see https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/#general-sdk-configuration.") } diff --git a/cmd/options_test.go b/cmd/options_test.go index 783b6ac0bf..e87ad3d3b9 100644 --- a/cmd/options_test.go +++ b/cmd/options_test.go @@ -53,6 +53,8 @@ func TestGetOptions(t *testing.T) { var VolumeAttachLimit int64 = 42 userAgentExtraFlag := "user-agent-extra" userAgentExtraFlagValue := "test" + otelTracingFlagName := "enable-otel-tracing" + otelTracingFlagValue := true args := append([]string{ "aws-ebs-csi-driver", @@ -60,6 +62,7 @@ func TestGetOptions(t *testing.T) { if withServerOptions { args = append(args, "--"+endpointFlagName+"="+endpoint) + args = append(args, "--"+otelTracingFlagName+"="+strconv.FormatBool(otelTracingFlagValue)) } if withControllerOptions { args = append(args, "--"+extraTagsFlagName+"="+extraTagKey+"="+extraTagValue) @@ -83,6 +86,10 @@ func TestGetOptions(t *testing.T) { if options.ServerOptions.Endpoint != endpoint { t.Fatalf("expected endpoint to be %q but it is %q", endpoint, options.ServerOptions.Endpoint) } + otelTracingFlag := flagSet.Lookup(otelTracingFlagName) + if otelTracingFlag == nil { + t.Fatalf("expected %q flag to be added but it is not", otelTracingFlagName) + } } if withControllerOptions { diff --git a/docs/options.md b/docs/options.md index 01c4251153..3eb7663e50 100644 --- a/docs/options.md +++ b/docs/options.md @@ -1,4 +1,5 @@ # Driver Options + There are a couple of driver options that can be passed as arguments when starting the driver container. | Option argument | value sample | default | Description | @@ -10,3 +11,4 @@ There are a couple of driver options that can be passed as arguments when starti | aws-sdk-debug-log | true | false | If set to true, the driver will enable the aws sdk debug log level| | logging-format | json | text | Sets the log format. Permitted formats: text, json| | user-agent-extra | csi-ebs | helm | Extra string appended to user agent| +| enable-otel-tracing | true | false | If set to true, the driver will enable opentelemetry tracing. Might need [additional env variables](https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/#general-sdk-configuration) to export the traces to the right collector| diff --git a/go.mod b/go.mod index b63e1e8e23..a73c0463b7 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,11 @@ require ( github.com/onsi/gomega v1.27.8 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.8.4 + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.41.1 + go.opentelemetry.io/otel v1.15.1 + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.15.1 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.15.1 + go.opentelemetry.io/otel/sdk v1.15.1 golang.org/x/sys v0.11.0 google.golang.org/grpc v1.56.2 google.golang.org/protobuf v1.31.0 @@ -89,14 +94,9 @@ require ( go.etcd.io/etcd/api/v3 v3.5.9 // indirect go.etcd.io/etcd/client/pkg/v3 v3.5.9 // indirect go.etcd.io/etcd/client/v3 v3.5.9 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.41.1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.41.1 // indirect - go.opentelemetry.io/otel v1.15.1 // indirect go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.15.1 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.15.1 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.15.1 // indirect go.opentelemetry.io/otel/metric v0.38.1 // indirect - go.opentelemetry.io/otel/sdk v1.15.1 // indirect go.opentelemetry.io/otel/trace v1.15.1 // indirect go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.11.0 // indirect diff --git a/pkg/driver/driver.go b/pkg/driver/driver.go index 394554386c..d92964af96 100644 --- a/pkg/driver/driver.go +++ b/pkg/driver/driver.go @@ -24,6 +24,7 @@ import ( "github.com/awslabs/volume-modifier-for-k8s/pkg/rpc" csi "github.com/container-storage-interface/spec/lib/go/csi" "github.com/kubernetes-sigs/aws-ebs-csi-driver/pkg/util" + "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "k8s.io/klog/v2" ) @@ -69,6 +70,7 @@ type DriverOptions struct { awsSdkDebugLog bool warnOnInvalidTag bool userAgentExtra string + otelTracing bool } func NewDriver(options ...func(*DriverOptions)) (*Driver, error) { @@ -123,8 +125,14 @@ func (d *Driver) Run() error { } return resp, err } + + grpcInterceptor := grpc.UnaryInterceptor(logErr) + if d.options.otelTracing { + grpcInterceptor = grpc.ChainUnaryInterceptor(logErr, otelgrpc.UnaryServerInterceptor()) + } + opts := []grpc.ServerOption{ - grpc.UnaryInterceptor(logErr), + grpcInterceptor, } d.srv = grpc.NewServer(opts...) @@ -208,3 +216,9 @@ func WithUserAgentExtra(userAgentExtra string) func(*DriverOptions) { o.userAgentExtra = userAgentExtra } } + +func WithOtelTracing(enableOtelTracing bool) func(*DriverOptions) { + return func(o *DriverOptions) { + o.otelTracing = enableOtelTracing + } +} diff --git a/pkg/driver/driver_test.go b/pkg/driver/driver_test.go index 239e1a28ad..e98229ae0b 100644 --- a/pkg/driver/driver_test.go +++ b/pkg/driver/driver_test.go @@ -103,3 +103,12 @@ func TestWithUserAgentExtra(t *testing.T) { t.Fatalf("expected userAgentExtra option got set to %s but is set to %s", userAgentExtra, options.userAgentExtra) } } + +func TestWithOtelTracing(t *testing.T) { + var enableOtelTracing bool = true + options := &DriverOptions{} + WithOtelTracing(enableOtelTracing)(options) + if options.otelTracing != enableOtelTracing { + t.Fatalf("expected otelTracing option got set to %v but is set to %v", enableOtelTracing, options.otelTracing) + } +} diff --git a/pkg/driver/trace.go b/pkg/driver/trace.go new file mode 100644 index 0000000000..19e3bdb3b9 --- /dev/null +++ b/pkg/driver/trace.go @@ -0,0 +1,56 @@ +/* +Copyright 2023 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package driver + +import ( + "context" + "fmt" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/sdk/resource" + "go.opentelemetry.io/otel/sdk/trace" + "k8s.io/klog/v2" +) + +func InitOtelTracing() (*otlptrace.Exporter, error) { + // Setup OTLP exporter + ctx := context.Background() + exporter, err := otlptracegrpc.New(ctx) + if err != nil { + return nil, fmt.Errorf("failed to create the OTLP exporter: %w", err) + } + + // Resource will auto populate spans with common attributes + resource, err := resource.New(ctx, + resource.WithFromEnv(), // pull attributes from OTEL_RESOURCE_ATTRIBUTES and OTEL_SERVICE_NAME environment variables + resource.WithProcess(), + resource.WithOS(), + resource.WithContainer(), + resource.WithHost(), + ) + if err != nil { + klog.ErrorS(err, "failed to create the OTLP resource, spans will lack some metadata") + } + + // Create a trace provider with the exporter. + // Use propagator and sampler defined in environment variables. + traceProvider := trace.NewTracerProvider(trace.WithBatcher(exporter), trace.WithResource(resource)) + + // Register the trace provider as global. + otel.SetTracerProvider(traceProvider) + + return exporter, nil +}