Add RecordError checking, init install instructions

jjti · Dec 27, 2023 · 6087973 · 6087973
1 parent 04f61f0
commit 6087973
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -9,40 +9,94 @@ Checks usage of [OpenTelemetry spans](https://pkg.go.dev/go.opentelemetry.io/ote
 
 ## Problem Statement
 
-Tracing is an — often celebrated [[1](https://andydote.co.uk/2023/09/19/tracing-is-better/), [2](https://charity.wtf/2022/08/15/live-your-best-life-with-structured-events/)] — pillar of observability. But it's easy to shoot yourself in the foot when creating and managing OTEL spans. For two reasons:
+Tracing is a celebrated [[1](https://andydote.co.uk/2023/09/19/tracing-is-better/),[2](https://charity.wtf/2022/08/15/live-your-best-life-with-structured-events/)] and well marketed [[3](https://docs.datadoghq.com/tracing/),[4](https://www.honeycomb.io/distributed-tracing)] pillar of observability. But self-instrumented traces requires a lot of easy-to-forget boilerplate:
+
+```go
+import (
+	"context"
+	"errors"
+
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/codes"
+)
+
+
+func task(ctx context.Context) error {
+    ctx, span := otel.Tracer("foo").Start(ctx, "bar")
+    defer span.End() // call `.End()`
+
+    if err := subTask(ctx); err != nil {
+        span.SetStatus(codes.Error, err.Error()) // call SetStatus(codes.Error, msg) to set status:error
+        span.RecordError(err) // call RecordError(err) to record an error event
+        return err
+    }
+
+    return nil
+}
+```
 
 ### Forgetting to call `span.End()`
 
-Not calling `.End()` can cause memory leaks.
+Not calling `End` can cause memory leaks and prevents spans from being closed.
 
 > Any Span that is created MUST also be ended. This is the responsibility of the user. Implementations of this API may leak memory or other resources if Spans are not ended.
 
 [source: trace.go](https://github.com/open-telemetry/opentelemetry-go/blob/98b32a6c3a87fbee5d34c063b9096f416b250897/trace/trace.go#L523)
 
 ```go
 func task(ctx context.Context) error {
-    otel.Tracer().Start(ctx, "foo") // span is unassigned, probable memory leak
+    otel.Tracer("app").Start(ctx, "foo") // span is unassigned, probable memory leak
     _, span := otel.Tracer().Start(ctx, "foo") // span.End is not called on all paths, possible memory leak
     return nil // this return statement may be reached without calling span.End
 }
 ```
 
 ### Forgetting to call `span.SetStatus(codes.Error, "msg")`
 
-Setting spans' status to `codes.Error` matters for a couple reasons:
+Not calling `SetStatus` prevents `status:error` spans from being highlighted and searchable in DataDog, New Relic, etc. Spans without an `Error` status are also more likely not to be retained.
 
 1. observability platforms and APMs differentiate "success" vs "failure" using [span's status codes](https://docs.datadoghq.com/tracing/metrics/).
-1. telemetry collector agents, like the [Open Telemetry Collector](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/tailsamplingprocessor/README.md#:~:text=Sampling%20Processor.-,status_code,-%3A%20Sample%20based%20upon), are configurable to sample `Error` spans at a higher rate than `OK` spans. Similarly, observability platforms like DataDog support trace retention filters based on spans' status. In other words, `Error` spans often receive special treatment with the assumption they are more useful for debugging.
+1. telemetry collector agents, like the [Open Telemetry Collector's Tail Sampling Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/tailsamplingprocessor/README.md#:~:text=Sampling%20Processor.-,status_code,-%3A%20Sample%20based%20upon), are configurable to sample `Error` spans at a higher rate than `OK` spans.
+1. observability platforms, like [DataDog, have trace retention filters that use spans' status](https://docs.datadoghq.com/tracing/trace_pipeline/trace_retention/). In other words, `status:error` spans often receive special treatment with the assumption they are more useful for debugging. And forgetting to set the status can lead to spans, with useful debugging information, being dropped.
 
 ```go
 func _() error {
 	_, span := otel.Tracer("foo").Start(context.Background(), "bar") // span.SetStatus is not called on all paths
 	defer span.End()
 
-	if true {
-		return errors.New("foo") // this return statement may be reached without calling span.SetStatus
+	if err := subTask(); err != nil {
+        span.RecordError(err)
+		return errors.New(err) // this return statement may be reached without calling span.SetStatus
 	}
 
 	return nil
 }
 ```
+
+OpenTelemetry docs: [Set span status](https://opentelemetry.io/docs/instrumentation/go/manual/#set-span-status).
+
+### Forgetting to call `span.RecordError(err)`
+
+Calling `RecordError` creates a new exception-type [event (structured log message)](https://opentelemetry.io/docs/concepts/signals/traces/#span-events) on the span. This is recommended to capture the error's stack trace.
+
+```go
+func _() error {
+	_, span := otel.Tracer("foo").Start(context.Background(), "bar") // span.RecordError is not called on all paths
+	defer span.End()
+
+	if err := subTask(); err != nil {
+        span.SetStatus(codes.Error, err.Error())
+		return errors.New(err) // this return statement may be reached without calling span.RecordError
+	}
+
+	return nil
+}
+```
+
+OpenTelemetry docs: [Record errors](https://opentelemetry.io/docs/instrumentation/go/manual/#record-errors).
+
+## Installation
+
+```bash
+go install github.com/jjti/go-spanlint/cmd/spanlint@latest
+```
diff --git a/spanlint.go b/spanlint.go
@@ -22,13 +22,25 @@ Common mistakes with OTEL trace spans include forgetting to call End:
 		// do stuff
 	}
 
-And forgetting to set an Error status:
+Forgetting to set an Error status:
 
 	ctx, span := otel.Tracer("app").Start(ctx, "span")
 	defer span.End()
 
 	if err := task(); err != nil {
 		// span.SetStatus(codes.Error, err.Error()) should be here
+		span.RecordError(err)
+		return fmt.Errorf("failed to run task: %w", err)
+	}
+
+Forgetting to record the Error:
+
+	ctx, span := otel.Tracer("app").Start(ctx, "span")
+	defer span.End()
+
+	if err := task(); err != nil {
+		span.SetStatus(codes.Error, err.Error())
+		// span.RecordError(err) should be here
 		return fmt.Errorf("failed to run task: %w", err)
 	}
 
@@ -173,6 +185,12 @@ func runFunc(pass *analysis.Pass, node ast.Node) {
 			pass.ReportRangef(sv.stmt, "%s.SetStatus is not called on all paths", sv.vr.Name())
 			pass.ReportRangef(ret, "this return statement may be reached without calling %s.SetStatus", sv.vr.Name())
 		}
+
+		// Check if there's no RecordError to the span setting an error.
+		if ret := missingSpanCalls(pass, g, sv, "RecordError", returnsErr); ret != nil {
+			pass.ReportRangef(sv.stmt, "%s.RecordError is not called on all paths", sv.vr.Name())
+			pass.ReportRangef(ret, "this return statement may be reached without calling %s.RecordError", sv.vr.Name())
+		}
 	}
 }
 

diff --git a/testdata/test.go b/testdata/test.go
@@ -46,6 +46,7 @@ func _() error {
 
 	if true {
 		err := errors.New("foo")
+		span.RecordError(err)
 		return err // want "this return statement may be reached without calling span.SetStatus"
 	}
 
@@ -57,6 +58,7 @@ func _() error {
 	defer span.End()
 
 	if true {
+		span.RecordError(errors.New("foo"))
 		return errors.New("foo") // want "this return statement may be reached without calling span.SetStatus"
 	}
 
@@ -68,17 +70,31 @@ func _() error {
 	defer span.End()
 
 	if true {
+		span.RecordError(errors.New("foo"))
 		return &testError{} // want "this return statement may be reached without calling span.SetStatus"
 	}
 
 	return nil
 }
 
+func _() error {
+	_, span := otel.Tracer("foo").Start(context.Background(), "bar") // want "span.RecordError is not called on all paths"
+	defer span.End()
+
+	if true {
+		span.SetStatus(codes.Error, "foo")
+		return &testError{} // want "this return statement may be reached without calling span.RecordError"
+	}
+
+	return nil
+}
+
 func _() (string, error) {
 	_, span := otel.Tracer("foo").Start(context.Background(), "bar") // want "span.SetStatus is not called on all paths"
 	defer span.End()
 
 	if true {
+		span.RecordError(errors.New("foo"))
 		return "", &testError{} // want "this return statement may be reached without calling span.SetStatus"
 	}
 
@@ -90,6 +106,7 @@ func _() (string, error) {
 	defer span.End()
 
 	if true {
+		span.RecordError(errors.New("foo"))
 		return "", errors.New("foo") // want "this return statement may be reached without calling span.SetStatus"
 	}
 
@@ -102,6 +119,7 @@ func _() {
 		defer span.End()
 
 		if true {
+			span.RecordError(errors.New("foo"))
 			return errors.New("foo") // want "this return statement may be reached without calling span.SetStatus"
 		}
 
@@ -116,6 +134,7 @@ func _() error {
 
 	{
 		if true {
+			span.RecordError(errors.New("foo"))
 			return errors.New("foo") // want "this return statement may be reached without calling span.SetStatus"
 		}
 	}
@@ -150,11 +169,13 @@ func _() error {
 	if false {
 		err := errors.New("foo")
 		span.SetStatus(codes.Error, err.Error())
+		span.RecordError(err)
 		return err
 	}
 
 	if true {
 		span.SetStatus(codes.Error, "foo")
+		span.RecordError(errors.New("foo"))
 		return errors.New("bar")
 	}