Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Go error tracking #1004

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions bpf/errors.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef __ERRORS_H_
#define __ERRORS_H_

#ifndef TASK_COMM_LEN
#define TASK_COMM_LEN 16
#endif

#ifndef ERR_MSG_LEN
#define ERR_MSG_LEN 128
#endif

#ifndef MAX_STACK_DEPTH
#define MAX_STACK_DEPTH 32
#endif

typedef __u64 stack_trace_t[MAX_STACK_DEPTH];

typedef struct error_event {
__u32 pid;
__u32 cpu_id;
char comm[TASK_COMM_LEN];
__s32 ustack_sz;
stack_trace_t ustack;
u8 err_msg[ERR_MSG_LEN];
} error_event;

#endif /* __ERRORS_H_ */
84 changes: 83 additions & 1 deletion bpf/go_nethttp.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "tracing.h"
#include "hpack.h"
#include "ringbuf.h"
#include "errors.h"

typedef struct http_func_invocation {
u64 start_monotime_ns;
Expand All @@ -46,6 +47,13 @@ struct {
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} ongoing_http_client_requests SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, void *); // key: pointer to the request goroutine
__type(value, struct error_event);
__uint(max_entries, MAX_CONCURRENT_REQUESTS);
} last_error SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, void *); // key: pointer to the request goroutine
Expand Down Expand Up @@ -198,6 +206,9 @@ int uprobe_ServeHTTPReturns(struct pt_regs *ctx) {
resp_ptr = deref_resp_ptr;
}

struct error_event *error = bpf_map_lookup_elem(&last_error, &goroutine_addr);
bpf_map_delete_elem(&last_error, &goroutine_addr);

http_request_trace *trace = bpf_ringbuf_reserve(&events, sizeof(http_request_trace), 0);
if (!trace) {
bpf_dbg_printk("can't reserve space in the ringbuffer");
Expand All @@ -208,6 +219,8 @@ int uprobe_ServeHTTPReturns(struct pt_regs *ctx) {
trace->type = EVENT_HTTP_REQUEST;
trace->start_monotime_ns = invocation->start_monotime_ns;
trace->end_monotime_ns = bpf_ktime_get_ns();
if (error)
trace->error = *error;

goroutine_metadata *g_metadata = bpf_map_lookup_elem(&ongoing_goroutines, &goroutine_addr);
if (g_metadata) {
Expand Down Expand Up @@ -436,6 +449,75 @@ int uprobe_roundTripReturn(struct pt_regs *ctx) {
return 0;
}


SEC("uprobe/error")
int uprobe_error(struct pt_regs *ctx) {
bpf_dbg_printk("=== uprobe/proc error === ");

void *goroutine_addr = GOROUTINE_PTR(ctx);
bpf_dbg_printk("goroutine_addr %lx", goroutine_addr);

int pid = bpf_get_current_pid_tgid() >> 32;
int cpu_id = bpf_get_smp_processor_id();
int BPF_F_USER_STACK = (1ULL << 8);
struct error_event event = {
.pid = pid,
.cpu_id = cpu_id,
};

if (bpf_get_current_comm(event.comm, sizeof(event.comm)))
event.comm[0] = 0;

// Read the stack trace
event.ustack_sz = bpf_get_stack(ctx, event.ustack, sizeof(event.ustack), BPF_F_USER_STACK);

// Get the caller of the error function and store it in the first slot of the stack
void *sp_caller = STACK_PTR(ctx);
u64 caller = 0;
bpf_probe_read(&caller, sizeof(u64), sp_caller);
bpf_dbg_printk("sp_caller %lx caller %lx", sp_caller, caller);
event.ustack[0] = caller;

// Write event
if (bpf_map_update_elem(&last_error, &goroutine_addr, &event, BPF_ANY)) {
bpf_dbg_printk("can't update event error map element");
}
return 0;
}

SEC("uprobe/error_return")
int uprobe_errorReturn(struct pt_regs *ctx) {
bpf_dbg_printk("=== uprobe/proc error return === ");

void *goroutine_addr = GOROUTINE_PTR(ctx);
bpf_dbg_printk("goroutine_addr %lx", goroutine_addr);

error_event *event = bpf_map_lookup_elem(&last_error, &goroutine_addr);
if (event == NULL) {
bpf_dbg_printk("can't read error event");
return 0;
}

// Read the error message
// GO_PARAM1(ctx) is the pointer to the error message
// GO_PARAM2(ctx) is the length of the error message
void *msg_ptr = GO_PARAM1(ctx);
u64 len = (u64)GO_PARAM2(ctx);
u64 max_size = sizeof(event->err_msg);
u64 size = max_size < len ? max_size : len;
bpf_probe_read(&event->err_msg, size, msg_ptr);
if (size < max_size) {
((char *)event->err_msg)[size] = 0;
}
bpf_dbg_printk("error msg %llx, %s", msg_ptr, event->err_msg);

// Write event
if (bpf_map_update_elem(&last_error, &goroutine_addr, event, BPF_ANY)) {
bpf_dbg_printk("can't update event error map element");
}
return 0;
}

#ifndef NO_HEADER_PROPAGATION
// Context propagation through HTTP headers
SEC("uprobe/header_writeSubset")
Expand Down Expand Up @@ -938,4 +1020,4 @@ int uprobe_queryReturn(struct pt_regs *ctx) {
bpf_dbg_printk("can't reserve space in the ringbuffer");
}
return 0;
}
}
1 change: 0 additions & 1 deletion bpf/go_nethttp.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,4 @@ volatile const u64 rwc_conn_pos;
volatile const u64 rws_conn_pos;
volatile const u64 http2_server_conn_pos;
volatile const u64 cc_tconn_pos;

#endif
1 change: 1 addition & 0 deletions bpf/go_str.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#define GO_STR_H

#include "utils.h"
#include "bpf_dbg.h"

static __always_inline int read_go_str_n(char *name, void *base_ptr, u64 len, void *field, u64 max_size) {
u64 size = max_size < len ? max_size : len;
Expand Down
3 changes: 2 additions & 1 deletion bpf/headers/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
// In x86, current goroutine is pointed by r14, according to
// https://go.googlesource.com/go/+/refs/heads/dev.regabi/src/cmd/compile/internal-abi.md#amd64-architecture
#define GOROUTINE_PTR(x) ((void*)(x)->r14)

#define STACK_PTR(x) ((void*)(x)->sp)
#elif defined(__TARGET_ARCH_arm64)

#define GO_PARAM1(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[0])
Expand All @@ -50,6 +50,7 @@
// In arm64, current goroutine is pointed by R28 according to
// https://github.com/golang/go/blob/master/src/cmd/compile/abi-internal.md#arm64-architecture
#define GOROUTINE_PTR(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[28])
#define STACK_PTR(x) ((void*)((PT_REGS_ARM64 *)(x))->regs[13])

#endif /*defined(__TARGET_ARCH_arm64)*/

Expand Down
2 changes: 2 additions & 0 deletions bpf/http_trace.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "pid_types.h"
#include "utils.h"
#include "errors.h"
#include "http_types.h"

#define PATH_MAX_LEN 100
Expand All @@ -39,6 +40,7 @@ typedef struct http_request_trace_t {
u16 status;
connection_info_t conn __attribute__ ((aligned (8)));
s64 content_length;
error_event error;
tp_info_t tp;

pid_info pid;
Expand Down
11 changes: 6 additions & 5 deletions pkg/beyla/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@ var DefaultConfig = Config{
TTL: defaultMetricsTTL,
},
Traces: otel.TracesConfig{
Protocol: otel.ProtocolUnset,
TracesProtocol: otel.ProtocolUnset,
MaxQueueSize: 4096,
MaxExportBatchSize: 4096,
ReportersCacheLen: ReporterLRUSize,
Protocol: otel.ProtocolUnset,
TracesProtocol: otel.ProtocolUnset,
MaxQueueSize: 4096,
MaxExportBatchSize: 4096,
ReportersCacheLen: ReporterLRUSize,
ReportExceptionEvents: false,
Instrumentations: []string{
instrumentations.InstrumentationALL,
},
Expand Down
2 changes: 1 addition & 1 deletion pkg/internal/discover/typer.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ func (t *typer) inspectOffsets(execElf *exec.FileInfo) (*goexec.Offsets, bool, e
t.log.Debug("skipping inspection for Go functions", "pid", execElf.Pid, "comm", execElf.CmdExePath)
} else {
t.log.Debug("inspecting", "pid", execElf.Pid, "comm", execElf.CmdExePath)
offsets, err := goexec.InspectOffsets(execElf, t.allGoFunctions)
offsets, err := goexec.InspectOffsets(&t.cfg.Traces, execElf, t.allGoFunctions)
if err != nil {
t.log.Debug("couldn't find go specific tracers", "error", err)
return nil, false, err
Expand Down
Binary file added pkg/internal/ebpf/common/bpf_bpf.o
Binary file not shown.
11 changes: 10 additions & 1 deletion pkg/internal/ebpf/common/bpf_bpfel_arm64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/internal/ebpf/common/bpf_bpfel_arm64.o
Binary file not shown.
11 changes: 10 additions & 1 deletion pkg/internal/ebpf/common/bpf_bpfel_x86.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified pkg/internal/ebpf/common/bpf_bpfel_x86.o
Binary file not shown.
3 changes: 1 addition & 2 deletions pkg/internal/ebpf/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,7 @@ func ReadBPFTraceAsSpan(record *ringbuf.Record, filter ServiceFilter) (request.S
if err != nil {
return request.Span{}, true, err
}

return HTTPRequestTraceToSpan(&event), false, nil
return HTTPRequestTraceToSpan(&event, filter), false, nil
}

func ReadSQLRequestTraceAsSpan(record *ringbuf.Record) (request.Span, bool, error) {
Expand Down
31 changes: 26 additions & 5 deletions pkg/internal/ebpf/common/pids.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package ebpfcommon

import (
"debug/gosym"
"log/slog"
"sync"

Expand Down Expand Up @@ -31,11 +32,12 @@ type PIDInfo struct {
}

type ServiceFilter interface {
AllowPID(uint32, uint32, svc.ID, PIDType)
AllowPID(uint32, uint32, svc.ID, PIDType, *gosym.Table)
BlockPID(uint32, uint32)
ValidPID(uint32, uint32, PIDType) bool
Filter(inputSpans []request.Span) []request.Span
CurrentPIDs(PIDType) map[uint32]map[uint32]svc.ID
GetSymTab(uint32) *gosym.Table
}

// PIDsFilter keeps a thread-safe copy of the PIDs whose traces are allowed to
Expand All @@ -44,6 +46,7 @@ type ServiceFilter interface {
type PIDsFilter struct {
log *slog.Logger
current map[uint32]map[uint32]PIDInfo
symTabs map[uint32]*gosym.Table
mux *sync.RWMutex
}

Expand All @@ -55,6 +58,7 @@ func NewPIDsFilter(log *slog.Logger) *PIDsFilter {
log: log,
current: map[uint32]map[uint32]PIDInfo{},
mux: &sync.RWMutex{},
symTabs: make(map[uint32]*gosym.Table),
}
}

Expand All @@ -73,16 +77,17 @@ func CommonPIDsFilter(systemWide bool) ServiceFilter {
return commonPIDsFilter
}

func (pf *PIDsFilter) AllowPID(pid, ns uint32, svc svc.ID, pidType PIDType) {
func (pf *PIDsFilter) AllowPID(pid, ns uint32, svc svc.ID, pidType PIDType, symTab *gosym.Table) {
pf.mux.Lock()
defer pf.mux.Unlock()
pf.addPID(pid, ns, svc, pidType)
pf.addPID(pid, ns, svc, pidType, symTab)
}

func (pf *PIDsFilter) BlockPID(pid, ns uint32) {
pf.mux.Lock()
defer pf.mux.Unlock()
pf.removePID(pid, ns)

}

func (pf *PIDsFilter) ValidPID(userPID, ns uint32, pidType PIDType) bool {
Expand Down Expand Up @@ -151,13 +156,24 @@ func (pf *PIDsFilter) Filter(inputSpans []request.Span) []request.Span {
return outputSpans
}

func (pf *PIDsFilter) addPID(pid, nsid uint32, s svc.ID, t PIDType) {
func (pf *PIDsFilter) GetSymTab(pid uint32) *gosym.Table {
pf.mux.RLock()
defer pf.mux.RUnlock()
return pf.symTabs[pid]
}

func (pf *PIDsFilter) addPID(pid, nsid uint32, s svc.ID, t PIDType, symTab *gosym.Table) {
ns, nsExists := pf.current[nsid]
if !nsExists {
ns = make(map[uint32]PIDInfo)
pf.current[nsid] = ns
}

_, stExists := pf.symTabs[pid]
if !stExists {
pf.symTabs[pid] = symTab
}

allPids, err := readNamespacePIDs(int32(pid))

if err != nil {
Expand All @@ -180,13 +196,14 @@ func (pf *PIDsFilter) removePID(pid, nsid uint32) {
if len(ns) == 0 {
delete(pf.current, nsid)
}
delete(pf.symTabs, pid)
}

// IdentityPidsFilter is a PIDsFilter that does not filter anything. It is feasible
// for system-wide instrumenation
type IdentityPidsFilter struct{}

func (pf *IdentityPidsFilter) AllowPID(_ uint32, _ uint32, _ svc.ID, _ PIDType) {}
func (pf *IdentityPidsFilter) AllowPID(_ uint32, _ uint32, _ svc.ID, _ PIDType, _ *gosym.Table) {}

func (pf *IdentityPidsFilter) BlockPID(_ uint32, _ uint32) {}

Expand All @@ -206,6 +223,10 @@ func (pf *IdentityPidsFilter) Filter(inputSpans []request.Span) []request.Span {
return inputSpans
}

func (pf *IdentityPidsFilter) GetSymTab(_ uint32) *gosym.Table {
return nil
}

func serviceInfo(pid uint32) svc.ID {
cached, ok := activePids.Get(pid)
if ok {
Expand Down
Loading
Loading