Skip to content

Commit

Permalink
Fix context propagation edge cases (#1572)
Browse files Browse the repository at this point in the history
  • Loading branch information
grcevski authored Jan 24, 2025
1 parent ad84610 commit 9e3697b
Show file tree
Hide file tree
Showing 24 changed files with 540 additions and 137 deletions.
13 changes: 5 additions & 8 deletions bpf/protocol_http.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,6 @@ static __always_inline void http_get_or_create_trace_info(http_connection_metada
//make_tp_string(tp_buf, &tp_p->tp);
//bpf_dbg_printk("tp: %s", tp_buf);

u8 tp_in_headers = false;

if (k_bpf_traceparent_enabled) {
// The below buffer scan can be expensive on high volume of requests. We make it optional
// for customers to enable it. Off by default.
Expand All @@ -113,7 +111,6 @@ static __always_inline void http_get_or_create_trace_info(http_connection_metada
unsigned char *res = bpf_strstr_tp_loop(buf, buf_len);

if (res) {
tp_in_headers = true;
bpf_dbg_printk("Found traceparent %s", res);
unsigned char *t_id = extract_trace_id(res);
unsigned char *s_id = extract_span_id(res);
Expand All @@ -137,11 +134,11 @@ static __always_inline void http_get_or_create_trace_info(http_connection_metada
if (meta) {
u32 type = trace_type_from_meta(meta);
set_trace_info_for_connection(conn, type, tp_p);
// If the user code setup traceparent manually, don't interfere and add
// something else with TC L7
if (meta->type == EVENT_HTTP_CLIENT && tp_in_headers) {
return;
}
// TODO: If the user code setup traceparent manually, don't interfere and add
// something else with TC L7. The main challenge is that with kprobes, the
// sock_msg program has already punched a hole in the HTTP headers and has made
// the HTTP header invalid. We need to add more smarts there or pull the
// sock msg information here and mark it so that we don't override the span_id.
server_or_client_trace(meta->type, conn, tp_p);
}
}
Expand Down
1 change: 1 addition & 0 deletions bpf/tc_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
enum { MAX_INLINE_LEN = 0x3ff };

const char TP[] = "Traceparent: 00-00000000000000000000000000000000-0000000000000000-01\r\n";
const char INV_TP[] = "W3C-BeylaID: 00-00000000000000000000000000000000-0000000000000000-01\r\n";
const u32 EXTEND_SIZE = sizeof(TP) - 1;
const char TP_PREFIX[] = "Traceparent: ";
const u32 TP_PREFIX_SIZE = sizeof(TP_PREFIX) - 1;
Expand Down
52 changes: 38 additions & 14 deletions bpf/tc_sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
// A map of sockets which we track with sock_ops. The sock_msg
// program subscribes to this map and runs for each new socket
// activity
// The map size must be max u16 to avoid accidentally losing
// the socket information
struct {
__uint(type, BPF_MAP_TYPE_SOCKHASH);
__uint(max_entries, SOCKOPS_MAP_SIZE);
Expand All @@ -34,11 +36,13 @@ typedef struct tc_http_ctx {

// A map that keeps all the HTTP packets we've extended with
// the sock_msg program and that Traffic Control needs to write to.
// The map size must be max u16 to avoid accidentally overwriting
// prior information of a live extended header.
struct tc_http_ctx_map {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__type(key, u32);
__type(value, struct tc_http_ctx);
__uint(max_entries, 10240);
__uint(max_entries, SOCKOPS_MAP_SIZE);
} tc_http_ctx_map SEC(".maps");

// Memory buffer and a map bellow as temporary storage for
Expand Down Expand Up @@ -209,7 +213,10 @@ static __always_inline u8 protocol_detector(struct sk_msg_md *msg,
// actual 'Traceparent:...' string.
if (is_http_request_buf((const unsigned char *)msg_buf.buf)) {
bpf_dbg_printk("Setting up request to be extended");
bpf_map_update_elem(&msg_buffers, &e_key, &msg_buf, BPF_ANY);
if (bpf_map_update_elem(&msg_buffers, &e_key, &msg_buf, BPF_ANY)) {
// fail if we can't setup a msg buffer
return 0;
}

return 1;
}
Expand Down Expand Up @@ -266,19 +273,36 @@ int beyla_packet_extender(struct sk_msg_md *msg) {

if (newline_pos >= 0) {
newline_pos++;
// Push extends the packet with empty space and sets up the
// metadata for Traffic Control to finish the writing
if (!bpf_msg_push_data(msg, newline_pos, EXTEND_SIZE, 0)) {
tc_http_ctx_t ctx = {
.offset = newline_pos,
.seen = 0,
.written = 0,
};
u32 port = msg->local_port;

bpf_map_update_elem(&tc_http_ctx_map, &port, &ctx, BPF_ANY);
tc_http_ctx_t ctx = {
.offset = newline_pos,
.seen = 0,
.written = 0,
};
u32 port = msg->local_port;

// We first attempt to register the metadata for TC to work with. If we
// fail, we shouldn't expand the packet!
long failed = bpf_map_update_elem(&tc_http_ctx_map, &port, &ctx, BPF_ANY);

if (!failed) {
// Push extends the packet with empty space and sets up the
// metadata for Traffic Control to finish the writing. If we
// fail (non-zero return value), we delete the metadata.
if (bpf_msg_push_data(msg, newline_pos, EXTEND_SIZE, 0)) {
// We two things to disable this context, we set the written
// and seen to their max value to disable the TC code, and then
// we also attempt delete. This is to ensure that we still have
// disabled the TC code if delete failed.
if (bpf_map_delete_elem(&tc_http_ctx_map, &port)) {
tc_http_ctx_t *bad_ctx = bpf_map_lookup_elem(&tc_http_ctx_map, &port);
if (bad_ctx) {
bad_ctx->written = EXTEND_SIZE;
bad_ctx->seen = bad_ctx->offset;
}
}
bpf_dbg_printk("offset to split %d", newline_pos);
}
}
bpf_dbg_printk("offset to split %d", newline_pos);
}
}

Expand Down
8 changes: 8 additions & 0 deletions bpf/tc_tracer.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,14 @@ int beyla_app_egress(struct __sk_buff *skb) {
if (tp->valid) {
encode_data_in_ip_options(skb, &conn, &tcp, tp, &e_key);
}
} else {
u32 s_port = e_key.s_port;
tc_http_ctx_t *ctx = (tc_http_ctx_t *)bpf_map_lookup_elem(&tc_http_ctx_map, &s_port);

if (ctx) {
bpf_dbg_printk("No trace-map info, filling up the hole setup by sk_msg");
write_traceparent(skb, &tcp, &e_key, ctx, (unsigned char *)INV_TP);
}
}

return TC_ACT_UNSPEC;
Expand Down
177 changes: 97 additions & 80 deletions bpf/tc_tracer_l7.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ static __always_inline unsigned char *tp_buf_mem(tp_info_t *tp) {
}

static __always_inline void l7_app_ctx_cleanup(egress_key_t *e_key) {
bpf_map_delete_elem(&tc_http_ctx_map, &e_key->s_port);
u32 s_port = e_key->s_port;
bpf_map_delete_elem(&tc_http_ctx_map, &s_port);
bpf_map_delete_elem(&outgoing_trace_map, e_key);
}

Expand All @@ -129,6 +130,99 @@ static __always_inline struct bpf_sock *lookup_sock_from_tuple(struct __sk_buff
return 0;
}

static __always_inline int write_traceparent(struct __sk_buff *skb,
protocol_info_t *tcp,
egress_key_t *e_key,
tc_http_ctx_t *ctx,
unsigned char *tp_buf) {
u32 tot_len = (u64)ctx_data_end(skb) - (u64)ctx_data(skb);
u32 packet_size = tot_len - tcp->hdr_len;
bpf_dbg_printk("Writing traceparent packet_size %d, offset %d, tot_len %d",
packet_size,
ctx->offset,
tot_len);
bpf_dbg_printk("seen %d, written %d", ctx->seen, ctx->written);

if (packet_size > 0) {
u32 len = 0;
u32 off = tcp->hdr_len;
// picked a value large enough to support TCP headers
bpf_clamp_umax(off, 128);
void *start = ctx_data(skb) + off;

// We haven't seen enough bytes coming through from the start
// until we did the split (at ctx->offset is where we injected)
// the empty space.
if (ctx->seen < ctx->offset) {
// Diff = How much more before we cross offset
u32 diff = ctx->offset - ctx->seen;
// We received less or equal bytes to what we want to
// reach ctx->offset, i.e the split point.
if (diff > packet_size) {
ctx->seen += packet_size;
return 0;
} else {
// We went over the split point, calculate how much can we
// write, but cap it to the max size = 70 bytes.
bpf_clamp_umax(diff, 2048);
start += diff;
ctx->seen = ctx->offset;
len = packet_size - diff;
bpf_clamp_umax(len, EXTEND_SIZE);
}
} else {
// Fast path. We are exactly at the offset, we've written
// nothing of the 'Traceparent: ...' text yet and the packet
// is exactly 70 bytes.
if (ctx->written == 0 && packet_size == EXTEND_SIZE) {
if ((start + EXTEND_SIZE) <= ctx_data_end(skb)) {
__builtin_memcpy(start, tp_buf, EXTEND_SIZE);
bpf_dbg_printk("Set the string fast_path!");
ctx->written = EXTEND_SIZE;
l7_app_ctx_cleanup(e_key);
return 0;
}
}

// Nope, we've written some bytes in another packet and we
// are not done writing yet.
if (ctx->written < EXTEND_SIZE) {
len = EXTEND_SIZE - ctx->written;
bpf_clamp_umax(len, EXTEND_SIZE);

if (len > packet_size) {
len = packet_size;
}
} else {
// We've written everything already, just clean up
l7_app_ctx_cleanup(e_key);
return 0;
}
}

if (len > 0) {
u32 tp_off = ctx->written;
// Keeps verifier happy
bpf_clamp_umax(tp_off, EXTEND_SIZE);
bpf_clamp_umax(len, EXTEND_SIZE);

if ((start + len) <= ctx_data_end(skb)) {
buf_memcpy((char *)start, (char *)tp_buf + tp_off, len, ctx_data_end(skb));
bpf_dbg_printk("Set the string off = %d, len = %d!", tp_off, len);
}

ctx->written += len;
// If we've written the full string this time around
// cleanup the metadata.
if (ctx->written >= EXTEND_SIZE) {
l7_app_ctx_cleanup(e_key);
}
}
}

return 0;
}

// This function does two things:
// 1. It adds sockets in the socket hash map which have already been
// established and we see them for the first time in Traffic Control, i.e
Expand Down Expand Up @@ -194,9 +288,7 @@ static __always_inline int l7_app_egress(struct __sk_buff *skb,
}
}

u32 tot_len = (u64)ctx_data_end(skb) - (u64)ctx_data(skb);
bpf_dbg_printk(
"egress, tot_len %d, s_port %d, data_start %d", tot_len, conn->s_port, tcp->hdr_len);
bpf_dbg_printk("egress, s_port %d, data_start %d", conn->s_port, tcp->hdr_len);

unsigned char *tp_buf = tp_buf_mem(&tp->tp);

Expand All @@ -211,82 +303,7 @@ static __always_inline int l7_app_egress(struct __sk_buff *skb,
// the case, seems plausible, but then the code below tries to handle any
// split. The 'fast path' handles the exact split as above.
if (ctx) {
u32 packet_size = tot_len - tcp->hdr_len;
bpf_dbg_printk("Found it! packet_size %d, offset %d", packet_size, ctx->offset);
bpf_dbg_printk("seen %d, written %d", ctx->seen, ctx->written);

if (packet_size > 0) {
u32 len = 0;
u32 off = tcp->hdr_len;
// picked a value large enough to support TCP headers
bpf_clamp_umax(off, 128);
void *start = ctx_data(skb) + off;

// We haven't seen enough bytes coming through from the start
// until we did the split (at ctx->offset is where we injected)
// the empty space.
if (ctx->seen < ctx->offset) {
// Diff = How much more before we cross offset
u32 diff = ctx->offset - ctx->seen;
// We received less or equal bytes to what we want to
// reach ctx->offset, i.e the split point.
if (diff <= packet_size) {
ctx->seen += packet_size;
return 0;
} else {
// We went over the split point, calculate how much can we
// write, but cap it to the max size = 70 bytes.
len = packet_size - diff;
bpf_clamp_umax(len, EXTEND_SIZE);
}
} else {
// Fast path. We are exactly at the offset, we've written
// nothing of the 'Traceparent: ...' text yet and the packet
// is exactly 70 bytes.
if (ctx->written == 0 && packet_size == EXTEND_SIZE) {
if ((start + EXTEND_SIZE) <= ctx_data_end(skb)) {
__builtin_memcpy(start, tp_buf, EXTEND_SIZE);
bpf_dbg_printk("Set the string fast_path!");
l7_app_ctx_cleanup(e_key);
return 0;
}
}

// Nope, we've written some bytes in another packet and we
// are not done writing yet.
if (ctx->written < EXTEND_SIZE) {
len = EXTEND_SIZE - ctx->written;
bpf_clamp_umax(len, EXTEND_SIZE);

if (len > packet_size) {
len = packet_size;
}
} else {
// We've written everything already, just clean up
l7_app_ctx_cleanup(e_key);
return 0;
}
}

if (len > 0) {
u32 tp_off = ctx->written;
// Keeps verifier happy
bpf_clamp_umax(tp_off, EXTEND_SIZE);
bpf_clamp_umax(len, EXTEND_SIZE);

if ((start + len) <= ctx_data_end(skb)) {
buf_memcpy((char *)start, (char *)tp_buf + tp_off, len, ctx_data_end(skb));
bpf_dbg_printk("Set the string off = %d, len = %d!", tp_off, len);
}

ctx->written += len;
// If we've written the full string this time around
// cleanup the metadata.
if (ctx->written >= EXTEND_SIZE) {
l7_app_ctx_cleanup(e_key);
}
}
}
return write_traceparent(skb, tcp, e_key, ctx, tp_buf);
}

return 0;
Expand Down
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_arm64_bpfel.o
Git LFS file not shown
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_debug_arm64_bpfel.o
Git LFS file not shown
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_debug_x86_bpfel.o
Git LFS file not shown
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_tp_arm64_bpfel.o
Git LFS file not shown
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_tp_debug_arm64_bpfel.o
Git LFS file not shown
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_tp_debug_x86_bpfel.o
Git LFS file not shown
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_tp_x86_bpfel.o
Git LFS file not shown
4 changes: 2 additions & 2 deletions pkg/internal/ebpf/generictracer/bpf_x86_bpfel.o
Git LFS file not shown
2 changes: 1 addition & 1 deletion pkg/internal/ebpf/gotracer/bpf_debug_x86_bpfel.o
Git LFS file not shown
Loading

0 comments on commit 9e3697b

Please sign in to comment.