Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

net-timestamp: bpf extension to equip applications transparently #4926

Open
wants to merge 12 commits into
base: bpf-next_base
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -1508,6 +1508,7 @@ struct bpf_sock_ops_kern {
void *skb_data_end;
u8 op;
u8 is_fullsock;
u8 is_locked_tcp_sock;
u8 remaining_opt_len;
u64 temp; /* temp and everything after is not
* initialized to 0 before calling
Expand Down
12 changes: 9 additions & 3 deletions include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ struct skb_shared_hwtstamps {
/* Definitions for tx_flags in struct skb_shared_info */
enum {
/* generate hardware time stamp */
SKBTX_HW_TSTAMP = 1 << 0,
SKBTX_HW_TSTAMP_NOBPF = 1 << 0,

/* generate software time stamp when queueing packet to NIC */
SKBTX_SW_TSTAMP = 1 << 1,
Expand All @@ -489,10 +489,16 @@ enum {

/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,

/* used for bpf extension when a bpf program is loaded */
SKBTX_BPF = 1 << 7,
};

#define SKBTX_HW_TSTAMP (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF)

#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
SKBTX_SCHED_TSTAMP)
SKBTX_SCHED_TSTAMP | \
SKBTX_BPF)
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \
SKBTX_HW_TSTAMP_USE_CYCLES | \
SKBTX_ANY_SW_TSTAMP)
Expand Down Expand Up @@ -4564,7 +4570,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
skb_clone_tx_timestamp(skb);
if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
if (skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF))
skb_tstamp_tx(skb, NULL);
}

Expand Down
10 changes: 10 additions & 0 deletions include/net/sock.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
* @sk_bpf_cb_flags: used in bpf_setsockopt()
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
Expand Down Expand Up @@ -445,6 +446,8 @@ struct sock {
u32 sk_reserved_mem;
int sk_forward_alloc;
u32 sk_tsflags;
#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
u32 sk_bpf_cb_flags;
__cacheline_group_end(sock_write_rxtx);

__cacheline_group_begin(sock_write_tx);
Expand Down Expand Up @@ -2920,6 +2923,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);

void sock_enable_timestamps(struct sock *sk);
/* Hook used to run sock_ops BPF programs for tx timestamping events.
 * When CONFIG_CGROUP_BPF is not set, the hook compiles down to an empty
 * inline function so that callers need no #ifdef of their own.
 */
#ifdef CONFIG_CGROUP_BPF
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
#else
static inline void bpf_skops_tx_timestamping(struct sock *sk,
					     struct sk_buff *skb, int op)
{
}
#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
Expand Down
5 changes: 3 additions & 2 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -958,10 +958,10 @@ struct tcp_skb_cb {

__u8 sacked; /* State flags for SACK. */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */
__u8 txstamp_ack:2, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
unused:5;
unused:4;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
Expand Down Expand Up @@ -2649,6 +2649,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
sock_ops.is_locked_tcp_sock = 1;
sock_owned_by_me(sk);
}

Expand Down
30 changes: 30 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -6916,6 +6916,13 @@ enum {
BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};

/* Definitions for bpf_sk_cb_flags.
 *
 * SK_BPF_CB_MASK covers every flag bit up to and including the highest
 * defined one; it is used to reject unknown bits passed by the bpf prog.
 */
enum {
	SK_BPF_CB_TX_TIMESTAMPING	= (1 << 0),
	SK_BPF_CB_MASK			= SK_BPF_CB_TX_TIMESTAMPING |
					  (SK_BPF_CB_TX_TIMESTAMPING - 1)
};

/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
Expand Down Expand Up @@ -7028,6 +7035,28 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
BPF_SOCK_OPS_TS_SCHED_OPT_CB, /* Called when skb is passing through
* dev layer when SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TS_SW_OPT_CB, /* Called when skb is about to be sent
* to the NIC when SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TS_HW_OPT_CB, /* Called in hardware phase when
* SK_BPF_CB_TX_TIMESTAMPING feature
* is on.
*/
BPF_SOCK_OPS_TS_ACK_OPT_CB, /* Called when all the skbs in the
* same sendmsg call are acked
* when SK_BPF_CB_TX_TIMESTAMPING
* feature is on.
*/
BPF_SOCK_OPS_TS_SND_CB, /* Called on every sendmsg syscall.
* It's used to correlate the
* sendmsg timestamp with the
* corresponding tskey.
*/
};

/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
Expand Down Expand Up @@ -7094,6 +7123,7 @@ enum {
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
SK_BPF_CB_FLAGS = 1009, /* Used to set socket bpf flags */
};

enum {
Expand Down
1 change: 1 addition & 0 deletions kernel/bpf/btf.c
Original file line number Diff line number Diff line change
Expand Up @@ -8522,6 +8522,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
Expand Down
3 changes: 2 additions & 1 deletion net/core/dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -4500,7 +4500,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);

if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
if (unlikely(skb_shinfo(skb)->tx_flags &
(SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

/* Disable soft irqs for various locks below. Also
Expand Down
75 changes: 70 additions & 5 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -5222,6 +5222,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};

/* Read or write sk->sk_bpf_cb_flags on behalf of the SK_BPF_CB_FLAGS
 * socket option.
 *
 * @sk:     socket whose flags are accessed
 * @optval: buffer accessed as a u32; on get it receives the current
 *          flags, on set it supplies the new flags
 * @getopt: true to read the flags, false to replace them
 *
 * Returns 0 on success, or -EINVAL when a set request carries any bit
 * outside SK_BPF_CB_MASK (the socket is left unchanged in that case).
 */
static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
{
	u32 *val = (u32 *)optval;
	u32 new_flags;

	if (!getopt) {
		new_flags = *val;

		/* Refuse flag bits this kernel does not know about. */
		if (new_flags & ~SK_BPF_CB_MASK)
			return -EINVAL;

		sk->sk_bpf_cb_flags = new_flags;
	} else {
		*val = sk->sk_bpf_cb_flags;
	}

	return 0;
}

static int sol_socket_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
Expand All @@ -5238,6 +5257,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
case SO_MAX_PACING_RATE:
case SO_BINDTOIFINDEX:
case SO_TXREHASH:
case SK_BPF_CB_FLAGS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
Expand All @@ -5247,6 +5267,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
return -EINVAL;
}

if (optname == SK_BPF_CB_FLAGS)
return sk_bpf_set_get_cb_flags(sk, optval, getopt);

if (getopt) {
if (optname == SO_BINDTODEVICE)
return -EINVAL;
Expand Down Expand Up @@ -5500,6 +5523,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
}

/* Tell whether the sock_ops callback being run is one whose op value is
 * no greater than BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
 *
 * NOTE(review): presumably every callback up to that op runs with a
 * locked TCP socket, while later ops (the timestamping callbacks) may
 * not -- confirm against the call sites.
 */
static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
{
	return !(bpf_sock->op > BPF_SOCK_OPS_WRITE_HDR_OPT_CB);
}

static int _bpf_setsockopt(struct sock *sk, int level, int optname,
char *optval, int optlen)
{
Expand Down Expand Up @@ -5650,6 +5678,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}

Expand Down Expand Up @@ -5735,6 +5766,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
int ret, copy_len = 0;
Expand Down Expand Up @@ -5777,6 +5811,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
struct sock *sk = bpf_sock->sk;
int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;

if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;

Expand Down Expand Up @@ -7586,6 +7623,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
u8 search_kind, search_len, copy_len, magic_len;
int ret;

if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;

/* 2 byte is the minimal option len except TCPOPT_NOP and
* TCPOPT_EOL which are useless for the bpf prog to learn
* and this helper disallow loading them also.
Expand Down Expand Up @@ -10358,10 +10398,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
} \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
is_locked_tcp_sock), \
fullsock_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
if (si->dst_reg == si->src_reg) \
*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
Expand Down Expand Up @@ -10446,10 +10486,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
is_locked_tcp_sock), \
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
Expand Down Expand Up @@ -12062,6 +12102,21 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
#endif
}

/* Mark the skb of the current sock_ops context for BPF tx timestamping.
 *
 * @skops: sock_ops context; only accepted when the running callback is
 *         BPF_SOCK_OPS_TS_SND_CB
 *
 * Sets SKBTX_BPF in the skb's tx_flags, sets the BPF bit (value 2) in
 * the tcp_skb_cb txstamp_ack field, and stores the skb's TCP sequence
 * number plus its length minus one as the tskey.
 *
 * Returns 0 on success, or -EOPNOTSUPP when called from any other
 * sock_ops callback.
 */
__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops)
{
	struct sk_buff *skb;

	if (skops->op != BPF_SOCK_OPS_TS_SND_CB)
		return -EOPNOTSUPP;

	skb = skops->skb;
	/* txstamp_ack is a 2-bit field: OR the BPF bit in rather than
	 * assigning it, so that a value of 1 already stored there
	 * (presumably by the SO_TIMESTAMPING ACK path) is preserved.
	 */
	TCP_SKB_CB(skb)->txstamp_ack |= 2;
	skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
	skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;

	return 0;
}

__bpf_kfunc_end_defs();

int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
Expand Down Expand Up @@ -12095,6 +12150,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)

/* kfunc id set for sock_ops programs; currently holds only
 * bpf_sock_ops_enable_tx_tstamp, flagged KF_TRUSTED_ARGS.
 */
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)

static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_skb,
Expand All @@ -12115,6 +12174,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
.set = &bpf_kfunc_check_set_tcp_reqsk,
};

/* Wraps the sock_ops kfunc id set so that bpf_kfunc_init() can register
 * it for BPF_PROG_TYPE_SOCK_OPS programs.
 */
static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_sock_ops,
};

static int __init bpf_kfunc_init(void)
{
int ret;
Expand All @@ -12133,7 +12197,8 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
}
late_initcall(bpf_kfunc_init);

Expand Down
Loading
Loading