Skip to content

Commit

Permalink
mptcp: fix rcv buffer auto-tuning.
Browse files Browse the repository at this point in the history
The MPTCP code uses the assumption that the tcp_win_from_space() helper
does not use any TCP-specific field, and thus works correctly operating
on an MPTCP socket.
The commit dfa2f04 ("tcp: get rid of sysctl_tcp_adv_win_scale") broke
that assumption, and as a consequence most MPTCP connections stall on
zero-window events, because auto-tuning changes the rcv buffer size quite
randomly.
Address the issue by syncing the MPTCP auto-tuning code with the TCP
one again. To achieve that, factor out the window size logic into
socket-independent helpers, and reuse them in mptcp_rcv_space_adjust(). The
MPTCP-level scaling_ratio is selected as the minimum among all the subflows,
as a worst-case estimate.

Co-developed-by: Matthieu Baerts <[email protected]>
Signed-off-by: Matthieu Baerts <[email protected]>
Signed-off-by: Paolo Abeni <[email protected]>
  • Loading branch information
Paolo Abeni authored and intel-lab-lkp committed Jul 20, 2023
1 parent 9e4743b commit bf9cb14
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 14 deletions.
20 changes: 15 additions & 5 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -1430,22 +1430,32 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
__u32 *window_clamp, int wscale_ok,
__u8 *rcv_wscale, __u32 init_rcv_wnd);

static inline int tcp_win_from_space(const struct sock *sk, int space)
static inline int __tcp_win_from_space(u8 scaling_ratio, int space)
{
s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio;
s64 scaled_space = (s64)space * scaling_ratio;

return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
}

/* inverse of tcp_win_from_space() */
static inline int tcp_space_from_win(const struct sock *sk, int win)
static inline int tcp_win_from_space(const struct sock *sk, int space)
{
return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space);
}

/* inverse of __tcp_win_from_space() */
static inline int __tcp_space_from_win(u8 scaling_ratio, int win)
{
u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;

do_div(val, tcp_sk(sk)->scaling_ratio);
do_div(val, scaling_ratio);
return val;
}

static inline int tcp_space_from_win(const struct sock *sk, int win)
{
return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
}

static inline void tcp_scaling_ratio_init(struct sock *sk)
{
/* Assume a conservative default of 1200 bytes of payload per 4K page.
Expand Down
14 changes: 7 additions & 7 deletions net/mptcp/protocol.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
if (err)
return err;

msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio;
WRITE_ONCE(msk->first, ssock->sk);
subflow = mptcp_subflow_ctx(ssock->sk);
list_add(&subflow->node, &msk->conn_list);
Expand Down Expand Up @@ -1906,6 +1907,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
u8 scaling_ratio = 255;
u32 time, advmss = 1;
u64 rtt_us, mstamp;

Expand Down Expand Up @@ -1936,9 +1938,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)

rtt_us = max(sf_rtt_us, rtt_us);
advmss = max(sf_advmss, advmss);
scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
}

msk->rcvq_space.rtt_us = rtt_us;
msk->scaling_ratio = scaling_ratio;
if (time < (rtt_us >> 3) || rtt_us == 0)
return;

Expand All @@ -1947,8 +1951,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)

if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvmem, rcvbuf;
u64 rcvwin, grow;
int rcvbuf;

rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;

Expand All @@ -1957,18 +1961,14 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
do_div(grow, msk->rcvq_space.space);
rcvwin += (grow << 1);

rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER);
while (tcp_win_from_space(sk, rcvmem) < advmss)
rcvmem += 128;

do_div(rcvwin, advmss);
rcvbuf = min_t(u64, rcvwin * rcvmem,
rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin),
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));

if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp;

window_clamp = tcp_win_from_space(sk, rcvbuf);
window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf);
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

/* Make subflows follow along. If we do not do this, we
Expand Down
8 changes: 7 additions & 1 deletion net/mptcp/protocol.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ struct mptcp_sock {
bool csum_enabled;
bool allow_infinite_fallback;
u8 mpc_endpoint_id;
u8 scaling_ratio;
u8 recvmsg_inq:1,
cork:1,
nodelay:1,
Expand Down Expand Up @@ -349,9 +350,14 @@ static inline int __mptcp_rmem(const struct sock *sk)
return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
}

static inline int mptcp_win_from_space(const struct sock *sk, int space)
{
return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space);
}

static inline int __mptcp_space(const struct sock *sk)
{
return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
}

static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
Expand Down
2 changes: 1 addition & 1 deletion net/mptcp/subflow.c
Original file line number Diff line number Diff line change
Expand Up @@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
const struct sock *sk = subflow->conn;

*space = __mptcp_space(sk);
*full_space = tcp_full_space(sk);
*full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}

void __mptcp_error_report(struct sock *sk)
Expand Down

0 comments on commit bf9cb14

Please sign in to comment.