Here are 3 different fixes, all related to the MPTCP receive buffer:
- Patch 1: fix receive buffer space when recvmsg() blocks after receiving some data. For a fix introduced in v6.12, backported to v6.1.
- Patch 2: mptcp_cleanup_rbuf() can be called when no data has been copied. For 5.11.
- Patch 3: prevent excessive coalescing on receive, which can affect the throughput badly. It looks better to wait a bit before backporting this one to stable versions, to get more results. For 5.10.
Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org --- Please note that there is no urgency here as well: this can of course be sent to Linus next year!
Enjoy this holiday period!
--- Paolo Abeni (3): mptcp: fix recvbuffer adjust on sleeping rcvmsg mptcp: don't always assume copied data in mptcp_cleanup_rbuf() mptcp: prevent excessive coalescing on receive
net/mptcp/protocol.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) --- base-commit: a024e377efed31ecfb39210bed562932321345b3 change-id: 20241230-net-mptcp-rbuf-fixes-74526e59d951
Best regards,
From: Paolo Abeni pabeni@redhat.com
If the recvmsg() blocks after receiving some data - i.e. due to SO_RCVLOWAT - the MPTCP code will attempt multiple times to adjust the receive buffer size, wrongly accounting every time the cumulative of received data - instead of accounting only for the delta.
Address the issue moving mptcp_rcv_space_adjust just after the data reception and passing it only the just received bytes.
This also removes an unneeded difference between the TCP and MPTCP RX code path implementation.
Fixes: 581302298524 ("mptcp: error out earlier on disconnect") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni pabeni@redhat.com Reviewed-by: Mat Martineau martineau@kernel.org Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08a72242428c0348471a5995465aec32c67af347..27afdb7e2071b16dbc4dfa1199b6e78c784f7a7c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1939,6 +1939,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; }
+static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); + static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -1992,6 +1994,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; }
+ mptcp_rcv_space_adjust(msk, copied); return copied; }
@@ -2268,7 +2271,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, }
pr_debug("block timeout %ld\n", timeo); - mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2276,8 +2278,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } }
- mptcp_rcv_space_adjust(msk, copied); - out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS)
From: Paolo Abeni pabeni@redhat.com
Under some corner cases the MPTCP protocol can end-up invoking mptcp_cleanup_rbuf() when no data has been copied, but such helper assumes the opposite condition.
Explicitly drop such assumption and performs the costly call only when strictly needed - before releasing the msk socket lock.
Fixes: fd8976790a6c ("mptcp: be careful on MPTCP-level ack.") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni pabeni@redhat.com Reviewed-by: Mat Martineau martineau@kernel.org Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org --- net/mptcp/protocol.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 27afdb7e2071b16dbc4dfa1199b6e78c784f7a7c..5307fff9d995309591ed742801350078db519f79 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,13 +528,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); }
-static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow;
slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); + tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); }
@@ -551,7 +551,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); }
-static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; @@ -559,14 +559,14 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) int space = __mptcp_space(sk); bool cleanup, rx_empty;
- cleanup = (space > 0) && (space >= (old_space << 1)); - rx_empty = !__mptcp_rmem(sk); + cleanup = (space > 0) && (space >= (old_space << 1)) && copied; + rx_empty = !__mptcp_rmem(sk) && copied;
mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) - mptcp_subflow_cleanup_rbuf(ssk); + mptcp_subflow_cleanup_rbuf(ssk, copied); } }
@@ -2220,9 +2220,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
- /* be sure to advertise window change */ - mptcp_cleanup_rbuf(msk); - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue;
@@ -2271,6 +2268,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, }
pr_debug("block timeout %ld\n", timeo); + mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2278,6 +2276,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } }
+ mptcp_cleanup_rbuf(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS)
From: Paolo Abeni pabeni@redhat.com
Currently the skb size after coalescing is only limited by the skb layout (the skb must not carry frag_list). A single coalesced skb covering several MSS can potentially fill completely the receive buffer. In such a case, the snd win will zero until the receive buffer will be empty again, affecting tput badly.
Fixes: 8268ed4c9d19 ("mptcp: introduce and use mptcp_try_coalesce()") Cc: stable@vger.kernel.org # please delay 2 weeks after 6.13-final release Signed-off-by: Paolo Abeni pabeni@redhat.com Reviewed-by: Mat Martineau martineau@kernel.org Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5307fff9d995309591ed742801350078db519f79..1b2e7cbb577fc26280f31e58adceb36987112f54 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -136,6 +136,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, int delta;
if (MPTCP_SKB_CB(from)->offset || + ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false;
Hello:
This series was applied to netdev/net.git (main) by Jakub Kicinski kuba@kernel.org:
On Mon, 30 Dec 2024 19:12:29 +0100 you wrote:
Here are 3 different fixes, all related to the MPTCP receive buffer:
Patch 1: fix receive buffer space when recvmsg() blocks after receiving some data. For a fix introduced in v6.12, backported to v6.1.
Patch 2: mptcp_cleanup_rbuf() can be called when no data has been copied. For 5.11.
[...]
Here is the summary with links: - [net,1/3] mptcp: fix recvbuffer adjust on sleeping rcvmsg https://git.kernel.org/netdev/net/c/449e6912a252 - [net,2/3] mptcp: don't always assume copied data in mptcp_cleanup_rbuf() https://git.kernel.org/netdev/net/c/551844f26da2 - [net,3/3] mptcp: prevent excessive coalescing on receive https://git.kernel.org/netdev/net/c/56b824eb49d6
You are awesome, thank you!
linux-stable-mirror@lists.linaro.org