This series contains 4 independent new features:
- Patch 1: use HMAC-SHA256 library instead of open-coded HMAC.
- Patch 2: selftests: check for unexpected fallback counter increments.
- Patches 3-4: record subflows in RPS table, for aRFS support.
Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org
---
Changes in v2:
- Drop previous patches 2 ("mptcp: make ADD_ADDR retransmission timeout
  adaptive") + 3 ("selftests: mptcp: remove add_addr_timeout settings"):
  They were introducing instabilities in the selftests.
- Rebased. Other patches have not been modified.
- Link to v1: https://lore.kernel.org/r/20250901-net-next-mptcp-misc-feat-6-18-v1-0-80ae80...
---
Christoph Paasch (2):
      net: Add rfs_needed() helper
      mptcp: record subflows in RPS table

Eric Biggers (1):
      mptcp: use HMAC-SHA256 library instead of open-coded HMAC

Gang Yan (1):
      selftests: mptcp: add checks for fallback counters

 include/net/rps.h                               |  85 ++++++++++------
 net/mptcp/crypto.c                              |  35 +------
 net/mptcp/protocol.c                            |  21 ++++
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 123 ++++++++++++++++++++++++
 4 files changed, 202 insertions(+), 62 deletions(-)
---
base-commit: cd8a4cfa6bb43a441901e82f5c222dddc75a18a3
change-id: 20250829-net-next-mptcp-misc-feat-6-18-722fa87a60f1
Best regards,
From: Eric Biggers ebiggers@kernel.org
Now that there are easy-to-use HMAC-SHA256 library functions, use these in net/mptcp/crypto.c instead of open-coding the HMAC algorithm.
Remove the WARN_ON_ONCE() for messages longer than SHA256_DIGEST_SIZE. The new implementation handles all message lengths correctly.
The mptcp-crypto KUnit test still passes after this change.
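For reference, the library call wraps the full HMAC construction
(H((K ^ opad) || H((K ^ ipad) || msg))) internally, so the caller only has to
provide the raw key bytes. A minimal sketch of the resulting call pattern,
assuming a kernel build context (the wrapper name example_mptcp_hmac() is
illustrative; the real change is in the hunk below):

#include <linux/kernel.h>
#include <crypto/sha2.h>

/* Illustrative only: mirrors what mptcp_crypto_hmac_sha() does after this
 * patch.
 */
static void example_mptcp_hmac(u64 key1, u64 key2, const u8 *msg, int len,
			       u8 out[SHA256_DIGEST_SIZE])
{
	/* MPTCP's HMAC key is the two 64-bit keys, big-endian, concatenated */
	__be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) };

	/* the library handles ipad/opad and any message length internally */
	hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, out);
}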
Signed-off-by: Eric Biggers ebiggers@kernel.org
Reviewed-by: Matthieu Baerts (NGI0) matttbe@kernel.org
Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org
---
 net/mptcp/crypto.c | 35 ++---------------------------------
 1 file changed, 2 insertions(+), 33 deletions(-)

diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index b08ba959ac4fd485bb833043ff58d4c8bac8a37a..31948e18d97da7ee0ee2ae9e4f7c9ca0e3b330a7 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -22,7 +22,6 @@
 
 #include <linux/kernel.h>
 #include <crypto/sha2.h>
-#include <linux/unaligned.h>
 
 #include "protocol.h"
 
@@ -43,39 +42,9 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
 
 void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
 {
-	u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
-	u8 key1be[8];
-	u8 key2be[8];
-	int i;
+	__be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) };
 
-	if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
-		len = SHA256_DIGEST_SIZE;
-
-	put_unaligned_be64(key1, key1be);
-	put_unaligned_be64(key2, key2be);
-
-	/* Generate key xored with ipad */
-	memset(input, 0x36, SHA256_BLOCK_SIZE);
-	for (i = 0; i < 8; i++)
-		input[i] ^= key1be[i];
-	for (i = 0; i < 8; i++)
-		input[i + 8] ^= key2be[i];
-
-	memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
-
-	/* emit sha256(K1 || msg) on the second input block, so we can
-	 * reuse 'input' for the last hashing
-	 */
-	sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]);
-
-	/* Prepare second part of hmac */
-	memset(input, 0x5C, SHA256_BLOCK_SIZE);
-	for (i = 0; i < 8; i++)
-		input[i] ^= key1be[i];
-	for (i = 0; i < 8; i++)
-		input[i + 8] ^= key2be[i];
-
-	sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac);
+	hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, hmac);
 }
 
 #if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
From: Gang Yan yangang@kylinos.cn
Recently, some MIB counters related to fallback have been added. This patch provides a way to check that these counters behave as expected during test execution.
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/571
Signed-off-by: Gang Yan yangang@kylinos.cn
Reviewed-by: Matthieu Baerts (NGI0) matttbe@kernel.org
Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org
---
 tools/testing/selftests/net/mptcp/mptcp_join.sh | 123 ++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 82cae37d9c2026cc55466636d53a76f929a03452..2f046167a0b6cc6fb5531a033d8d95c9ea399cf9 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -74,6 +74,17 @@ unset join_create_err
 unset join_bind_err
 unset join_connect_err
 
+unset fb_ns1
+unset fb_ns2
+unset fb_infinite_map_tx
+unset fb_dss_corruption
+unset fb_simult_conn
+unset fb_mpc_passive
+unset fb_mpc_active
+unset fb_mpc_data
+unset fb_md5_sig
+unset fb_dss
+
 # generated using "nfbpf_compile '(ip && (ip[54] & 0xf0) == 0x30) ||
 #		(ip6 && (ip6[74] & 0xf0) == 0x30)'"
 CBPF_MPTCP_SUBOPTION_ADD_ADDR="14,
@@ -1399,6 +1410,115 @@ chk_join_tx_nr()
 	print_results "join Tx" ${rc}
 }
 
+chk_fallback_nr()
+{
+	local infinite_map_tx=${fb_infinite_map_tx:-0}
+	local dss_corruption=${fb_dss_corruption:-0}
+	local simult_conn=${fb_simult_conn:-0}
+	local mpc_passive=${fb_mpc_passive:-0}
+	local mpc_active=${fb_mpc_active:-0}
+	local mpc_data=${fb_mpc_data:-0}
+	local md5_sig=${fb_md5_sig:-0}
+	local dss=${fb_dss:-0}
+	local rc=${KSFT_PASS}
+	local ns=$1
+	local count
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtInfiniteMapTx")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$infinite_map_tx" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns infinite map tx fallback"
+		fail_test "got $count infinite map tx fallback[s] in $ns expected $infinite_map_tx"
+	fi
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtDSSCorruptionFallback")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$dss_corruption" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns dss corruption fallback"
+		fail_test "got $count dss corruption fallback[s] in $ns expected $dss_corruption"
+	fi
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtSimultConnectFallback")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$simult_conn" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns simult conn fallback"
+		fail_test "got $count simult conn fallback[s] in $ns expected $simult_conn"
+	fi
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableFallbackACK")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$mpc_passive" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns mpc passive fallback"
+		fail_test "got $count mpc passive fallback[s] in $ns expected $mpc_passive"
+	fi
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableFallbackSYNACK")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$mpc_active" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns mpc active fallback"
+		fail_test "got $count mpc active fallback[s] in $ns expected $mpc_active"
+	fi
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMPCapableDataFallback")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$mpc_data" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns mpc data fallback"
+		fail_test "got $count mpc data fallback[s] in $ns expected $mpc_data"
+	fi
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtMD5SigFallback")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$md5_sig" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns MD5 Sig fallback"
+		fail_test "got $count MD5 Sig fallback[s] in $ns expected $md5_sig"
+	fi
+
+	count=$(mptcp_lib_get_counter ${!ns} "MPTcpExtDssFallback")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$dss" ]; then
+		rc=${KSFT_FAIL}
+		print_check "$ns dss fallback"
+		fail_test "got $count dss fallback[s] in $ns expected $dss"
+	fi
+
+	return $rc
+}
+
+chk_fallback_nr_all()
+{
+	local netns=("ns1" "ns2")
+	local fb_ns=("fb_ns1" "fb_ns2")
+	local rc=${KSFT_PASS}
+
+	for i in 0 1; do
+		if [ -n "${!fb_ns[i]}" ]; then
+			eval "${!fb_ns[i]}" \
+				chk_fallback_nr ${netns[i]} || rc=${?}
+		else
+			chk_fallback_nr ${netns[i]} || rc=${?}
+		fi
+	done
+
+	if [ "${rc}" != "${KSFT_PASS}" ]; then
+		print_results "fallback" ${rc}
+	fi
+}
+
 chk_join_nr()
 {
 	local syn_nr=$1
@@ -1484,6 +1604,8 @@ chk_join_nr()
 	join_syn_tx="${join_syn_tx:-${syn_nr}}" \
 		chk_join_tx_nr
 
+	chk_fallback_nr_all
+
 	if $validate_checksum; then
 		chk_csum_nr $csum_ns1 $csum_ns2
 		chk_fail_nr $fail_nr $fail_nr
@@ -3337,6 +3459,7 @@ fail_tests()
 		join_csum_ns1=+1 join_csum_ns2=+0 \
 			join_fail_nr=1 join_rst_nr=0 join_infi_nr=1 \
 			join_corrupted_pkts="$(pedit_action_pkts)" \
+			fb_ns1="fb_dss=1" fb_ns2="fb_infinite_map_tx=1" \
 			chk_join_nr 0 0 0
 		chk_fail_nr 1 -1 invert
 	fi
From: Christoph Paasch cpaasch@openai.com
Add a helper to check whether RFS is needed. This makes the code a bit cleaner and lets the next patch have MPTCP use the helper to decide whether or not to iterate over its subflows.
tun_flow_update() was calling sock_rps_record_flow_hash() regardless of the state of rfs_needed. This was not really a bug, as sock_flow_table simply ends up being NULL and everything still works. This commit thus also implicitly makes tun_flow_update() respect the state of rfs_needed.
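The pattern the helper enables looks roughly like this (a minimal sketch with illustrative function and variable names; the real MPTCP caller is added in the next patch):

#include <net/rps.h>
#include <net/sock.h>

/* Skip the whole loop when no RFS flow table is in use: per the hunk below,
 * rfs_is_needed() is the rfs_needed static branch with CONFIG_RPS, and a
 * constant false without it.
 */
static void example_record_flows(struct sock *socks[], int nr)
{
	int i;

	if (!rfs_is_needed())
		return;

	for (i = 0; i < nr; i++)
		sock_rps_record_flow(socks[i]);
}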
Suggested-by: Matthieu Baerts matttbe@kernel.org
Signed-off-by: Christoph Paasch cpaasch@openai.com
Acked-by: Matthieu Baerts (NGI0) matttbe@kernel.org
Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org
---
 include/net/rps.h | 85 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 29 deletions(-)

diff --git a/include/net/rps.h b/include/net/rps.h
index 9917dce42ca457e9c25d9e84ee450235f771d09b..f1794cd2e7fb32a36bde9959fab651663ab190fd 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -85,11 +85,8 @@ static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
 	WRITE_ONCE(table->ents[index], val);
 }
 
-#endif /* CONFIG_RPS */
-
-static inline void sock_rps_record_flow_hash(__u32 hash)
+static inline void _sock_rps_record_flow_hash(__u32 hash)
 {
-#ifdef CONFIG_RPS
 	struct rps_sock_flow_table *sock_flow_table;
 
 	if (!hash)
@@ -99,42 +96,33 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 	if (sock_flow_table)
 		rps_record_sock_flow(sock_flow_table, hash);
 	rcu_read_unlock();
-#endif
 }
 
-static inline void sock_rps_record_flow(const struct sock *sk)
+static inline void _sock_rps_record_flow(const struct sock *sk)
 {
-#ifdef CONFIG_RPS
-	if (static_branch_unlikely(&rfs_needed)) {
-		/* Reading sk->sk_rxhash might incur an expensive cache line
-		 * miss.
-		 *
-		 * TCP_ESTABLISHED does cover almost all states where RFS
-		 * might be useful, and is cheaper [1] than testing :
-		 *	IPv4: inet_sk(sk)->inet_daddr
-		 *	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
-		 *	OR	an additional socket flag
-		 * [1] : sk_state and sk_prot are in the same cache line.
+	/* Reading sk->sk_rxhash might incur an expensive cache line
+	 * miss.
+	 *
+	 * TCP_ESTABLISHED does cover almost all states where RFS
+	 * might be useful, and is cheaper [1] than testing :
+	 *	IPv4: inet_sk(sk)->inet_daddr
+	 *	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+	 *	OR	an additional socket flag
+	 * [1] : sk_state and sk_prot are in the same cache line.
+	 */
+	if (sk->sk_state == TCP_ESTABLISHED) {
+		/* This READ_ONCE() is paired with the WRITE_ONCE()
+		 * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
 		 */
-		if (sk->sk_state == TCP_ESTABLISHED) {
-			/* This READ_ONCE() is paired with the WRITE_ONCE()
-			 * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
-			 */
-			sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
-		}
+		_sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
 	}
-#endif
 }
 
-static inline void sock_rps_delete_flow(const struct sock *sk)
+static inline void _sock_rps_delete_flow(const struct sock *sk)
 {
-#ifdef CONFIG_RPS
 	struct rps_sock_flow_table *table;
 	u32 hash, index;
 
-	if (!static_branch_unlikely(&rfs_needed))
-		return;
-
 	hash = READ_ONCE(sk->sk_rxhash);
 	if (!hash)
 		return;
@@ -147,6 +135,45 @@ static inline void sock_rps_delete_flow(const struct sock *sk)
 			WRITE_ONCE(table->ents[index], RPS_NO_CPU);
 	}
 	rcu_read_unlock();
+}
+#endif /* CONFIG_RPS */
+
+static inline bool rfs_is_needed(void)
+{
+#ifdef CONFIG_RPS
+	return static_branch_unlikely(&rfs_needed);
+#else
+	return false;
+#endif
+}
+
+static inline void sock_rps_record_flow_hash(__u32 hash)
+{
+#ifdef CONFIG_RPS
+	if (!rfs_is_needed())
+		return;
+
+	_sock_rps_record_flow_hash(hash);
+#endif
+}
+
+static inline void sock_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	if (!rfs_is_needed())
+		return;
+
+	_sock_rps_record_flow(sk);
+#endif
+}
+
+static inline void sock_rps_delete_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	if (!rfs_is_needed())
+		return;
+
+	_sock_rps_delete_flow(sk);
 #endif
 }
 
From: Christoph Paasch cpaasch@openai.com
Accelerated Receive Flow Steering (aRFS) relies on sockets recording their RX flow hash into the rps_sock_flow_table so that incoming packets are steered to the CPU where the application runs.
With MPTCP, the application interacts with the parent MPTCP socket while data is carried over per-subflow TCP sockets. Without recording these subflows, aRFS cannot steer interrupts and RX processing for the flows to the desired CPU.
Record all subflows in the RPS table by calling sock_rps_record_flow() for each subflow at the start of mptcp_sendmsg(), mptcp_recvmsg() and mptcp_stream_accept(), using the new helper mptcp_rps_record_subflows().
This does not by itself improve throughput, but it ensures that IRQ and RX processing are directed to the right CPU, which is a prerequisite for effective aRFS.
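A sketch of the mechanism shared by those three call sites (the function name record_subflows_sketch() is illustrative and "protocol.h" is the in-tree net/mptcp header; the real helper, mptcp_rps_record_subflows(), is in the hunk below):

#include <net/rps.h>

#include "protocol.h"	/* mptcp_for_each_subflow(), mptcp_subflow_tcp_sock() */

/* The MPTCP socket itself carries no TCP flow hash, so record each subflow's
 * TCP socket instead, and only when an RFS flow table is configured.
 */
static void record_subflows_sketch(const struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	if (!rfs_is_needed())
		return;

	mptcp_for_each_subflow(msk, subflow)
		sock_rps_record_flow(mptcp_subflow_tcp_sock(subflow));
}

Recording at sendmsg()/recvmsg()/accept() time keeps the flow-table entries pointing at the CPU that last ran the application, which is what RFS/aRFS uses to steer the subflows' RX packets.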
Signed-off-by: Christoph Paasch cpaasch@openai.com
Reviewed-by: Matthieu Baerts (NGI0) matttbe@kernel.org
Signed-off-by: Matthieu Baerts (NGI0) matttbe@kernel.org
---
 net/mptcp/protocol.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index ad41c48126e44fda646f1ec1c81957db1407a6cc..a8d57b88578dfea807d3d55e430849aa8005c637 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -12,6 +12,7 @@
 #include <linux/sched/signal.h>
 #include <linux/atomic.h>
 #include <net/aligned_data.h>
+#include <net/rps.h>
 #include <net/sock.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
@@ -1740,6 +1741,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
 	return limit - not_sent;
 }
 
+static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+
+	if (!rfs_is_needed())
+		return;
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		sock_rps_record_flow(ssk);
+	}
+}
+
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1753,6 +1768,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	lock_sock(sk);
 
+	mptcp_rps_record_subflows(msk);
+
 	if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
 		     msg->msg_flags & MSG_FASTOPEN)) {
 		int copied_syn = 0;
@@ -2131,6 +2148,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		goto out_err;
 	}
 
+	mptcp_rps_record_subflows(msk);
+
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	len = min_t(size_t, len, INT_MAX);
@@ -3922,6 +3941,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 			mptcp_sock_graft(ssk, newsock);
 		}
 
+		mptcp_rps_record_subflows(msk);
+
 		/* Do late cleanup for the first subflow as necessary. Also
 		 * deal with bad peers not doing a complete shutdown.
 		 */
Hello:
This series was applied to netdev/net-next.git (main) by Jakub Kicinski kuba@kernel.org:
On Tue, 02 Sep 2025 23:11:32 +0200 you wrote:
> This series contains 4 independent new features:
>
> - Patch 1: use HMAC-SHA256 library instead of open-coded HMAC.
> - Patch 2: selftests: check for unexpected fallback counter increments.
> - Patches 3-4: record subflows in RPS table, for aRFS support.
[...]
Here is the summary with links:
  - [net-next,v2,1/4] mptcp: use HMAC-SHA256 library instead of open-coded HMAC
    https://git.kernel.org/netdev/net-next/c/2d5be5629ce7
  - [net-next,v2,2/4] selftests: mptcp: add checks for fallback counters
    https://git.kernel.org/netdev/net-next/c/3fff72f827ad
  - [net-next,v2,3/4] net: Add rfs_needed() helper
    https://git.kernel.org/netdev/net-next/c/929324913e0c
  - [net-next,v2,4/4] mptcp: record subflows in RPS table
    https://git.kernel.org/netdev/net-next/c/3bd4f98a4e2c
You are awesome, thank you!