From: Wilfred Mallawa wilfred.mallawa@wdc.com
During a handshake, an endpoint may specify a maximum record size limit. Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the maximum record size. Meaning that, the outgoing records from the kernel can exceed a lower size negotiated during the handshake. In such a case, the TLS endpoint must send a fatal "record_overflow" alert [1], and thus the record is discarded.
Upcoming Western Digital NVMe-TCP hardware controllers implement TLS support. For these devices, supporting TLS record size negotiation is necessary because the maximum TLS record size supported by the controller is less than the default 16KB currently used by the kernel.
Currently, there is no way to inform the kernel of such a limit. This patch adds support to a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that allows for setting the maximum plaintext fragment size. Once set, outgoing records are no larger than the size specified. This option can be used to specify the record size limit.
[1] https://www.rfc-editor.org/rfc/rfc8449
Signed-off-by: Wilfred Mallawa wilfred.mallawa@wdc.com --- Changes V4 -> V5 - Change the socket option to TLS_TX_MAX_PAYLOAD_LEN, such that we can limit the payload length in a generic way, as pposed to strictly specifying record size limit. No functional changes other than removing TLS 1.3 content byte length checks for this argument.
- Lock the socket when calling do_tls_setsockopt_tx_payload_len()
V4: https://lore.kernel.org/netdev/20250923053207.113938-1-wilfred.opensource@gm... --- Documentation/networking/tls.rst | 11 ++++++ include/net/tls.h | 3 ++ include/uapi/linux/tls.h | 2 ++ net/tls/tls_device.c | 2 +- net/tls/tls_main.c | 62 ++++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 2 +- 6 files changed, 80 insertions(+), 2 deletions(-)
diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index 36cc7afc2527..dabab17ab84a 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -280,6 +280,17 @@ If the record decrypted turns out to had been padded or is not a data record it will be decrypted again into a kernel buffer without zero copy. Such events are counted in the ``TlsDecryptRetry`` statistic.
+TLS_TX_MAX_PAYLOAD_LEN +~~~~~~~~~~~~~~~~~~~~~~ + +Sets the maximum size for the plaintext of a protected record. + +When this option is set, the kernel enforces this limit on all transmitted TLS +records, ensuring no plaintext fragment exceeds the specified size. This can be +used to specify the TLS Record Size Limit [1]. + +[1] https://datatracker.ietf.org/doc/html/rfc8449 + Statistics ==========
diff --git a/include/net/tls.h b/include/net/tls.h index 857340338b69..f2af113728aa 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -53,6 +53,8 @@ struct tls_rec;
/* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) +/* Minimum record size limit as per RFC8449 */ +#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6)
#define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE @@ -226,6 +228,7 @@ struct tls_context { u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; + u16 tx_max_payload_len;
int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index b66a800389cc..b8b9c42f848c 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -41,6 +41,7 @@ #define TLS_RX 2 /* Set receive parameters */ #define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */ #define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */ +#define TLS_TX_MAX_PAYLOAD_LEN 5 /* Maximum plaintext size */
/* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -194,6 +195,7 @@ enum { TLS_INFO_RXCONF, TLS_INFO_ZC_RO_TX, TLS_INFO_RX_NO_PAD, + TLS_INFO_TX_MAX_PAYLOAD_LEN, __TLS_INFO_MAX, }; #define TLS_INFO_MAX (__TLS_INFO_MAX - 1) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a64ae15b1a60..c6289c73cffc 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -461,7 +461,7 @@ static int tls_push_data(struct sock *sk, /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ - max_open_record_len = TLS_MAX_PAYLOAD_SIZE + + max_open_record_len = tls_ctx->tx_max_payload_len + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a3ccb3135e51..b481d1add14e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -544,6 +544,28 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, return 0; }
+static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + u16 payload_len = ctx->tx_max_payload_len; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < sizeof(payload_len)) + return -EINVAL; + + if (put_user(sizeof(payload_len), optlen)) + return -EFAULT; + + if (copy_to_user(optval, &payload_len, sizeof(payload_len))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -563,6 +585,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_getsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_MAX_PAYLOAD_LEN: + rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -812,6 +837,30 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, return rc; }
+static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx); + u16 value; + + if (sw_ctx->open_rec) + return -EBUSY; + + if (sockptr_is_null(optval) || optlen != sizeof(value)) + return -EINVAL; + + if (copy_from_sockptr(&value, optval, sizeof(value))) + return -EFAULT; + + if (value < TLS_MIN_RECORD_SIZE_LIM || value > TLS_MAX_PAYLOAD_SIZE) + return -EINVAL; + + ctx->tx_max_payload_len = value; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -833,6 +882,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_setsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_MAX_PAYLOAD_LEN: + lock_sock(sk); + rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen); + release_sock(sk); + break; default: rc = -ENOPROTOOPT; break; @@ -1022,6 +1076,7 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; + ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE; update_sk_prot(sk, ctx); out: write_unlock_bh(&sk->sk_callback_lock); @@ -1111,6 +1166,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) goto nla_failure; }
+ err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN, + ctx->tx_max_payload_len); + + if (err) + goto nla_failure; + rcu_read_unlock(); nla_nest_end(skb, start); return 0; @@ -1132,6 +1193,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin) nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_MAX_PAYLOAD_LEN */ 0;
return size; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index daac9fd4be7e..e76ea38b712a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true;
From: Wilfred Mallawa wilfred.mallawa@wdc.com
Test that outgoing plaintext records respect the tls TLS_TX_MAX_PAYLOAD_LEN set using setsockopt(). The limit is set to be 128, thus, in all received records, the plaintext must not exceed this amount.
Also test that setting a new record size limit whilst a pending open record exists is handled correctly by discarding the request.
Suggested-by: Sabrina Dubroca sd@queasysnail.net Signed-off-by: Wilfred Mallawa wilfred.mallawa@wdc.com --- Changes V4 -> V5: - Cleanup unused variables - Use ASSERT_EQ more aptly
V4: https://lore.kernel.org/netdev/20250923053207.113938-2-wilfred.opensource@gm... --- tools/testing/selftests/net/tls.c | 141 ++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+)
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index e788b84551ca..158dec851176 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2791,6 +2791,147 @@ TEST_F(tls_err, oob_pressure) EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); }
+/* + * Parse a stream of TLS records and ensure that each record respects + * the specified @max_payload_len. + */ +static size_t parse_tls_records(struct __test_metadata *_metadata, + const __u8 *rx_buf, int rx_len, int overhead, + __u16 max_payload_len) +{ + const __u8 *rec = rx_buf; + size_t total_plaintext_rx = 0; + const __u8 rec_header_len = 5; + + while (rec < rx_buf + rx_len) { + __u16 record_payload_len; + __u16 plaintext_len; + + /* Sanity check that it's a TLS header for application data */ + ASSERT_EQ(rec[0], 23); + ASSERT_EQ(rec[1], 0x3); + ASSERT_EQ(rec[2], 0x3); + + memcpy(&record_payload_len, rec + 3, 2); + record_payload_len = ntohs(record_payload_len); + ASSERT_GE(record_payload_len, overhead); + + plaintext_len = record_payload_len - overhead; + total_plaintext_rx += plaintext_len; + + /* Plaintext must not exceed the specified limit */ + ASSERT_LE(plaintext_len, max_payload_len); + rec += rec_header_len + record_payload_len; + } + + return total_plaintext_rx; +} + +TEST(tx_max_payload_len) +{ + struct tls_crypto_info_keys tls12; + int cfd, ret, fd, overhead; + size_t total_plaintext_rx = 0; + __u8 tx[1024], rx[2000]; + __u16 limit = 128; + __u16 opt = 0; + unsigned int optlen = sizeof(opt); + bool notls; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &fd, &cfd, ¬ls); + + if (notls) + exit(KSFT_SKIP); + + /* Don't install keys on fd, we'll parse raw records */ + ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &limit, + sizeof(limit)); + ASSERT_EQ(ret, 0); + + ret = getsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &opt, &optlen); + EXPECT_EQ(ret, 0); + EXPECT_EQ(limit, opt); + EXPECT_EQ(optlen, sizeof(limit)); + + memset(tx, 0, sizeof(tx)); + ASSERT_EQ(send(cfd, tx, sizeof(tx), 0), sizeof(tx)); + close(cfd); + + ret = recv(fd, rx, sizeof(rx), 0); + + /* + * 16B tag + 8B IV -- record header (5B) is not counted but we'll + * need it to walk the record stream + */ + overhead = 16 + 8; + total_plaintext_rx = parse_tls_records(_metadata, rx, ret, overhead, + limit); + + ASSERT_EQ(total_plaintext_rx, sizeof(tx)); + close(fd); +} + +TEST(tx_max_payload_len_open_rec) +{ + struct tls_crypto_info_keys tls12; + int cfd, ret, fd, overhead; + size_t total_plaintext_rx = 0; + __u8 tx[1024], rx[2000]; + __u16 tx_partial = 256; + __u16 og_limit = 512, limit = 128; + bool notls; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &fd, &cfd, ¬ls); + + if (notls) + exit(KSFT_SKIP); + + /* Don't install keys on fd, we'll parse raw records */ + ret = setsockopt(cfd, SOL_TLS, TLS_TX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); + + ret = setsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &og_limit, + sizeof(og_limit)); + ASSERT_EQ(ret, 0); + + memset(tx, 0, sizeof(tx)); + ASSERT_EQ(send(cfd, tx, tx_partial, MSG_MORE), tx_partial); + + /* + * Changing the payload limit with a pending open record should + * not be allowed. + */ + ret = setsockopt(cfd, SOL_TLS, TLS_TX_MAX_PAYLOAD_LEN, &limit, + sizeof(limit)); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBUSY); + + ASSERT_EQ(send(cfd, tx + tx_partial, sizeof(tx) - tx_partial, MSG_EOR), + sizeof(tx) - tx_partial); + close(cfd); + + ret = recv(fd, rx, sizeof(rx), 0); + + /* + * 16B tag + 8B IV -- record header (5B) is not counted but we'll + * need it to walk the record stream + */ + overhead = 16 + 8; + total_plaintext_rx = parse_tls_records(_metadata, rx, ret, overhead, + og_limit); + ASSERT_EQ(total_plaintext_rx, sizeof(tx)); + close(fd); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr;
syzbot ci has tested the following series
[v5] net/tls: support setting the maximum payload size https://lore.kernel.org/all/20251014051825.1084403-2-wilfred.opensource@gmai... * [PATCH net-next v5 1/2] net/tls: support setting the maximum payload size * [PATCH net-next v5 2/2] selftests: tls: add tls record_size_limit test
and found the following issue: general protection fault in tls_setsockopt
Full report is available here: https://ci.syzbot.org/series/210c88bf-945a-460c-9fe3-c55bd1958b5d
***
general protection fault in tls_setsockopt
tree: net-next URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/netdev/net-next.git base: 18a7e218cfcdca6666e1f7356533e4c988780b57 arch: amd64 compiler: Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8 config: https://ci.syzbot.org/builds/ae03b3d4-e809-4b0e-bd4a-76b4ae8e9d74/config C repro: https://ci.syzbot.org/findings/d0b97737-6ea7-4c05-b10b-71dc834c9393/c_repro syz repro: https://ci.syzbot.org/findings/d0b97737-6ea7-4c05-b10b-71dc834c9393/syz_repr...
Oops: general protection fault, probably for non-canonical address 0xdffffc0000000023: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000118-0x000000000000011f] CPU: 1 UID: 0 PID: 5970 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:do_tls_setsockopt_tx_payload_len net/tls/tls_main.c:847 [inline] RIP: 0010:do_tls_setsockopt net/tls/tls_main.c:887 [inline] RIP: 0010:tls_setsockopt+0x189/0x1590 net/tls/tls_main.c:906 Code: ff df 80 3c 08 00 74 08 4c 89 ff e8 51 67 3c f8 41 bc 18 01 00 00 4d 03 27 4c 89 e0 48 c1 e8 03 49 bf 00 00 00 00 00 fc ff df <42> 80 3c 38 00 74 08 4c 89 e7 e8 28 67 3c f8 49 83 3c 24 00 0f 84 RSP: 0018:ffffc90003e57cc0 EFLAGS: 00010202 RAX: 0000000000000023 RBX: ffff88816adfcc00 RCX: dffffc0000000000 RDX: 0000000000000006 RSI: ffffffff8d7e872a RDI: ffffffff8bc074e0 RBP: ffffc90003e57e38 R08: ffffffff8f9e0f77 R09: 1ffffffff1f3c1ee R10: dffffc0000000000 R11: fffffbfff1f3c1ef R12: 0000000000000118 R13: 1ffff920007cafa4 R14: ffff88816b33bde8 R15: dffffc0000000000 FS: 00005555585b2500(0000) GS:ffff8882a9d0f000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000000180 CR3: 000000010739e000 CR4: 00000000000006f0 Call Trace: <TASK> do_sock_setsockopt+0x17c/0x1b0 net/socket.c:2360 __sys_setsockopt net/socket.c:2385 [inline] __do_sys_setsockopt net/socket.c:2391 [inline] __se_sys_setsockopt net/socket.c:2388 [inline] __x64_sys_setsockopt+0x13f/0x1b0 net/socket.c:2388 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f499fb8eec9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffecf309e88 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 00007f499fde5fa0 RCX: 00007f499fb8eec9 RDX: 0000000000000005 RSI: 000000000000011a RDI: 0000000000000003 RBP: 00007f499fc11f91 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f499fde5fa0 R14: 00007f499fde5fa0 R15: 0000000000000005 </TASK> Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:do_tls_setsockopt_tx_payload_len net/tls/tls_main.c:847 [inline] RIP: 0010:do_tls_setsockopt net/tls/tls_main.c:887 [inline] RIP: 0010:tls_setsockopt+0x189/0x1590 net/tls/tls_main.c:906 Code: ff df 80 3c 08 00 74 08 4c 89 ff e8 51 67 3c f8 41 bc 18 01 00 00 4d 03 27 4c 89 e0 48 c1 e8 03 49 bf 00 00 00 00 00 fc ff df <42> 80 3c 38 00 74 08 4c 89 e7 e8 28 67 3c f8 49 83 3c 24 00 0f 84 RSP: 0018:ffffc90003e57cc0 EFLAGS: 00010202 RAX: 0000000000000023 RBX: ffff88816adfcc00 RCX: dffffc0000000000 RDX: 0000000000000006 RSI: ffffffff8d7e872a RDI: ffffffff8bc074e0 RBP: ffffc90003e57e38 R08: ffffffff8f9e0f77 R09: 1ffffffff1f3c1ee R10: dffffc0000000000 R11: fffffbfff1f3c1ef R12: 0000000000000118 R13: 1ffff920007cafa4 R14: ffff88816b33bde8 R15: dffffc0000000000 FS: 00005555585b2500(0000) GS:ffff8882a9d0f000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000000180 CR3: 000000010739e000 CR4: 00000000000006f0 ---------------- Code disassembly (best guess), 1 bytes skipped: 0: df 80 3c 08 00 74 filds 0x7400083c(%rax) 6: 08 4c 89 ff or %cl,-0x1(%rcx,%rcx,4) a: e8 51 67 3c f8 call 0xf83c6760 f: 41 bc 18 01 00 00 mov $0x118,%r12d 15: 4d 03 27 add (%r15),%r12 18: 4c 89 e0 mov %r12,%rax 1b: 48 c1 e8 03 shr $0x3,%rax 1f: 49 bf 00 00 00 00 00 movabs $0xdffffc0000000000,%r15 26: fc ff df * 29: 42 80 3c 38 00 cmpb $0x0,(%rax,%r15,1) <-- trapping instruction 2e: 74 08 je 0x38 30: 4c 89 e7 mov %r12,%rdi 33: e8 28 67 3c f8 call 0xf83c6760 38: 49 83 3c 24 00 cmpq $0x0,(%r12) 3d: 0f .byte 0xf 3e: 84 .byte 0x84
***
If these findings have caused you to resend the series or submit a separate fix, please add the following tag to your commit message: Tested-by: syzbot@syzkaller.appspotmail.com
--- This report is generated by a bot. It may contain errors. syzbot ci engineers can be reached at syzkaller@googlegroups.com.
On Tue, 2025-10-14 at 15:18 +1000, Wilfred Mallawa wrote:
+static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval,
unsigned int optlen)
+{
- struct tls_context *ctx = tls_get_ctx(sk);
- struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx);
- u16 value;
- if (sw_ctx->open_rec)
return -EBUSY;
Looks like syzbot found a bug here, sw_ctx can be NULL. Will fixup for V5.
Wilfred
linux-kselftest-mirror@lists.linaro.org