- Linux-kselftest-mirror - lists.linaro.org

[PATCH v3 4/4] selftests/rseq: Add test for mm_cid compaction

by Gabriele Monaco

A task in the kernel (task_mm_cid_work) runs somewhat periodically to compact the mm_cid for each process. Add a test to validate that it runs correctly and timely. The test spawns 1 thread pinned to each CPU, then each thread, including the main one, runs in short bursts for some time. During this period, the mm_cids should be spanning all numbers between 0 and nproc. At the end of this phase, a thread with high enough mm_cid (>= nproc/2) is selected to be the new leader, all other threads terminate. After some time, the only running thread should see 0 as mm_cid, if that doesn't happen, the compaction mechanism didn't work and the test fails. The test never fails if only 1 core is available, in which case, we cannot test anything as the only available mm_cid is 0. Acked-by: Shuah Khan <skhan(a)linuxfoundation.org> Signed-off-by: Gabriele Monaco <gmonaco(a)redhat.com> --- tools/testing/selftests/rseq/.gitignore | 1 + tools/testing/selftests/rseq/Makefile | 2 +- .../selftests/rseq/mm_cid_compaction_test.c | 204 ++++++++++++++++++ 3 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rseq/mm_cid_compaction_test.c diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 0fda241fa62b..b3920c59bf40 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -3,6 +3,7 @@ basic_percpu_ops_test basic_percpu_ops_mm_cid_test basic_test basic_rseq_op_test +mm_cid_compaction_test param_test param_test_benchmark param_test_compare_twice diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0d0a5fae5954..bc4d940f66d4 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test + syscall_errors_test mm_cid_compaction_test TEST_GEN_PROGS_EXTENDED = librseq.so diff --git a/tools/testing/selftests/rseq/mm_cid_compaction_test.c b/tools/testing/selftests/rseq/mm_cid_compaction_test.c new file mode 100644 index 000000000000..d13623625f5a --- /dev/null +++ b/tools/testing/selftests/rseq/mm_cid_compaction_test.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include <assert.h> +#include <pthread.h> +#include <sched.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stddef.h> + +#include "../kselftest.h" +#include "rseq.h" + +#define VERBOSE 0 +#define printf_verbose(fmt, ...) \ + do { \ + if (VERBOSE) \ + printf(fmt, ##__VA_ARGS__); \ + } while (0) + +/* 50 ms */ +#define RUNNER_PERIOD 50000 +/* + * Number of runs before we terminate or get the token. + * The number is slowly increasing with the number of CPUs as the compaction + * process can take longer on larger systems. This is an arbitrary value. + */ +#define THREAD_RUNS (3 + args->num_cpus/8) + +/* + * Number of times we check that the mm_cid were compacted. + * Checks are repeated every RUNNER_PERIOD. + */ +#define MM_CID_COMPACT_TIMEOUT 10 + +struct thread_args { + int cpu; + int num_cpus; + pthread_mutex_t *token; + pthread_barrier_t *barrier; + pthread_t *tinfo; + struct thread_args *args_head; +}; + +static void __noreturn *thread_runner(void *arg) +{ + struct thread_args *args = arg; + int i, ret, curr_mm_cid; + cpu_set_t cpumask; + + CPU_ZERO(&cpumask); + CPU_SET(args->cpu, &cpumask); + ret = pthread_setaffinity_np(pthread_self(), sizeof(cpumask), &cpumask); + if (ret) { + errno = ret; + perror("Error: failed to set affinity"); + abort(); + } + pthread_barrier_wait(args->barrier); + + for (i = 0; i < THREAD_RUNS; i++) + usleep(RUNNER_PERIOD); + curr_mm_cid = rseq_current_mm_cid(); + /* + * We select one thread with high enough mm_cid to be the new leader. + * All other threads (including the main thread) will terminate. + * After some time, the mm_cid of the only remaining thread should + * converge to 0, if not, the test fails. + */ + if (curr_mm_cid >= args->num_cpus / 2 && + !pthread_mutex_trylock(args->token)) { + printf_verbose( + "cpu%d has mm_cid=%d and will be the new leader.\n", + sched_getcpu(), curr_mm_cid); + for (i = 0; i < args->num_cpus; i++) { + if (args->tinfo[i] == pthread_self()) + continue; + ret = pthread_join(args->tinfo[i], NULL); + if (ret) { + errno = ret; + perror("Error: failed to join thread"); + abort(); + } + } + pthread_barrier_destroy(args->barrier); + free(args->tinfo); + free(args->token); + free(args->barrier); + free(args->args_head); + + for (i = 0; i < MM_CID_COMPACT_TIMEOUT; i++) { + curr_mm_cid = rseq_current_mm_cid(); + printf_verbose("run %d: mm_cid=%d on cpu%d.\n", i, + curr_mm_cid, sched_getcpu()); + if (curr_mm_cid == 0) + exit(EXIT_SUCCESS); + usleep(RUNNER_PERIOD); + } + exit(EXIT_FAILURE); + } + printf_verbose("cpu%d has mm_cid=%d and is going to terminate.\n", + sched_getcpu(), curr_mm_cid); + pthread_exit(NULL); +} + +int test_mm_cid_compaction(void) +{ + cpu_set_t affinity; + int i, j, ret = 0, num_threads; + pthread_t *tinfo; + pthread_mutex_t *token; + pthread_barrier_t *barrier; + struct thread_args *args; + + sched_getaffinity(0, sizeof(affinity), &affinity); + num_threads = CPU_COUNT(&affinity); + tinfo = calloc(num_threads, sizeof(*tinfo)); + if (!tinfo) { + perror("Error: failed to allocate tinfo"); + return -1; + } + args = calloc(num_threads, sizeof(*args)); + if (!args) { + perror("Error: failed to allocate args"); + ret = -1; + goto out_free_tinfo; + } + token = malloc(sizeof(*token)); + if (!token) { + perror("Error: failed to allocate token"); + ret = -1; + goto out_free_args; + } + barrier = malloc(sizeof(*barrier)); + if (!barrier) { + perror("Error: failed to allocate barrier"); + ret = -1; + goto out_free_token; + } + if (num_threads == 1) { + fprintf(stderr, "Cannot test on a single cpu. " + "Skipping mm_cid_compaction test.\n"); + /* only skipping the test, this is not a failure */ + goto out_free_barrier; + } + pthread_mutex_init(token, NULL); + ret = pthread_barrier_init(barrier, NULL, num_threads); + if (ret) { + errno = ret; + perror("Error: failed to initialise barrier"); + goto out_free_barrier; + } + for (i = 0, j = 0; i < CPU_SETSIZE && j < num_threads; i++) { + if (!CPU_ISSET(i, &affinity)) + continue; + args[j].num_cpus = num_threads; + args[j].tinfo = tinfo; + args[j].token = token; + args[j].barrier = barrier; + args[j].cpu = i; + args[j].args_head = args; + if (!j) { + /* The first thread is the main one */ + tinfo[0] = pthread_self(); + ++j; + continue; + } + ret = pthread_create(&tinfo[j], NULL, thread_runner, &args[j]); + if (ret) { + errno = ret; + perror("Error: failed to create thread"); + abort(); + } + ++j; + } + printf_verbose("Started %d threads.\n", num_threads); + + /* Also main thread will terminate if it is not selected as leader */ + thread_runner(&args[0]); + + /* only reached in case of errors */ +out_free_barrier: + free(barrier); +out_free_token: + free(token); +out_free_args: + free(args); +out_free_tinfo: + free(tinfo); + + return ret; +} + +int main(int argc, char **argv) +{ + if (!rseq_mm_cid_available()) { + fprintf(stderr, "Error: rseq_mm_cid unavailable\n"); + return -1; + } + if (test_mm_cid_compaction()) + return -1; + return 0; +} -- 2.51.0

1 month, 1 week

1
0
0 0

next-20250924: Internal error: Oops: mnt_ns_release (fs/namespace.c:148) __arm64_sys_listmount (fs/namespace.c:5936)

by Naresh Kamboju

While running LTP syscalls tests on Linux next-20250924 tag build the following kernel oops noticed on arm64 and x86_64 devices. First seen on next-20250924 Good: next-20250923 Bad: next-2025094 Regression Analysis: - New regression? yes - Reproducibility? yes Test regression: next-20250924: Internal error: Oops: mnt_ns_release (fs/namespace.c:148) __arm64_sys_listmount (fs/namespace.c:5936) Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org> $ git log --oneline next-20250923..next-20250924 -- fs/namespace.c c54644c3221b6 (next/fs-next) Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git 1f28cc19559a8 Merge branch 'namespace-6.18' into vfs.all e2c277f720291 Merge branch 'kernel-6.18.clone3' into vfs.all b2af83d5b8223 Merge branch 'vfs-6.18.mount' into vfs.all 29ecd1ca48ec2 Merge branch 'vfs-6.18.misc' into vfs.all d7610cb7454bb ns: simplify ns_common_init() further 59bfb66816809 listmount: don't call path_put() under namespace semaphore 2bc5bfbfd3f27 statmount: don't call path_put() under namespace semaphore ## Test log [ 41.821877] Internal error: Oops: 0000000096000005 [#1] SMP [ 41.919038] Modules linked in: cdc_ether usbnet sm3_ce sha3_ce nvme xhci_pci_renesas nvme_core arm_cspmu_module arm_spe_pmu ipmi_devintf ipmi_msghandler arm_cmn cppc_cpufreq drm fuse backlight [ 41.944048] CPU: 14 UID: 0 PID: 6416 Comm: listmount04 Not tainted 6.17.0-rc7-next-20250924 #1 PREEMPT [ 41.958197] Hardware name: Inspur NF5280R7/Mitchell MB, BIOS 04.04.00004001 2025-02-04 22:23:30 02/04/2025 [ 41.967837] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) [ 41.974958] pc : mnt_ns_release (arch/arm64/include/asm/atomic_lse.h:62 (discriminator 1) arch/arm64/include/asm/atomic_lse.h:76 (discriminator 1) arch/arm64/include/asm/atomic.h:51 (discriminator 1) include/linux/atomic/atomic-arch-fallback.h:944 (discriminator 1) include/linux/atomic/atomic-instrumented.h:401 (discriminator 1) include/linux/refcount.h:389 (discriminator 1) include/linux/refcount.h:432 (discriminator 1) include/linux/refcount.h:450 (discriminator 1) fs/namespace.c:148 (discriminator 1)) [ 41.981910] lr : __arm64_sys_listmount (fs/namespace.c:5936) [ 41.993467] sp : ffff8000ff5afd50 [ 42.000329] x29: ffff8000ff5afd50 x28: fff00001bd947380 x27: 0000000000000000 [ 42.007454] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000100 [ 42.030726] x23: 0000000000000000 x22: 0000000000000020 x21: ffff8000ff5afdc8 [ 42.038281] x20: 0000aaaabd6a1110 x19: 0000000000000000 x18: 0000000000000000 [ 42.045405] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 [ 42.052528] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 [ 42.075541] x11: 0000000000000000 x10: 0000000000000000 x9 : ffffda68dcdbbe30 [ 42.082835] x8 : ffff8000ff5afda0 x7 : fefefefefefefefe x6 : ffffda68df5e9000 [ 42.096212] x5 : fff00001bd947380 [ 42.108978] x4 : fff00001bd947380 x3 : 0000000000000000 [ 42.114449] x2 : 0000000000000000 x1 : 00000000ffffffff x0 : 00000000000000b8 [ 42.134515] Call trace: [ 42.139725] mnt_ns_release (arch/arm64/include/asm/atomic_lse.h:62 (discriminator 1) arch/arm64/include/asm/atomic_lse.h:76 (discriminator 1) arch/arm64/include/asm/atomic.h:51 (discriminator 1) include/linux/atomic/atomic-arch-fallback.h:944 (discriminator 1) include/linux/atomic/atomic-instrumented.h:401 (discriminator 1) include/linux/refcount.h:389 (discriminator 1) include/linux/refcount.h:432 (discriminator 1) include/linux/refcount.h:450 (discriminator 1) fs/namespace.c:148 (discriminator 1)) (P) [ 42.143811] __arm64_sys_listmount (fs/namespace.c:5936) [ 42.148327] invoke_syscall.constprop.0 (arch/arm64/include/asm/syscall.h:61 arch/arm64/kernel/syscall.c:54) [ 42.159193] do_el0_svc (include/linux/thread_info.h:135 (discriminator 2) arch/arm64/kernel/syscall.c:140 (discriminator 2) arch/arm64/kernel/syscall.c:151 (discriminator 2)) [ 42.163970] el0_svc (arch/arm64/include/asm/irqflags.h:82 (discriminator 1) arch/arm64/include/asm/irqflags.h:123 (discriminator 1) arch/arm64/include/asm/irqflags.h:136 (discriminator 1) arch/arm64/kernel/entry-common.c:102 (discriminator 1) arch/arm64/kernel/entry-common.c:745 (discriminator 1)) [ 42.173791] el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:764) [ 42.185342] el0t_64_sync (arch/arm64/kernel/entry.S:596) [ 42.189165] Code: aa0003f3 9102e000 d503201f 12800001 (b8610001) All code ======== 0: aa0003f3 mov x19, x0 4: 9102e000 add x0, x0, #0xb8 8: d503201f nop c: 12800001 mov w1, #0xffffffff // #-1 10:* b8610001 ldaddl w1, w1, [x0] <-- trapping instruction Code starting with the faulting instruction =========================================== 0: b8610001 ldaddl w1, w1, [x0] [ 42.211485] ---[ end trace 0000000000000000 ]--- ## Source * Kernel version: 6.17.0-rc7 * Git tree: https://kernel.googlesource.com/pub/scm/linux/kernel/git/next/linux-next.git * Git describe: 6.17.0-rc7-next-20250924 * Git commit: b5a4da2c459f79a2c87c867398f1c0c315779781 * Architectures: arm64, x86_64 * Toolchains: gcc-13 * Kconfigs: defconfig+lkftconfig ## Build * Test log arm64: https://qa-reports.linaro.org/api/testruns/30007634/log_file/ * Test log x86_64: https://qa-reports.linaro.org/api/testruns/30000230/log_file/ * Test details: https://regressions.linaro.org/lkft/linux-next-master-ampere/next-20250924/… * Build plan: https://tuxapi.tuxsuite.com/v1/groups/ampere/projects/ci/tests/339teV8pAwrs… * Build link: https://storage.tuxsuite.com/public/ampere/ci/builds/339teBhKZ4DENKbJJNnbWK… * Kernel config: https://storage.tuxsuite.com/public/ampere/ci/builds/339teBhKZ4DENKbJJNnbWK… -- Linaro LKFT

1 month, 1 week

3
2
0 0

[PATCH RESEND v4] selftests/tty: add TIOCSTI test suite

by Abhinav Saxena

TIOCSTI is a TTY ioctl command that allows inserting characters into the terminal input queue, making it appear as if the user typed those characters. This functionality has behavior that varies based on system configuration and process credentials. The dev.tty.legacy_tiocsti sysctl introduced in commit 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled") controls TIOCSTI usage. When disabled, TIOCSTI requires CAP_SYS_ADMIN capability. The current implementation checks the current process's credentials via capable(CAP_SYS_ADMIN), but does not validate against the file opener's credentials stored in file->f_cred. This creates different behavior when file descriptors are passed between processes via SCM_RIGHTS. Add a test suite with 16 test variants using fixture variants to verify TIOCSTI behavior when dev.tty.legacy_tiocsti is enabled/disabled: - Basic TIOCSTI tests (8 variants): Direct testing with different capability and controlling terminal combinations - FD passing tests (8 variants): Test behavior when file descriptors are passed between processes with different capabilities The FD passing tests document this behavior - some tests show different results than expected based on file opener credentials, demonstrating that TIOCSTI uses current process credentials rather than file opener credentials. The tests validate proper enforcement of the legacy_tiocsti sysctl. Test implementation uses openpty(3) with TIOCSCTTY for isolated PTY environments. See tty_ioctl(4) for details on TIOCSTI behavior and security requirements. Signed-off-by: Abhinav Saxena <xandfury(a)gmail.com> --- RESEND: add TTY/serial maintainers and linux-serial CCs. No code changes. - Link to orignal v4: https://lore.kernel.org/r/20250902-toicsti-bug-v4-1-e5c960e0b3d6@gmail.com Changes in v4: - Moved skip conditions and sysctl setup from TEST_F to FIXTURE_SETUP (Kees Cook) - Fixed fclose() error handling in set_legacy_tiocsti_setting (Kees Cook) - Extracted run_basic_tiocsti_test() and run_fdpass_tiocsti_test functions (Kees Cook) - Removed redundant sysctl restore logic from TEST_F (Kees Cook) - Simplified FIXTURE_TEARDOWN (Kees Cook) - Replace drop_to_nobody() to drop_all_privs() which should be more portable (Justin Stitt) - Link to v3: https://lore.kernel.org/r/20250730-toicsti-bug-v3-1-dd2dac97f27a@gmail.com Add selftests for TIOCSTI ioctl To run all tests: $ sudo ./tools/testing/selftests/tty/tty_tiocsti_test Test Results: - PASSED: 13/16 tests - Different behavior: 3/16 tests (documenting credential checking behavior) All tests validated using: - scripts/checkpatch.pl --strict (clean output) - Functional testing on kernel v6.16-rc2 Changes in v3: - Replaced all printf() calls with TH_LOG() for proper test logging (Kees Cook) - Added struct __test_metadata parameter to helper functions - Moved common legacy_tiocsti availability check to FIXTURE_SETUP() - Implemented sysctl modification/restoration in FIXTURE_SETUP/TEARDOWN - Used openpty() with TIOCSCTTY for reliable PTY testing environment - Fixed child/parent synchronization in FD passing tests - Replaced manual _exit(1) handling with proper ASSERT statements - Switched // comments to /* */ format throughout - Expanded to 16 test variants using fixture variants - Enhanced error handling and test reliability - Link to v2: https://lore.kernel.org/r/20250713-toicsti-bug-v2-1-b183787eea29@gmail.com - Link to v1: https://lore.kernel.org/r/20250622-toicsti-bug-v1-0-f374373b04b2@gmail.com References: - tty_ioctl(4) - documents TIOCSTI ioctl and capability requirements - openpty(3) - pseudo-terminal creation and management - commit 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled") - Documentation/security/credentials.rst - https://github.com/KSPP/linux/issues/156 - https://lore.kernel.org/linux-hardening/Y0m9l52AKmw6Yxi1@hostpad/ - drivers/tty/Kconfig - Documentation/driver-api/tty/ --- tools/testing/selftests/tty/Makefile | 6 +- tools/testing/selftests/tty/config | 1 + tools/testing/selftests/tty/tty_tiocsti_test.c | 650 +++++++++++++++++++++++++ 3 files changed, 656 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tty/Makefile b/tools/testing/selftests/tty/Makefile index 50d7027b2ae3fb495dd1c0684363fa8f426be42c..7f6fbe5a0cd5663310e334d9d068b21dab9136ec 100644 --- a/tools/testing/selftests/tty/Makefile +++ b/tools/testing/selftests/tty/Makefile @@ -1,5 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS = -O2 -Wall -TEST_GEN_PROGS := tty_tstamp_update +TEST_GEN_PROGS := tty_tstamp_update tty_tiocsti_test +LDLIBS += -lcap include ../lib.mk + +# Add libcap for TIOCSTI test +$(OUTPUT)/tty_tiocsti_test: LDLIBS += -lcap diff --git a/tools/testing/selftests/tty/config b/tools/testing/selftests/tty/config new file mode 100644 index 0000000000000000000000000000000000000000..c6373aba66366c82435bb26c019eb360eb6310eb --- /dev/null +++ b/tools/testing/selftests/tty/config @@ -0,0 +1 @@ +CONFIG_LEGACY_TIOCSTI=y diff --git a/tools/testing/selftests/tty/tty_tiocsti_test.c b/tools/testing/selftests/tty/tty_tiocsti_test.c new file mode 100644 index 0000000000000000000000000000000000000000..5e767e6cb3ef8f05c5430eb0fcc792064c446c03 --- /dev/null +++ b/tools/testing/selftests/tty/tty_tiocsti_test.c @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * TTY Tests - TIOCSTI + * + * Copyright © 2025 Abhinav Saxena <xandfury(a)gmail.com> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <errno.h> +#include <stdbool.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <pwd.h> +#include <termios.h> +#include <grp.h> +#include <sys/capability.h> +#include <sys/prctl.h> +#include <pty.h> +#include <utmp.h> + +#include "../kselftest_harness.h" + +enum test_type { + TEST_PTY_TIOCSTI_BASIC, + TEST_PTY_TIOCSTI_FD_PASSING, + /* other tests cases such as serial may be added. */ +}; + +/* + * Test Strategy: + * - Basic tests: Use PTY with/without TIOCSCTTY (controlling terminal for + * current process) + * - FD passing tests: Child creates PTY, parent receives FD (demonstrates + * security issue) + * + * SECURITY VULNERABILITY DEMONSTRATION: + * FD passing tests show that TIOCSTI uses CURRENT process credentials, not + * opener credentials. This means privileged processes can be given FDs from + * unprivileged processes and successfully perform TIOCSTI operations that the + * unprivileged process couldn't do directly. + * + * Attack scenario: + * 1. Unprivileged process opens TTY (direct TIOCSTI fails due to lack of + * privileges) + * 2. Unprivileged process passes FD to privileged process via SCM_RIGHTS + * 3. Privileged process can use TIOCSTI on the FD (succeeds due to its + * privileges) + * 4. Result: Effective privilege escalation via file descriptor passing + * + * This matches the kernel logic in tiocsti(): + * 1. if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; + * 2. if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + * return -EPERM; + * Note: Both checks use capable() on CURRENT process, not FD opener! + * + * If the file credentials were also checked along with the capable() checks + * then the results for FD pass tests would be consistent with the basic tests. + */ + +FIXTURE(tiocsti) +{ + int pty_master_fd; /* PTY - for basic tests */ + int pty_slave_fd; + bool has_pty; + bool initial_cap_sys_admin; + int original_legacy_tiocsti_setting; + bool can_modify_sysctl; +}; + +FIXTURE_VARIANT(tiocsti) +{ + const enum test_type test_type; + const bool controlling_tty; /* true=current->signal->tty == tty */ + const int legacy_tiocsti; /* 0=restricted, 1=permissive */ + const bool requires_cap; /* true=with CAP_SYS_ADMIN, false=without */ + const int expected_success; /* 0=success, -EIO/-EPERM=specific error */ +}; + +/* + * Tests Controlling Terminal Variants (current->signal->tty == tty) + * + * TIOCSTI Test Matrix: + * + * | legacy_tiocsti | CAP_SYS_ADMIN | Expected Result | Error | + * |----------------|---------------|-----------------|-------| + * | 1 (permissive) | true | SUCCESS | - | + * | 1 (permissive) | false | SUCCESS | - | + * | 0 (restricted) | true | SUCCESS | - | + * | 0 (restricted) | false | FAILURE | -EIO | + */ + +/* clang-format off */ +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, /* FAILURE: legacy restriction */ +}; /* clang-format on */ + +/* + * Note for FD Passing Test Variants + * Since we're testing the scenario where an unprivileged process pass an FD + * to a privileged one, .requires_cap here means the caps of the child process. + * Not the parent; parent would always be privileged. + */ + +/* clang-format off */ +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, +}; /* clang-format on */ + +/* + * Non-Controlling Terminal Variants (current->signal->tty != tty) + * + * TIOCSTI Test Matrix: + * + * | legacy_tiocsti | CAP_SYS_ADMIN | Expected Result | Error | + * |----------------|---------------|-----------------|-------| + * | 1 (permissive) | true | SUCCESS | - | + * | 1 (permissive) | false | FAILURE | -EPERM| + * | 0 (restricted) | true | SUCCESS | - | + * | 0 (restricted) | false | FAILURE | -EIO | + */ + +/* clang-format off */ +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = -EPERM, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = -EPERM, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, +}; /* clang-format on */ + +/* Helper function to send FD via SCM_RIGHTS */ +static int send_fd_via_socket(int socket_fd, int fd_to_send) +{ + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + char dummy_data = 'F'; + struct iovec iov = { .iov_base = &dummy_data, .iov_len = 1 }; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int)); + + return sendmsg(socket_fd, &msg, 0) < 0 ? -1 : 0; +} + +/* Helper function to receive FD via SCM_RIGHTS */ +static int recv_fd_via_socket(int socket_fd) +{ + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + char dummy_data; + struct iovec iov = { .iov_base = &dummy_data, .iov_len = 1 }; + int received_fd = -1; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + if (recvmsg(socket_fd, &msg, 0) < 0) + return -1; + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + break; + } + } + + return received_fd; +} + +static inline bool has_cap_sys_admin(void) +{ + cap_t caps = cap_get_proc(); + + if (!caps) + return false; + + cap_flag_value_t cap_val; + bool has_cap = (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, + &cap_val) == 0) && + (cap_val == CAP_SET); + + cap_free(caps); + return has_cap; +} + +/* + * Switch to non-root user and clear all capabilities + */ +static inline bool drop_all_privs(struct __test_metadata *_metadata) +{ + /* Drop supplementary groups */ + ASSERT_EQ(setgroups(0, NULL), 0); + + /* Switch to non-root user */ + ASSERT_EQ(setgid(1000), 0); + ASSERT_EQ(setuid(1000), 0); + + /* Clear all capabilities */ + cap_t empty = cap_init(); + + ASSERT_NE(empty, NULL); + ASSERT_EQ(cap_set_proc(empty), 0); + cap_free(empty); + + /* Prevent privilege regain */ + ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0); + + /* Verify privilege drop */ + ASSERT_FALSE(has_cap_sys_admin()); + return true; +} + +static inline int get_legacy_tiocsti_setting(struct __test_metadata *_metadata) +{ + FILE *fp; + int value = -1; + + fp = fopen("/proc/sys/dev/tty/legacy_tiocsti", "r"); + if (!fp) { + /* legacy_tiocsti sysctl not available (kernel < 6.2) */ + return -1; + } + + if (fscanf(fp, "%d", &value) == 1 && fclose(fp) == 0) { + if (value < 0 || value > 1) + value = -1; /* Invalid value */ + } else { + value = -1; /* Failed to parse */ + } + + return value; +} + +static inline bool set_legacy_tiocsti_setting(struct __test_metadata *_metadata, + int value) +{ + FILE *fp; + bool success = false; + + /* Sanity-check the value */ + ASSERT_GE(value, 0); + ASSERT_LE(value, 1); + + /* + * Try to open for writing; if we lack permission, return false so + * the test harness will skip variants that need to change it + */ + fp = fopen("/proc/sys/dev/tty/legacy_tiocsti", "w"); + if (!fp) + return false; + + /* Write the new setting */ + if (fprintf(fp, "%d\n", value) > 0 && fclose(fp) == 0) + success = true; + else + TH_LOG("Failed to write legacy_tiocsti: %s", strerror(errno)); + + return success; +} + +/* + * TIOCSTI injection test function + * @tty_fd: TTY slave file descriptor to test TIOCSTI on + * Returns: 0 on success, -errno on failure + */ +static inline int test_tiocsti_injection(struct __test_metadata *_metadata, + int tty_fd) +{ + int ret; + char inject_char = 'V'; + + errno = 0; + ret = ioctl(tty_fd, TIOCSTI, &inject_char); + return ret == 0 ? 0 : -errno; +} + +/* + * Child process: test TIOCSTI directly with capability/controlling + * terminal setup + */ +static void run_basic_tiocsti_test(struct __test_metadata *_metadata, + FIXTURE_DATA(tiocsti) * self, + const FIXTURE_VARIANT(tiocsti) * variant) +{ + /* Handle capability requirements */ + if (self->initial_cap_sys_admin && !variant->requires_cap) + ASSERT_TRUE(drop_all_privs(_metadata)); + + if (variant->controlling_tty) { + /* + * Create new session and set PTY as + * controlling terminal + */ + pid_t sid = setsid(); + + ASSERT_GE(sid, 0); + ASSERT_EQ(ioctl(self->pty_slave_fd, TIOCSCTTY, 0), 0); + } + + /* + * Validate test environment setup and verify final + * capability state matches expectation + * after potential drop. + */ + ASSERT_TRUE(self->has_pty); + ASSERT_EQ(has_cap_sys_admin(), variant->requires_cap); + + /* Test TIOCSTI and validate result */ + int result = test_tiocsti_injection(_metadata, self->pty_slave_fd); + + /* Check against expected result from variant */ + EXPECT_EQ(result, variant->expected_success); + _exit(0); +} + +/* + * Child process: create PTY and then pass FD to parent via SCM_RIGHTS + */ +static void run_fdpass_tiocsti_test(struct __test_metadata *_metadata, + const FIXTURE_VARIANT(tiocsti) * variant, + int sockfd) +{ + signal(SIGHUP, SIG_IGN); + + /* Handle privilege dropping */ + if (!variant->requires_cap && has_cap_sys_admin()) + ASSERT_TRUE(drop_all_privs(_metadata)); + + /* Create child's PTY */ + int child_master_fd, child_slave_fd; + + ASSERT_EQ(openpty(&child_master_fd, &child_slave_fd, NULL, NULL, NULL), + 0); + + if (variant->controlling_tty) { + pid_t sid = setsid(); + + ASSERT_GE(sid, 0); + ASSERT_EQ(ioctl(child_slave_fd, TIOCSCTTY, 0), 0); + } + + /* Test child's direct TIOCSTI for reference */ + int direct_result = test_tiocsti_injection(_metadata, child_slave_fd); + + EXPECT_EQ(direct_result, variant->expected_success); + + /* Send FD to parent */ + ASSERT_EQ(send_fd_via_socket(sockfd, child_slave_fd), 0); + + /* Wait for parent completion signal */ + char sync_byte; + ssize_t bytes_read = read(sockfd, &sync_byte, 1); + + ASSERT_EQ(bytes_read, 1); + + close(child_master_fd); + close(child_slave_fd); + close(sockfd); + _exit(0); +} + +FIXTURE_SETUP(tiocsti) +{ + /* Create PTY pair for basic tests */ + self->has_pty = (openpty(&self->pty_master_fd, &self->pty_slave_fd, + NULL, NULL, NULL) == 0); + if (!self->has_pty) { + self->pty_master_fd = -1; + self->pty_slave_fd = -1; + } + + self->initial_cap_sys_admin = has_cap_sys_admin(); + self->original_legacy_tiocsti_setting = + get_legacy_tiocsti_setting(_metadata); + + if (self->original_legacy_tiocsti_setting < 0) + SKIP(return, + "legacy_tiocsti sysctl not available (kernel < 6.2)"); + + /* Common skip conditions */ + if (variant->test_type == TEST_PTY_TIOCSTI_BASIC && !self->has_pty) + SKIP(return, "PTY not available for controlling terminal test"); + + if (variant->test_type == TEST_PTY_TIOCSTI_FD_PASSING && + !self->initial_cap_sys_admin) + SKIP(return, "FD Pass tests require CAP_SYS_ADMIN"); + + if (variant->requires_cap && !self->initial_cap_sys_admin) + SKIP(return, "Test requires initial CAP_SYS_ADMIN"); + + /* Test if we can modify the sysctl (requires appropriate privileges) */ + self->can_modify_sysctl = set_legacy_tiocsti_setting( + _metadata, self->original_legacy_tiocsti_setting); + + /* Sysctl setup based on variant */ + if (self->can_modify_sysctl && + self->original_legacy_tiocsti_setting != variant->legacy_tiocsti) { + if (!set_legacy_tiocsti_setting(_metadata, + variant->legacy_tiocsti)) + SKIP(return, "Failed to set legacy_tiocsti sysctl"); + + } else if (!self->can_modify_sysctl && + self->original_legacy_tiocsti_setting != + variant->legacy_tiocsti) + SKIP(return, "legacy_tiocsti setting mismatch"); +} + +FIXTURE_TEARDOWN(tiocsti) +{ + /* + * Backup restoration - + * each test should restore its own sysctl changes + */ + if (self->can_modify_sysctl) { + int current_value = get_legacy_tiocsti_setting(_metadata); + + if (current_value != self->original_legacy_tiocsti_setting) { + TH_LOG("Backup: Restoring legacy_tiocsti from %d to %d", + current_value, + self->original_legacy_tiocsti_setting); + set_legacy_tiocsti_setting( + _metadata, + self->original_legacy_tiocsti_setting); + } + } + + if (self->has_pty) { + if (self->pty_master_fd >= 0) + close(self->pty_master_fd); + if (self->pty_slave_fd >= 0) + close(self->pty_slave_fd); + } +} + +TEST_F(tiocsti, test) +{ + int status; + pid_t child_pid; + + if (variant->test_type == TEST_PTY_TIOCSTI_BASIC) { + /* ===== BASIC TIOCSTI TEST ===== */ + child_pid = fork(); + ASSERT_GE(child_pid, 0); + + /* Perform the actual test in the child process */ + if (child_pid == 0) + run_basic_tiocsti_test(_metadata, self, variant); + + } else { + /* ===== FD PASSING SECURITY TEST ===== */ + int sockpair[2]; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sockpair), 0); + + child_pid = fork(); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child process - create PTY and send FD */ + close(sockpair[0]); + run_fdpass_tiocsti_test(_metadata, variant, + sockpair[1]); + } + + /* Parent process - receive FD and test TIOCSTI */ + close(sockpair[1]); + + int received_fd = recv_fd_via_socket(sockpair[0]); + + ASSERT_GE(received_fd, 0); + + bool parent_has_cap = self->initial_cap_sys_admin; + + TH_LOG("=== TIOCSTI FD Passing Test Context ==="); + TH_LOG("legacy_tiocsti: %d, Parent CAP_SYS_ADMIN: %s, Child: %s", + variant->legacy_tiocsti, parent_has_cap ? "yes" : "no", + variant->requires_cap ? "kept" : "dropped"); + + /* SECURITY TEST: Try TIOCSTI with FD opened by child */ + int result = test_tiocsti_injection(_metadata, received_fd); + + /* Log security concern if demonstrated */ + if (result == 0 && !variant->requires_cap) { + TH_LOG("*** SECURITY CONCERN DEMONSTRATED ***"); + TH_LOG("Privileged parent can use TIOCSTI on FD from unprivileged child"); + TH_LOG("This shows current process credentials are used, not opener credentials"); + } + + EXPECT_EQ(result, variant->expected_success) + { + TH_LOG("FD passing: expected error %d, got %d", + variant->expected_success, result); + } + + /* Signal child completion */ + char sync_byte = 'D'; + ssize_t bytes_written = write(sockpair[0], &sync_byte, 1); + + ASSERT_EQ(bytes_written, 1); + + close(received_fd); + close(sockpair[0]); + } + + /* Common child process cleanup for both test types */ + ASSERT_EQ(waitpid(child_pid, &status, 0), child_pid); + + if (WIFSIGNALED(status)) { + TH_LOG("Child terminated by signal %d", WTERMSIG(status)); + ASSERT_FALSE(WIFSIGNALED(status)) + { + TH_LOG("Child process failed assertion"); + } + } else { + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_HARNESS_MAIN --- base-commit: e6b9dce0aeeb91dfc0974ab87f02454e24566182 change-id: 20250618-toicsti-bug-7822b8e94a32 Best regards, -- Abhinav Saxena <xandfury(a)gmail.com>

1 month, 1 week

1
1
0 0

[PATCH RFC 0/4] landlock: add LANDLOCK_SCOPE_MEMFD_EXEC execution

by Abhinav Saxena

This patch series introduces LANDLOCK_SCOPE_MEMFD_EXEC, a new Landlock scoping mechanism that restricts execution of anonymous memory file descriptors (memfd) created via memfd_create(2). This addresses security gaps where processes can bypass W^X policies and execute arbitrary code through anonymous memory objects. Fixes: https://github.com/landlock-lsm/linux/issues/37 SECURITY PROBLEM ================ Current Landlock filesystem restrictions do not cover memfd objects, allowing processes to: 1. Read-to-execute bypass: Create writable memfd, inject code, then execute via mmap(PROT_EXEC) or direct execve() 2. Anonymous execution: Execute code without touching the filesystem via execve("/proc/self/fd/N") where N is a memfd descriptor 3. Cross-domain access violations: Pass memfd between processes to bypass domain restrictions These scenarios can occur in sandboxed environments where filesystem access is restricted but memfd creation remains possible. IMPLEMENTATION ============== The implementation adds hierarchical execution control through domain scoping: Core Components: - is_memfd_file(): Reliable memfd detection via "memfd:" dentry prefix - domain_is_scoped(): Cross-domain hierarchy checking (moved to domain.c) - LSM hooks: mmap_file, file_mprotect, bprm_creds_for_exec - Creation-time restrictions: hook_file_alloc_security Security Matrix: Execution decisions follow domain hierarchy rules preventing both same-domain bypass attempts and cross-domain access violations while preserving legitimate hierarchical access patterns. Domain Hierarchy with LANDLOCK_SCOPE_MEMFD_EXEC: =============================================== Root (no domain) - No restrictions | +-- Domain A [SCOPE_MEMFD_EXEC] Layer 1 | +-- memfd_A (tagged with Domain A as creator) | | | +-- Domain A1 (child) [NO SCOPE] Layer 2 | | +-- Inherits Layer 1 restrictions from parent | | +-- memfd_A1 (can create, inherits restrictions) | | +-- Domain A1a [SCOPE_MEMFD_EXEC] Layer 3 | | +-- memfd_A1a (tagged with Domain A1a) | | | +-- Domain A2 (child) [SCOPE_MEMFD_EXEC] Layer 2 | +-- memfd_A2 (tagged with Domain A2 as creator) | +-- CANNOT access memfd_A1 (different subtree) | +-- Domain B [SCOPE_MEMFD_EXEC] Layer 1 +-- memfd_B (tagged with Domain B as creator) +-- CANNOT access ANY memfd from Domain A subtree Execution Decision Matrix: ======================== Executor-> | A | A1 | A1a | A2 | B | Root Creator | | | | | | ------------|-----|----|-----|----|----|----- Domain A | X | X | X | X | X | Y Domain A1 | Y | X | X | X | X | Y Domain A1a | Y | Y | X | X | X | Y Domain A2 | Y | X | X | X | X | Y Domain B | X | X | X | X | X | Y Root | Y | Y | Y | Y | Y | Y Legend: Y = Execution allowed, X = Execution denied Scenarios Covered: - Direct mmap(PROT_EXEC) on memfd files - Two-stage mmap(PROT_READ) + mprotect(PROT_EXEC) bypass attempts - execve("/proc/self/fd/N") anonymous execution - execveat() and fexecve() file descriptor execution - Cross-process memfd inheritance and IPC passing TESTING ======= All patches have been validated with: - scripts/checkpatch.pl --strict (clean) - Selftests covering same-domain restrictions, cross-domain hierarchy enforcement, and regular file isolation - KUnit tests for memfd detection edge cases DISCLAIMER ========== My understanding of Landlock scoping semantics may be limited, but this implementation reflects my current understanding based on available documentation and code analysis. I welcome feedback and corrections regarding the scoping logic and domain hierarchy enforcement. Signed-off-by: Abhinav Saxena <xandfury(a)gmail.com> --- Abhinav Saxena (4): landlock: add LANDLOCK_SCOPE_MEMFD_EXEC scope landlock: implement memfd detection landlock: add memfd exec LSM hooks and scoping selftests/landlock: add memfd execution tests include/uapi/linux/landlock.h | 5 + security/landlock/.kunitconfig | 1 + security/landlock/audit.c | 4 + security/landlock/audit.h | 1 + security/landlock/cred.c | 14 - security/landlock/domain.c | 67 ++++ security/landlock/domain.h | 4 + security/landlock/fs.c | 405 ++++++++++++++++++++- security/landlock/limits.h | 2 +- security/landlock/task.c | 67 ---- .../selftests/landlock/scoped_memfd_exec_test.c | 325 +++++++++++++++++ 11 files changed, 812 insertions(+), 83 deletions(-) --- base-commit: 5b74b2eff1eeefe43584e5b7b348c8cd3b723d38 change-id: 20250716-memfd-exec-ac0d582018c3 Best regards, -- Abhinav Saxena <xandfury(a)gmail.com>

1 month, 1 week

3
10
0 0

[PATCH v4 1/2] net/tls: support maximum record size limit

by Wilfred Mallawa

From: Wilfred Mallawa <wilfred.mallawa(a)wdc.com> During a handshake, an endpoint may specify a maximum record size limit. Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the maximum record size. Meaning that, the outgoing records from the kernel can exceed a lower size negotiated during the handshake. In such a case, the TLS endpoint must send a fatal "record_overflow" alert [1], and thus the record is discarded. Upcoming Western Digital NVMe-TCP hardware controllers implement TLS support. For these devices, supporting TLS record size negotiation is necessary because the maximum TLS record size supported by the controller is less than the default 16KB currently used by the kernel. This patch adds support for retrieving the negotiated record size limit during a handshake, and enforcing it at the TLS layer such that outgoing records are no larger than the size negotiated. This patch depends on the respective userspace support in tlshd and GnuTLS [2]. [1] https://www.rfc-editor.org/rfc/rfc8449 [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005 Signed-off-by: Wilfred Mallawa <wilfred.mallawa(a)wdc.com> --- Changes V3 -> V4: * Added record_size_limit RFC reference to documentation * Always export the record size limit in tls_get_info() * Disallow user space to change the record_size_limit from under us if an open record is pending. * Added record_size_limit minimum size check as per RFC * Allow space for the ContentType byte for TLS 1.3. The expected behaviour is that userspace directly uses the negotiated record_size_limit, kernel will limit the plaintext buffer size appropirately. * New patch to add self-tests. --- Documentation/networking/tls.rst | 12 +++++ include/net/tls.h | 5 +++ include/uapi/linux/tls.h | 2 + net/tls/tls_device.c | 2 +- net/tls/tls_main.c | 75 ++++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 2 +- 6 files changed, 96 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index 36cc7afc2527..d24bf8911bb8 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -280,6 +280,18 @@ If the record decrypted turns out to had been padded or is not a data record it will be decrypted again into a kernel buffer without zero copy. Such events are counted in the ``TlsDecryptRetry`` statistic. +TLS_TX_RECORD_SIZE_LIM +~~~~~~~~~~~~~~~~~~~~~~ + +Sets the maximum size for the plaintext of a protected record. + +The provided value should correspond to the limit negotiated during the TLS +handshake via the `record_size_limit` extension (RFC 8449)[1]. When this +option is set, the kernel enforces this limit on all transmitted TLS records, +ensuring no plaintext fragment exceeds the specified size. + +[1] https://datatracker.ietf.org/doc/html/rfc8449 + Statistics ========== diff --git a/include/net/tls.h b/include/net/tls.h index 857340338b69..32f053770ec4 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -53,6 +53,8 @@ struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) +/* Minimum record size limit as per RFC8449 */ +#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE @@ -226,6 +228,9 @@ struct tls_context { u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; + u16 tx_record_size_limit; /* Max plaintext fragment size. For TLS 1.3, + * this excludes the ContentType. + */ int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index b66a800389cc..3add266d5916 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -41,6 +41,7 @@ #define TLS_RX 2 /* Set receive parameters */ #define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */ #define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */ +#define TLS_TX_RECORD_SIZE_LIM 5 /* Maximum record size */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -194,6 +195,7 @@ enum { TLS_INFO_RXCONF, TLS_INFO_ZC_RO_TX, TLS_INFO_RX_NO_PAD, + TLS_INFO_TX_RECORD_SIZE_LIM, __TLS_INFO_MAX, }; #define TLS_INFO_MAX (__TLS_INFO_MAX - 1) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f672a62a9a52..bf16ceb41dde 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -459,7 +459,7 @@ static int tls_push_data(struct sock *sk, /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ - max_open_record_len = TLS_MAX_PAYLOAD_SIZE + + max_open_record_len = tls_ctx->tx_record_size_limit + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a3ccb3135e51..09883d9c6c96 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -544,6 +544,31 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, return 0; } +static int do_tls_getsockopt_tx_record_size(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + int len; + /* TLS 1.3: Record length contains ContentType */ + u16 record_size_limit = ctx->prot_info.version == TLS_1_3_VERSION ? + ctx->tx_record_size_limit + 1 : + ctx->tx_record_size_limit; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < sizeof(record_size_limit)) + return -EINVAL; + + if (put_user(sizeof(record_size_limit), optlen)) + return -EFAULT; + + if (copy_to_user(optval, &record_size_limit, sizeof(record_size_limit))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -563,6 +588,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_getsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_RECORD_SIZE_LIM: + rc = do_tls_getsockopt_tx_record_size(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -812,6 +840,43 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, return rc; } +static int do_tls_setsockopt_tx_record_size(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx); + u16 value; + + if (sw_ctx->open_rec) + return -EBUSY; + + if (sockptr_is_null(optval) || optlen != sizeof(value)) + return -EINVAL; + + if (copy_from_sockptr(&value, optval, sizeof(value))) + return -EFAULT; + + if (value < TLS_MIN_RECORD_SIZE_LIM) + return -EINVAL; + + if (ctx->prot_info.version == TLS_1_2_VERSION && + value > TLS_MAX_PAYLOAD_SIZE) + return -EINVAL; + + if (ctx->prot_info.version == TLS_1_3_VERSION && + value - 1 > TLS_MAX_PAYLOAD_SIZE) + return -EINVAL; + + /* + * For TLS 1.3: 'value' includes one byte for the appended ContentType. + * Adjust the kernel's internal plaintext limit accordingly. + */ + ctx->tx_record_size_limit = ctx->prot_info.version == TLS_1_3_VERSION ? + value - 1 : value; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -833,6 +898,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_setsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_RECORD_SIZE_LIM: + rc = do_tls_setsockopt_tx_record_size(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -1022,6 +1090,7 @@ static int tls_init(struct sock *sk) ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; + ctx->tx_record_size_limit = TLS_MAX_PAYLOAD_SIZE; update_sk_prot(sk, ctx); out: write_unlock_bh(&sk->sk_callback_lock); @@ -1111,6 +1180,11 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) goto nla_failure; } + err = nla_put_u16(skb, TLS_INFO_TX_RECORD_SIZE_LIM, + ctx->tx_record_size_limit); + if (err) + goto nla_failure; + rcu_read_unlock(); nla_nest_end(skb, start); return 0; @@ -1132,6 +1206,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin) nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_RECORD_SIZE_LIM */ 0; return size; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bac65d0d4e3e..28fb796573d1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + record_room = tls_ctx->tx_record_size_limit - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; -- 2.51.0

1 month, 1 week

4
10
0 0

[PATCH v2 2/2] ipv6: Check AF_UNSPEC in ip6_route_multipath_add()

by Maksimilijan Marosevic

This check was removed in commit e6f497955fb6 ("ipv6: Check GATEWAY in rtm_to_fib6_multipath_config().") as part of rt6_qualify_for ecmp(). The author correctly recognises that rt6_qualify_for_ecmp() returns false if fb_nh_gw_family is set to AF_UNSPEC, but then mistakes AF_UNSPEC for AF_INET6 when reasoning that the check is unnecessary. This means certain malformed entries don't get caught in ip6_route_multipath_add(). This patch reintroduces the AF_UNSPEC check while respecting changes of the initial patch. Reported-by: syzbot+a259a17220263c2d73fc(a)syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a259a17220263c2d73fc Fixes: e6f497955fb6 ("ipv6: Check GATEWAY in rtm_to_fib6_multipath_config().") Signed-off-by: Maksimilijan Marosevic <maksimilijan.marosevic(a)proton.me> --- net/ipv6/route.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aee6a10b112a..884bae3fb1b1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5454,6 +5454,14 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + if (rt->fib6_nh->fib_nh_gw_family == AF_UNSPEC) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + fib6_info_release(rt); + goto cleanup; + } + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); -- 2.43.0

1 month, 1 week

1
0
0 0

Hello from France.

by David Santamaria

The Alphabetical layout was... auto-correct. Friendly: Neo (Jesus-Christ a.k.a. King David again). - Miracles proving that I was Jesus, and King David, are in my Facebook. I proposed to explain the Holy-Trinity live on TV. It is since year 2000 that I must be in the news. I do not declare be Jew, but get persecuted anyway by antisemites (antichristians?). Writing still does not pay my bills at all, keeps me almost homeless. While my intellectual property is stolen, my names removed from many Masterpieces. Facebook https://www.facebook.com/profile.php?id=100057121342964 Paypal https://www.paypal.com/paypalme/meDavidSantamaria Email/Teams DavidSantamaria(a)hotmail.fr<mailto:DavidSantamaria@hotmail.fr> RCS/WhatsApp +33 7 67 99 32 37 $ADA addr1qx8chpwdeqv77duf2eutrtgvd5967l4w87fy54fx0022gr8p80z2mq7cmunmrdvy8yn3pzfzpm46zyfjp8usl36vpw2q509hrd $BTC 3FdwVoDzJoUzceogUwEmVu9YxoFvX6c2Rk $WAVES 3PEAeFkwqVsgAiyad8uVQzQxiGDyJNnCCn5 Sent from my Public Address.

1 month, 1 week

1
0
0 0

[PATCH 1/2] selftests/acct: add cleanup for leftover process_log binary

by Madhur Kumar

Some kselftests generate temporary binaries that are not tracked by TEST_GEN_PROGS. Add EXTRA_CLEAN entry to remove process_log during `make kselftest-clean`. Signed-off-by: Madhur Kumar <madhurkumar004(a)gmail.com> --- tools/testing/selftests/acct/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/acct/Makefile b/tools/testing/selftests/acct/Makefile index 7e025099cf65..16eb97079e63 100644 --- a/tools/testing/selftests/acct/Makefile +++ b/tools/testing/selftests/acct/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := acct_syscall CFLAGS += -Wall +EXTRA_CLEAN := process_log -include ../lib.mk \ No newline at end of file +include ../lib.mk -- 2.51.0

1 month, 1 week

2
8
0 0

[PATCH net-next 00/13] selftests: Mark auto-deferring functions clearly

by Petr Machata

selftests/net/lib.sh contains a suite of iproute2 wrappers that automatically schedule the corresponding cleanup through defer. The fact they do so is however not immediately obvious, one needs to know which functions are handling the deferral behind the scenes, and which expect the caller to handle cleanups themselves. A convention for these auto-deferring functions would help both writing and patch review. This patchset does so by marking these functions with an adf_ prefix. We already have a few such functions: forwarding/lib.sh has adf_mcd_start() and a few selftests add private helpers that conform to this convention. Patches #1 to #8 gradually convert individual functions, one per patch. Patch #9 renames an auto-deferring private helpers named dfr_* to adf_*. The plan is not to retro-rename all private helpers, but I happened to know about this one. Patches #10 to #12 introduce several autodefer helpers for commonly used forwarding/lib.sh functions, and opportunistically convert straightforward instances of 'action; defer counteraction' to the new helpers. Patch #13 adds some README verbiage to pitch defer and the adf_* convention. Petr Machata (13): selftests: net: lib: Rename ip_link_add() to adf_* selftests: net: lib: Rename ip_link_set_master() to adf_* selftests: net: lib: Rename ip_link_set_addr() to adf_* selftests: net: lib: Rename ip_link_set_up() to adf_* selftests: net: lib: Rename ip_link_set_down() to adf_* selftests: net: lib: Rename ip_addr_add() to adf_* selftests: net: lib: Rename ip_route_add() to adf_* selftests: net: lib: Rename bridge_vlan_add() to adf_* selftests: net: vlan_bridge_binding: Rename dfr_set_binding_*() to adf_* selftests: forwarding: lib: Add an autodefer variant of vrf_prepare() selftests: forwarding: lib: Add an autodefer variant of simple_if_init() selftests: forwarding: lib: Add an autodefer variant of forwarding_enable() selftests: forwarding: README: Mention defer, adf_ .../drivers/net/mlxsw/devlink_trap_policer.sh | 9 +- .../drivers/net/mlxsw/qos_ets_strict.sh | 12 +- .../drivers/net/mlxsw/qos_max_descriptors.sh | 9 +- .../drivers/net/mlxsw/qos_mc_aware.sh | 12 +- .../drivers/net/mlxsw/sch_red_core.sh | 6 +- tools/testing/selftests/net/fdb_notify.sh | 26 ++-- tools/testing/selftests/net/forwarding/README | 15 ++ .../net/forwarding/bridge_activity_notify.sh | 21 ++- .../net/forwarding/bridge_fdb_local_vlan_0.sh | 65 ++++---- tools/testing/selftests/net/forwarding/lib.sh | 18 +++ .../selftests/net/forwarding/sch_ets_core.sh | 9 +- .../selftests/net/forwarding/sch_red.sh | 12 +- .../selftests/net/forwarding/sch_tbf_core.sh | 6 +- .../net/forwarding/vxlan_bridge_1q_mc_ul.sh | 141 +++++++++--------- .../net/forwarding/vxlan_reserved.sh | 33 ++-- tools/testing/selftests/net/lib.sh | 16 +- .../net/test_vxlan_fdb_changelink.sh | 8 +- .../selftests/net/vlan_bridge_binding.sh | 44 +++--- 18 files changed, 225 insertions(+), 237 deletions(-) -- 2.49.0

1 month, 1 week

3
27
0 0

[PATCH net-next 00/15] mptcp: pm: special case for c-flag + luminar endp

by Matthieu Baerts (NGI0)

Here are some patches for the MPTCP PM, including some refactoring that I thought it would be best to send at the end of a cycle to avoid conflicts between net and net-next that could last a few weeks. The most interesting changes are in the first and last patch, the rest are patches refactoring the code & tests to validate the modifications. - Patches 1 & 2: When servers set the C-flag in their MP_CAPABLE to tell clients not to create subflows to the initial address and port -- e.g. a deployment behind a L4 load balancer like a typical CDN deployment -- clients will not use their other endpoints when default settings are used. That's because the in-kernel path-manager uses the 'subflow' endpoints to create subflows only to the initial address and port. The first patch fixes that (for >=v5.14), and the second one validates it. - Patches 3-14: various patches refactoring the code around the in-kernel PM (mainly): split too long functions, rename variables and functions to avoid confusions, reduce structure size, and compare IDs instead of IP addresses. Note that one patch modifies one internal variable used in one BPF selftest. - Patch 15: ability to control endpoints that are used in reaction to a new address announced by the other peer. With that, endpoints can be used only once. Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org> --- Notes: - Patches 1 & 2 are sent to net-next on purpose: to delay a bit the backports, just in case. Plus we are at the end of a cycle, and not to delay the other refactoring patches. - Sorry, I wanted to send this series earlier on, but due to some unrelated issues (and holiday), it got delayed. Most patches are pure refactoring ones. --- Matthieu Baerts (NGI0) (15): mptcp: pm: in-kernel: usable client side with C-flag selftests: mptcp: join: validate C-flag + def limit mptcp: pm: in-kernel: refactor fill_local_addresses_vec mptcp: pm: in-kernel: refactor fill_remote_addresses_vec mptcp: pm: rename 'subflows' to 'extra_subflows' mptcp: pm: in-kernel: rename 'subflows_max' to 'limit_extra_subflows' mptcp: pm: in-kernel: rename 'add_addr_signal_max' to 'endp_signal_max' mptcp: pm: in-kernel: rename 'add_addr_accept_max' to 'limit_add_addr_accepted' mptcp: pm: in-kernel: rename 'local_addr_max' to 'endp_subflow_max' mptcp: pm: in-kernel: rename 'local_addr_list' to 'endp_list' mptcp: pm: in-kernel: rename 'addrs' to 'endpoints' mptcp: pm: in-kernel: remove stale_loss_cnt mptcp: pm: in-kernel: reduce pernet struct size mptcp: pm: in-kernel: compare IDs instead of addresses mptcp: pm: in-kernel: add laminar endpoints include/uapi/linux/mptcp.h | 11 +- net/mptcp/pm.c | 32 +- net/mptcp/pm_kernel.c | 569 ++++++++++++++-------- net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 21 +- net/mptcp/sockopt.c | 22 +- tools/testing/selftests/bpf/progs/mptcp_subflow.c | 2 +- tools/testing/selftests/net/mptcp/mptcp_join.sh | 11 + 8 files changed, 441 insertions(+), 229 deletions(-) --- base-commit: a1f1f2422e098485b09e55a492de05cf97f9954d change-id: 20250925-net-next-mptcp-c-flag-laminar-f8442e4d4bd9 Best regards, -- Matthieu Baerts (NGI0) <matttbe(a)kernel.org>

1 month, 1 week

2
16
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-kselftest-mirror