September 2025 - Linux-kselftest-mirror

[PATCH v3 4/4] selftests/rseq: Add test for mm_cid compaction

by Gabriele Monaco

A task in the kernel (task_mm_cid_work) runs somewhat periodically to compact the mm_cid for each process. Add a test to validate that it runs correctly and timely. The test spawns 1 thread pinned to each CPU, then each thread, including the main one, runs in short bursts for some time. During this period, the mm_cids should be spanning all numbers between 0 and nproc. At the end of this phase, a thread with high enough mm_cid (>= nproc/2) is selected to be the new leader, all other threads terminate. After some time, the only running thread should see 0 as mm_cid, if that doesn't happen, the compaction mechanism didn't work and the test fails. The test never fails if only 1 core is available, in which case, we cannot test anything as the only available mm_cid is 0. Acked-by: Shuah Khan <skhan(a)linuxfoundation.org> Signed-off-by: Gabriele Monaco <gmonaco(a)redhat.com> --- tools/testing/selftests/rseq/.gitignore | 1 + tools/testing/selftests/rseq/Makefile | 2 +- .../selftests/rseq/mm_cid_compaction_test.c | 204 ++++++++++++++++++ 3 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rseq/mm_cid_compaction_test.c diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 0fda241fa62b..b3920c59bf40 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -3,6 +3,7 @@ basic_percpu_ops_test basic_percpu_ops_mm_cid_test basic_test basic_rseq_op_test +mm_cid_compaction_test param_test param_test_benchmark param_test_compare_twice diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0d0a5fae5954..bc4d940f66d4 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -17,7 +17,7 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test + syscall_errors_test mm_cid_compaction_test TEST_GEN_PROGS_EXTENDED = librseq.so diff --git a/tools/testing/selftests/rseq/mm_cid_compaction_test.c b/tools/testing/selftests/rseq/mm_cid_compaction_test.c new file mode 100644 index 000000000000..d13623625f5a --- /dev/null +++ b/tools/testing/selftests/rseq/mm_cid_compaction_test.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include <assert.h> +#include <pthread.h> +#include <sched.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stddef.h> + +#include "../kselftest.h" +#include "rseq.h" + +#define VERBOSE 0 +#define printf_verbose(fmt, ...) \ + do { \ + if (VERBOSE) \ + printf(fmt, ##__VA_ARGS__); \ + } while (0) + +/* 50 ms */ +#define RUNNER_PERIOD 50000 +/* + * Number of runs before we terminate or get the token. + * The number is slowly increasing with the number of CPUs as the compaction + * process can take longer on larger systems. This is an arbitrary value. + */ +#define THREAD_RUNS (3 + args->num_cpus/8) + +/* + * Number of times we check that the mm_cid were compacted. + * Checks are repeated every RUNNER_PERIOD. + */ +#define MM_CID_COMPACT_TIMEOUT 10 + +struct thread_args { + int cpu; + int num_cpus; + pthread_mutex_t *token; + pthread_barrier_t *barrier; + pthread_t *tinfo; + struct thread_args *args_head; +}; + +static void __noreturn *thread_runner(void *arg) +{ + struct thread_args *args = arg; + int i, ret, curr_mm_cid; + cpu_set_t cpumask; + + CPU_ZERO(&cpumask); + CPU_SET(args->cpu, &cpumask); + ret = pthread_setaffinity_np(pthread_self(), sizeof(cpumask), &cpumask); + if (ret) { + errno = ret; + perror("Error: failed to set affinity"); + abort(); + } + pthread_barrier_wait(args->barrier); + + for (i = 0; i < THREAD_RUNS; i++) + usleep(RUNNER_PERIOD); + curr_mm_cid = rseq_current_mm_cid(); + /* + * We select one thread with high enough mm_cid to be the new leader. + * All other threads (including the main thread) will terminate. + * After some time, the mm_cid of the only remaining thread should + * converge to 0, if not, the test fails. + */ + if (curr_mm_cid >= args->num_cpus / 2 && + !pthread_mutex_trylock(args->token)) { + printf_verbose( + "cpu%d has mm_cid=%d and will be the new leader.\n", + sched_getcpu(), curr_mm_cid); + for (i = 0; i < args->num_cpus; i++) { + if (args->tinfo[i] == pthread_self()) + continue; + ret = pthread_join(args->tinfo[i], NULL); + if (ret) { + errno = ret; + perror("Error: failed to join thread"); + abort(); + } + } + pthread_barrier_destroy(args->barrier); + free(args->tinfo); + free(args->token); + free(args->barrier); + free(args->args_head); + + for (i = 0; i < MM_CID_COMPACT_TIMEOUT; i++) { + curr_mm_cid = rseq_current_mm_cid(); + printf_verbose("run %d: mm_cid=%d on cpu%d.\n", i, + curr_mm_cid, sched_getcpu()); + if (curr_mm_cid == 0) + exit(EXIT_SUCCESS); + usleep(RUNNER_PERIOD); + } + exit(EXIT_FAILURE); + } + printf_verbose("cpu%d has mm_cid=%d and is going to terminate.\n", + sched_getcpu(), curr_mm_cid); + pthread_exit(NULL); +} + +int test_mm_cid_compaction(void) +{ + cpu_set_t affinity; + int i, j, ret = 0, num_threads; + pthread_t *tinfo; + pthread_mutex_t *token; + pthread_barrier_t *barrier; + struct thread_args *args; + + sched_getaffinity(0, sizeof(affinity), &affinity); + num_threads = CPU_COUNT(&affinity); + tinfo = calloc(num_threads, sizeof(*tinfo)); + if (!tinfo) { + perror("Error: failed to allocate tinfo"); + return -1; + } + args = calloc(num_threads, sizeof(*args)); + if (!args) { + perror("Error: failed to allocate args"); + ret = -1; + goto out_free_tinfo; + } + token = malloc(sizeof(*token)); + if (!token) { + perror("Error: failed to allocate token"); + ret = -1; + goto out_free_args; + } + barrier = malloc(sizeof(*barrier)); + if (!barrier) { + perror("Error: failed to allocate barrier"); + ret = -1; + goto out_free_token; + } + if (num_threads == 1) { + fprintf(stderr, "Cannot test on a single cpu. " + "Skipping mm_cid_compaction test.\n"); + /* only skipping the test, this is not a failure */ + goto out_free_barrier; + } + pthread_mutex_init(token, NULL); + ret = pthread_barrier_init(barrier, NULL, num_threads); + if (ret) { + errno = ret; + perror("Error: failed to initialise barrier"); + goto out_free_barrier; + } + for (i = 0, j = 0; i < CPU_SETSIZE && j < num_threads; i++) { + if (!CPU_ISSET(i, &affinity)) + continue; + args[j].num_cpus = num_threads; + args[j].tinfo = tinfo; + args[j].token = token; + args[j].barrier = barrier; + args[j].cpu = i; + args[j].args_head = args; + if (!j) { + /* The first thread is the main one */ + tinfo[0] = pthread_self(); + ++j; + continue; + } + ret = pthread_create(&tinfo[j], NULL, thread_runner, &args[j]); + if (ret) { + errno = ret; + perror("Error: failed to create thread"); + abort(); + } + ++j; + } + printf_verbose("Started %d threads.\n", num_threads); + + /* Also main thread will terminate if it is not selected as leader */ + thread_runner(&args[0]); + + /* only reached in case of errors */ +out_free_barrier: + free(barrier); +out_free_token: + free(token); +out_free_args: + free(args); +out_free_tinfo: + free(tinfo); + + return ret; +} + +int main(int argc, char **argv) +{ + if (!rseq_mm_cid_available()) { + fprintf(stderr, "Error: rseq_mm_cid unavailable\n"); + return -1; + } + if (test_mm_cid_compaction()) + return -1; + return 0; +} -- 2.51.0

11 hours, 31 minutes

1
0
0 0

[PATCH v7 00/12] Direct Map Removal Support for guest_memfd

by Patrick Roy

From: Patrick Roy <roypat(a)amazon.co.uk> [ based on kvm/next ] Unmapping virtual machine guest memory from the host kernel's direct map is a successful mitigation against Spectre-style transient execution issues: If the kernel page tables do not contain entries pointing to guest memory, then any attempted speculative read through the direct map will necessarily be blocked by the MMU before any observable microarchitectural side-effects happen. This means that Spectre-gadgets and similar cannot be used to target virtual machine memory. Roughly 60% of speculative execution issues fall into this category [1, Table 1]. This patch series extends guest_memfd with the ability to remove its memory from the host kernel's direct map, to be able to attain the above protection for KVM guests running inside guest_memfd. Additionally, a Firecracker branch with support for these VMs can be found on GitHub [2]. For more details, please refer to the v5 cover letter [v5]. No substantial changes in design have taken place since. === Changes Since v6 === - Drop patch for passing struct address_space to ->free_folio(), due to possible races with freeing of the address_space. (Hugh) - Stop using PG_uptodate / gmem preparedness tracking to keep track of direct map state. Instead, use the lowest bit of folio->private. (Mike, David) - Do direct map removal when establishing mapping of gmem folio instead of at allocation time, due to impossibility of handling direct map removal errors in kvm_gmem_populate(). (Patrick) - Do TLB flushes after direct map removal, and provide a module parameter to opt out from them, and a new patch to export flush_tlb_kernel_range() to KVM. (Will) [1]: https://download.vusec.net/papers/quarantine_raid23.pdf [2]: https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hidi… [RFCv1]: https://lore.kernel.org/kvm/20240709132041.3625501-1-roypat@amazon.co.uk/ [RFCv2]: https://lore.kernel.org/kvm/20240910163038.1298452-1-roypat@amazon.co.uk/ [RFCv3]: https://lore.kernel.org/kvm/20241030134912.515725-1-roypat@amazon.co.uk/ [v4]: https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk/ [v5]: https://lore.kernel.org/kvm/20250828093902.2719-1-roypat@amazon.co.uk/ [v6]: https://lore.kernel.org/kvm/20250912091708.17502-1-roypat@amazon.co.uk/ Patrick Roy (12): arch: export set_direct_map_valid_noflush to KVM module x86/tlb: export flush_tlb_kernel_range to KVM module mm: introduce AS_NO_DIRECT_MAP KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate KVM: guest_memfd: Add flag to remove from direct map KVM: guest_memfd: add module param for disabling TLB flushing KVM: selftests: load elf via bounce buffer KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd != -1 KVM: selftests: Add guest_memfd based vm_mem_backing_src_types KVM: selftests: cover GUEST_MEMFD_FLAG_NO_DIRECT_MAP in existing selftests KVM: selftests: stuff vm_mem_backing_src_type into vm_shape KVM: selftests: Test guest execution from direct map removed gmem Documentation/virt/kvm/api.rst | 5 ++ arch/arm64/include/asm/kvm_host.h | 12 ++++ arch/arm64/mm/pageattr.c | 1 + arch/loongarch/mm/pageattr.c | 1 + arch/riscv/mm/pageattr.c | 1 + arch/s390/mm/pageattr.c | 1 + arch/x86/include/asm/tlbflush.h | 3 +- arch/x86/mm/pat/set_memory.c | 1 + arch/x86/mm/tlb.c | 1 + include/linux/kvm_host.h | 9 +++ include/linux/pagemap.h | 16 +++++ include/linux/secretmem.h | 18 ----- include/uapi/linux/kvm.h | 2 + lib/buildid.c | 4 +- mm/gup.c | 19 ++---- mm/mlock.c | 2 +- mm/secretmem.c | 8 +-- .../testing/selftests/kvm/guest_memfd_test.c | 2 + .../testing/selftests/kvm/include/kvm_util.h | 37 ++++++++--- .../testing/selftests/kvm/include/test_util.h | 8 +++ tools/testing/selftests/kvm/lib/elf.c | 8 +-- tools/testing/selftests/kvm/lib/io.c | 23 +++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 61 +++++++++-------- tools/testing/selftests/kvm/lib/test_util.c | 8 +++ tools/testing/selftests/kvm/lib/x86/sev.c | 1 + .../selftests/kvm/pre_fault_memory_test.c | 1 + .../selftests/kvm/set_memory_region_test.c | 50 ++++++++++++-- .../kvm/x86/private_mem_conversions_test.c | 7 +- virt/kvm/guest_memfd.c | 66 +++++++++++++++++-- virt/kvm/kvm_main.c | 8 +++ 30 files changed, 290 insertions(+), 94 deletions(-) base-commit: a6ad54137af92535cfe32e19e5f3bc1bb7dbd383 -- 2.51.0

12 hours, 52 minutes

6
32
0 0

next-20250924: Internal error: Oops: mnt_ns_release (fs/namespace.c:148) __arm64_sys_listmount (fs/namespace.c:5936)

by Naresh Kamboju

While running LTP syscalls tests on Linux next-20250924 tag build the following kernel oops noticed on arm64 and x86_64 devices. First seen on next-20250924 Good: next-20250923 Bad: next-2025094 Regression Analysis: - New regression? yes - Reproducibility? yes Test regression: next-20250924: Internal error: Oops: mnt_ns_release (fs/namespace.c:148) __arm64_sys_listmount (fs/namespace.c:5936) Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org> $ git log --oneline next-20250923..next-20250924 -- fs/namespace.c c54644c3221b6 (next/fs-next) Merge branch 'for-next' of https://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git 1f28cc19559a8 Merge branch 'namespace-6.18' into vfs.all e2c277f720291 Merge branch 'kernel-6.18.clone3' into vfs.all b2af83d5b8223 Merge branch 'vfs-6.18.mount' into vfs.all 29ecd1ca48ec2 Merge branch 'vfs-6.18.misc' into vfs.all d7610cb7454bb ns: simplify ns_common_init() further 59bfb66816809 listmount: don't call path_put() under namespace semaphore 2bc5bfbfd3f27 statmount: don't call path_put() under namespace semaphore ## Test log [ 41.821877] Internal error: Oops: 0000000096000005 [#1] SMP [ 41.919038] Modules linked in: cdc_ether usbnet sm3_ce sha3_ce nvme xhci_pci_renesas nvme_core arm_cspmu_module arm_spe_pmu ipmi_devintf ipmi_msghandler arm_cmn cppc_cpufreq drm fuse backlight [ 41.944048] CPU: 14 UID: 0 PID: 6416 Comm: listmount04 Not tainted 6.17.0-rc7-next-20250924 #1 PREEMPT [ 41.958197] Hardware name: Inspur NF5280R7/Mitchell MB, BIOS 04.04.00004001 2025-02-04 22:23:30 02/04/2025 [ 41.967837] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) [ 41.974958] pc : mnt_ns_release (arch/arm64/include/asm/atomic_lse.h:62 (discriminator 1) arch/arm64/include/asm/atomic_lse.h:76 (discriminator 1) arch/arm64/include/asm/atomic.h:51 (discriminator 1) include/linux/atomic/atomic-arch-fallback.h:944 (discriminator 1) include/linux/atomic/atomic-instrumented.h:401 (discriminator 1) include/linux/refcount.h:389 (discriminator 1) include/linux/refcount.h:432 (discriminator 1) include/linux/refcount.h:450 (discriminator 1) fs/namespace.c:148 (discriminator 1)) [ 41.981910] lr : __arm64_sys_listmount (fs/namespace.c:5936) [ 41.993467] sp : ffff8000ff5afd50 [ 42.000329] x29: ffff8000ff5afd50 x28: fff00001bd947380 x27: 0000000000000000 [ 42.007454] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000100 [ 42.030726] x23: 0000000000000000 x22: 0000000000000020 x21: ffff8000ff5afdc8 [ 42.038281] x20: 0000aaaabd6a1110 x19: 0000000000000000 x18: 0000000000000000 [ 42.045405] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 [ 42.052528] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 [ 42.075541] x11: 0000000000000000 x10: 0000000000000000 x9 : ffffda68dcdbbe30 [ 42.082835] x8 : ffff8000ff5afda0 x7 : fefefefefefefefe x6 : ffffda68df5e9000 [ 42.096212] x5 : fff00001bd947380 [ 42.108978] x4 : fff00001bd947380 x3 : 0000000000000000 [ 42.114449] x2 : 0000000000000000 x1 : 00000000ffffffff x0 : 00000000000000b8 [ 42.134515] Call trace: [ 42.139725] mnt_ns_release (arch/arm64/include/asm/atomic_lse.h:62 (discriminator 1) arch/arm64/include/asm/atomic_lse.h:76 (discriminator 1) arch/arm64/include/asm/atomic.h:51 (discriminator 1) include/linux/atomic/atomic-arch-fallback.h:944 (discriminator 1) include/linux/atomic/atomic-instrumented.h:401 (discriminator 1) include/linux/refcount.h:389 (discriminator 1) include/linux/refcount.h:432 (discriminator 1) include/linux/refcount.h:450 (discriminator 1) fs/namespace.c:148 (discriminator 1)) (P) [ 42.143811] __arm64_sys_listmount (fs/namespace.c:5936) [ 42.148327] invoke_syscall.constprop.0 (arch/arm64/include/asm/syscall.h:61 arch/arm64/kernel/syscall.c:54) [ 42.159193] do_el0_svc (include/linux/thread_info.h:135 (discriminator 2) arch/arm64/kernel/syscall.c:140 (discriminator 2) arch/arm64/kernel/syscall.c:151 (discriminator 2)) [ 42.163970] el0_svc (arch/arm64/include/asm/irqflags.h:82 (discriminator 1) arch/arm64/include/asm/irqflags.h:123 (discriminator 1) arch/arm64/include/asm/irqflags.h:136 (discriminator 1) arch/arm64/kernel/entry-common.c:102 (discriminator 1) arch/arm64/kernel/entry-common.c:745 (discriminator 1)) [ 42.173791] el0t_64_sync_handler (arch/arm64/kernel/entry-common.c:764) [ 42.185342] el0t_64_sync (arch/arm64/kernel/entry.S:596) [ 42.189165] Code: aa0003f3 9102e000 d503201f 12800001 (b8610001) All code ======== 0: aa0003f3 mov x19, x0 4: 9102e000 add x0, x0, #0xb8 8: d503201f nop c: 12800001 mov w1, #0xffffffff // #-1 10:* b8610001 ldaddl w1, w1, [x0] <-- trapping instruction Code starting with the faulting instruction =========================================== 0: b8610001 ldaddl w1, w1, [x0] [ 42.211485] ---[ end trace 0000000000000000 ]--- ## Source * Kernel version: 6.17.0-rc7 * Git tree: https://kernel.googlesource.com/pub/scm/linux/kernel/git/next/linux-next.git * Git describe: 6.17.0-rc7-next-20250924 * Git commit: b5a4da2c459f79a2c87c867398f1c0c315779781 * Architectures: arm64, x86_64 * Toolchains: gcc-13 * Kconfigs: defconfig+lkftconfig ## Build * Test log arm64: https://qa-reports.linaro.org/api/testruns/30007634/log_file/ * Test log x86_64: https://qa-reports.linaro.org/api/testruns/30000230/log_file/ * Test details: https://regressions.linaro.org/lkft/linux-next-master-ampere/next-20250924/… * Build plan: https://tuxapi.tuxsuite.com/v1/groups/ampere/projects/ci/tests/339teV8pAwrs… * Build link: https://storage.tuxsuite.com/public/ampere/ci/builds/339teBhKZ4DENKbJJNnbWK… * Kernel config: https://storage.tuxsuite.com/public/ampere/ci/builds/339teBhKZ4DENKbJJNnbWK… -- Linaro LKFT

13 hours, 31 minutes

3
2
0 0

[PATCH 0/4] PCI: Add support and tests for FIXUP quirks in modules

by Brian Norris

This series primarily adds support for DECLARE_PCI_FIXUP_*() in modules. There are a few drivers that already use this, and so they are presumably broken when built as modules. While at it, I wrote some unit tests that emulate a fake PCI device, and let the PCI framework match/not-match its vendor/device IDs. This test can be built into the kernel or built as a module. I also include some infrastructure changes (patch 3 and 4), so that ARCH=um (the default for kunit.py), ARCH=arm, and ARCH=arm64 will run these tests by default. These patches have different maintainers and are independent, so they can probably be picked up separately. I included them because otherwise the tests in patch 2 aren't so easy to run. Brian Norris (4): PCI: Support FIXUP quirks in modules PCI: Add KUnit tests for FIXUP quirks um: Select PCI_DOMAINS_GENERIC kunit: qemu_configs: Add PCI to arm, arm64 arch/um/Kconfig | 1 + drivers/pci/Kconfig | 11 ++ drivers/pci/Makefile | 1 + drivers/pci/fixup-test.c | 197 ++++++++++++++++++++++ drivers/pci/quirks.c | 62 +++++++ include/linux/module.h | 18 ++ kernel/module/main.c | 26 +++ tools/testing/kunit/qemu_configs/arm.py | 1 + tools/testing/kunit/qemu_configs/arm64.py | 1 + 9 files changed, 318 insertions(+) create mode 100644 drivers/pci/fixup-test.c -- 2.51.0.384.g4c02a37b29-goog

14 hours, 17 minutes

6
17
0 0

[PATCH RESEND v4] selftests/tty: add TIOCSTI test suite

by Abhinav Saxena

TIOCSTI is a TTY ioctl command that allows inserting characters into the terminal input queue, making it appear as if the user typed those characters. This functionality has behavior that varies based on system configuration and process credentials. The dev.tty.legacy_tiocsti sysctl introduced in commit 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled") controls TIOCSTI usage. When disabled, TIOCSTI requires CAP_SYS_ADMIN capability. The current implementation checks the current process's credentials via capable(CAP_SYS_ADMIN), but does not validate against the file opener's credentials stored in file->f_cred. This creates different behavior when file descriptors are passed between processes via SCM_RIGHTS. Add a test suite with 16 test variants using fixture variants to verify TIOCSTI behavior when dev.tty.legacy_tiocsti is enabled/disabled: - Basic TIOCSTI tests (8 variants): Direct testing with different capability and controlling terminal combinations - FD passing tests (8 variants): Test behavior when file descriptors are passed between processes with different capabilities The FD passing tests document this behavior - some tests show different results than expected based on file opener credentials, demonstrating that TIOCSTI uses current process credentials rather than file opener credentials. The tests validate proper enforcement of the legacy_tiocsti sysctl. Test implementation uses openpty(3) with TIOCSCTTY for isolated PTY environments. See tty_ioctl(4) for details on TIOCSTI behavior and security requirements. Signed-off-by: Abhinav Saxena <xandfury(a)gmail.com> --- RESEND: add TTY/serial maintainers and linux-serial CCs. No code changes. - Link to orignal v4: https://lore.kernel.org/r/20250902-toicsti-bug-v4-1-e5c960e0b3d6@gmail.com Changes in v4: - Moved skip conditions and sysctl setup from TEST_F to FIXTURE_SETUP (Kees Cook) - Fixed fclose() error handling in set_legacy_tiocsti_setting (Kees Cook) - Extracted run_basic_tiocsti_test() and run_fdpass_tiocsti_test functions (Kees Cook) - Removed redundant sysctl restore logic from TEST_F (Kees Cook) - Simplified FIXTURE_TEARDOWN (Kees Cook) - Replace drop_to_nobody() to drop_all_privs() which should be more portable (Justin Stitt) - Link to v3: https://lore.kernel.org/r/20250730-toicsti-bug-v3-1-dd2dac97f27a@gmail.com Add selftests for TIOCSTI ioctl To run all tests: $ sudo ./tools/testing/selftests/tty/tty_tiocsti_test Test Results: - PASSED: 13/16 tests - Different behavior: 3/16 tests (documenting credential checking behavior) All tests validated using: - scripts/checkpatch.pl --strict (clean output) - Functional testing on kernel v6.16-rc2 Changes in v3: - Replaced all printf() calls with TH_LOG() for proper test logging (Kees Cook) - Added struct __test_metadata parameter to helper functions - Moved common legacy_tiocsti availability check to FIXTURE_SETUP() - Implemented sysctl modification/restoration in FIXTURE_SETUP/TEARDOWN - Used openpty() with TIOCSCTTY for reliable PTY testing environment - Fixed child/parent synchronization in FD passing tests - Replaced manual _exit(1) handling with proper ASSERT statements - Switched // comments to /* */ format throughout - Expanded to 16 test variants using fixture variants - Enhanced error handling and test reliability - Link to v2: https://lore.kernel.org/r/20250713-toicsti-bug-v2-1-b183787eea29@gmail.com - Link to v1: https://lore.kernel.org/r/20250622-toicsti-bug-v1-0-f374373b04b2@gmail.com References: - tty_ioctl(4) - documents TIOCSTI ioctl and capability requirements - openpty(3) - pseudo-terminal creation and management - commit 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled") - Documentation/security/credentials.rst - https://github.com/KSPP/linux/issues/156 - https://lore.kernel.org/linux-hardening/Y0m9l52AKmw6Yxi1@hostpad/ - drivers/tty/Kconfig - Documentation/driver-api/tty/ --- tools/testing/selftests/tty/Makefile | 6 +- tools/testing/selftests/tty/config | 1 + tools/testing/selftests/tty/tty_tiocsti_test.c | 650 +++++++++++++++++++++++++ 3 files changed, 656 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tty/Makefile b/tools/testing/selftests/tty/Makefile index 50d7027b2ae3fb495dd1c0684363fa8f426be42c..7f6fbe5a0cd5663310e334d9d068b21dab9136ec 100644 --- a/tools/testing/selftests/tty/Makefile +++ b/tools/testing/selftests/tty/Makefile @@ -1,5 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS = -O2 -Wall -TEST_GEN_PROGS := tty_tstamp_update +TEST_GEN_PROGS := tty_tstamp_update tty_tiocsti_test +LDLIBS += -lcap include ../lib.mk + +# Add libcap for TIOCSTI test +$(OUTPUT)/tty_tiocsti_test: LDLIBS += -lcap diff --git a/tools/testing/selftests/tty/config b/tools/testing/selftests/tty/config new file mode 100644 index 0000000000000000000000000000000000000000..c6373aba66366c82435bb26c019eb360eb6310eb --- /dev/null +++ b/tools/testing/selftests/tty/config @@ -0,0 +1 @@ +CONFIG_LEGACY_TIOCSTI=y diff --git a/tools/testing/selftests/tty/tty_tiocsti_test.c b/tools/testing/selftests/tty/tty_tiocsti_test.c new file mode 100644 index 0000000000000000000000000000000000000000..5e767e6cb3ef8f05c5430eb0fcc792064c446c03 --- /dev/null +++ b/tools/testing/selftests/tty/tty_tiocsti_test.c @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * TTY Tests - TIOCSTI + * + * Copyright © 2025 Abhinav Saxena <xandfury(a)gmail.com> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <errno.h> +#include <stdbool.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/wait.h> +#include <pwd.h> +#include <termios.h> +#include <grp.h> +#include <sys/capability.h> +#include <sys/prctl.h> +#include <pty.h> +#include <utmp.h> + +#include "../kselftest_harness.h" + +enum test_type { + TEST_PTY_TIOCSTI_BASIC, + TEST_PTY_TIOCSTI_FD_PASSING, + /* other tests cases such as serial may be added. */ +}; + +/* + * Test Strategy: + * - Basic tests: Use PTY with/without TIOCSCTTY (controlling terminal for + * current process) + * - FD passing tests: Child creates PTY, parent receives FD (demonstrates + * security issue) + * + * SECURITY VULNERABILITY DEMONSTRATION: + * FD passing tests show that TIOCSTI uses CURRENT process credentials, not + * opener credentials. This means privileged processes can be given FDs from + * unprivileged processes and successfully perform TIOCSTI operations that the + * unprivileged process couldn't do directly. + * + * Attack scenario: + * 1. Unprivileged process opens TTY (direct TIOCSTI fails due to lack of + * privileges) + * 2. Unprivileged process passes FD to privileged process via SCM_RIGHTS + * 3. Privileged process can use TIOCSTI on the FD (succeeds due to its + * privileges) + * 4. Result: Effective privilege escalation via file descriptor passing + * + * This matches the kernel logic in tiocsti(): + * 1. if (!tty_legacy_tiocsti && !capable(CAP_SYS_ADMIN)) return -EIO; + * 2. if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + * return -EPERM; + * Note: Both checks use capable() on CURRENT process, not FD opener! + * + * If the file credentials were also checked along with the capable() checks + * then the results for FD pass tests would be consistent with the basic tests. + */ + +FIXTURE(tiocsti) +{ + int pty_master_fd; /* PTY - for basic tests */ + int pty_slave_fd; + bool has_pty; + bool initial_cap_sys_admin; + int original_legacy_tiocsti_setting; + bool can_modify_sysctl; +}; + +FIXTURE_VARIANT(tiocsti) +{ + const enum test_type test_type; + const bool controlling_tty; /* true=current->signal->tty == tty */ + const int legacy_tiocsti; /* 0=restricted, 1=permissive */ + const bool requires_cap; /* true=with CAP_SYS_ADMIN, false=without */ + const int expected_success; /* 0=success, -EIO/-EPERM=specific error */ +}; + +/* + * Tests Controlling Terminal Variants (current->signal->tty == tty) + * + * TIOCSTI Test Matrix: + * + * | legacy_tiocsti | CAP_SYS_ADMIN | Expected Result | Error | + * |----------------|---------------|-----------------|-------| + * | 1 (permissive) | true | SUCCESS | - | + * | 1 (permissive) | false | SUCCESS | - | + * | 0 (restricted) | true | SUCCESS | - | + * | 0 (restricted) | false | FAILURE | -EIO | + */ + +/* clang-format off */ +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_pty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, /* FAILURE: legacy restriction */ +}; /* clang-format on */ + +/* + * Note for FD Passing Test Variants + * Since we're testing the scenario where an unprivileged process pass an FD + * to a privileged one, .requires_cap here means the caps of the child process. + * Not the parent; parent would always be privileged. + */ + +/* clang-format off */ +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_pty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = true, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, +}; /* clang-format on */ + +/* + * Non-Controlling Terminal Variants (current->signal->tty != tty) + * + * TIOCSTI Test Matrix: + * + * | legacy_tiocsti | CAP_SYS_ADMIN | Expected Result | Error | + * |----------------|---------------|-----------------|-------| + * | 1 (permissive) | true | SUCCESS | - | + * | 1 (permissive) | false | FAILURE | -EPERM| + * | 0 (restricted) | true | SUCCESS | - | + * | 0 (restricted) | false | FAILURE | -EIO | + */ + +/* clang-format off */ +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = -EPERM, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, basic_nopty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_BASIC, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_permissive_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_permissive_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 1, + .requires_cap = false, + .expected_success = -EPERM, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_restricted_withcap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = true, + .expected_success = 0, +}; + +FIXTURE_VARIANT_ADD(tiocsti, fdpass_nopty_restricted_nocap) { + .test_type = TEST_PTY_TIOCSTI_FD_PASSING, + .controlling_tty = false, + .legacy_tiocsti = 0, + .requires_cap = false, + .expected_success = -EIO, +}; /* clang-format on */ + +/* Helper function to send FD via SCM_RIGHTS */ +static int send_fd_via_socket(int socket_fd, int fd_to_send) +{ + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + char dummy_data = 'F'; + struct iovec iov = { .iov_base = &dummy_data, .iov_len = 1 }; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int)); + + return sendmsg(socket_fd, &msg, 0) < 0 ? -1 : 0; +} + +/* Helper function to receive FD via SCM_RIGHTS */ +static int recv_fd_via_socket(int socket_fd) +{ + struct msghdr msg = { 0 }; + struct cmsghdr *cmsg; + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + char dummy_data; + struct iovec iov = { .iov_base = &dummy_data, .iov_len = 1 }; + int received_fd = -1; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsg_buf; + msg.msg_controllen = sizeof(cmsg_buf); + + if (recvmsg(socket_fd, &msg, 0) < 0) + return -1; + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + break; + } + } + + return received_fd; +} + +static inline bool has_cap_sys_admin(void) +{ + cap_t caps = cap_get_proc(); + + if (!caps) + return false; + + cap_flag_value_t cap_val; + bool has_cap = (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, + &cap_val) == 0) && + (cap_val == CAP_SET); + + cap_free(caps); + return has_cap; +} + +/* + * Switch to non-root user and clear all capabilities + */ +static inline bool drop_all_privs(struct __test_metadata *_metadata) +{ + /* Drop supplementary groups */ + ASSERT_EQ(setgroups(0, NULL), 0); + + /* Switch to non-root user */ + ASSERT_EQ(setgid(1000), 0); + ASSERT_EQ(setuid(1000), 0); + + /* Clear all capabilities */ + cap_t empty = cap_init(); + + ASSERT_NE(empty, NULL); + ASSERT_EQ(cap_set_proc(empty), 0); + cap_free(empty); + + /* Prevent privilege regain */ + ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0); + + /* Verify privilege drop */ + ASSERT_FALSE(has_cap_sys_admin()); + return true; +} + +static inline int get_legacy_tiocsti_setting(struct __test_metadata *_metadata) +{ + FILE *fp; + int value = -1; + + fp = fopen("/proc/sys/dev/tty/legacy_tiocsti", "r"); + if (!fp) { + /* legacy_tiocsti sysctl not available (kernel < 6.2) */ + return -1; + } + + if (fscanf(fp, "%d", &value) == 1 && fclose(fp) == 0) { + if (value < 0 || value > 1) + value = -1; /* Invalid value */ + } else { + value = -1; /* Failed to parse */ + } + + return value; +} + +static inline bool set_legacy_tiocsti_setting(struct __test_metadata *_metadata, + int value) +{ + FILE *fp; + bool success = false; + + /* Sanity-check the value */ + ASSERT_GE(value, 0); + ASSERT_LE(value, 1); + + /* + * Try to open for writing; if we lack permission, return false so + * the test harness will skip variants that need to change it + */ + fp = fopen("/proc/sys/dev/tty/legacy_tiocsti", "w"); + if (!fp) + return false; + + /* Write the new setting */ + if (fprintf(fp, "%d\n", value) > 0 && fclose(fp) == 0) + success = true; + else + TH_LOG("Failed to write legacy_tiocsti: %s", strerror(errno)); + + return success; +} + +/* + * TIOCSTI injection test function + * @tty_fd: TTY slave file descriptor to test TIOCSTI on + * Returns: 0 on success, -errno on failure + */ +static inline int test_tiocsti_injection(struct __test_metadata *_metadata, + int tty_fd) +{ + int ret; + char inject_char = 'V'; + + errno = 0; + ret = ioctl(tty_fd, TIOCSTI, &inject_char); + return ret == 0 ? 0 : -errno; +} + +/* + * Child process: test TIOCSTI directly with capability/controlling + * terminal setup + */ +static void run_basic_tiocsti_test(struct __test_metadata *_metadata, + FIXTURE_DATA(tiocsti) * self, + const FIXTURE_VARIANT(tiocsti) * variant) +{ + /* Handle capability requirements */ + if (self->initial_cap_sys_admin && !variant->requires_cap) + ASSERT_TRUE(drop_all_privs(_metadata)); + + if (variant->controlling_tty) { + /* + * Create new session and set PTY as + * controlling terminal + */ + pid_t sid = setsid(); + + ASSERT_GE(sid, 0); + ASSERT_EQ(ioctl(self->pty_slave_fd, TIOCSCTTY, 0), 0); + } + + /* + * Validate test environment setup and verify final + * capability state matches expectation + * after potential drop. + */ + ASSERT_TRUE(self->has_pty); + ASSERT_EQ(has_cap_sys_admin(), variant->requires_cap); + + /* Test TIOCSTI and validate result */ + int result = test_tiocsti_injection(_metadata, self->pty_slave_fd); + + /* Check against expected result from variant */ + EXPECT_EQ(result, variant->expected_success); + _exit(0); +} + +/* + * Child process: create PTY and then pass FD to parent via SCM_RIGHTS + */ +static void run_fdpass_tiocsti_test(struct __test_metadata *_metadata, + const FIXTURE_VARIANT(tiocsti) * variant, + int sockfd) +{ + signal(SIGHUP, SIG_IGN); + + /* Handle privilege dropping */ + if (!variant->requires_cap && has_cap_sys_admin()) + ASSERT_TRUE(drop_all_privs(_metadata)); + + /* Create child's PTY */ + int child_master_fd, child_slave_fd; + + ASSERT_EQ(openpty(&child_master_fd, &child_slave_fd, NULL, NULL, NULL), + 0); + + if (variant->controlling_tty) { + pid_t sid = setsid(); + + ASSERT_GE(sid, 0); + ASSERT_EQ(ioctl(child_slave_fd, TIOCSCTTY, 0), 0); + } + + /* Test child's direct TIOCSTI for reference */ + int direct_result = test_tiocsti_injection(_metadata, child_slave_fd); + + EXPECT_EQ(direct_result, variant->expected_success); + + /* Send FD to parent */ + ASSERT_EQ(send_fd_via_socket(sockfd, child_slave_fd), 0); + + /* Wait for parent completion signal */ + char sync_byte; + ssize_t bytes_read = read(sockfd, &sync_byte, 1); + + ASSERT_EQ(bytes_read, 1); + + close(child_master_fd); + close(child_slave_fd); + close(sockfd); + _exit(0); +} + +FIXTURE_SETUP(tiocsti) +{ + /* Create PTY pair for basic tests */ + self->has_pty = (openpty(&self->pty_master_fd, &self->pty_slave_fd, + NULL, NULL, NULL) == 0); + if (!self->has_pty) { + self->pty_master_fd = -1; + self->pty_slave_fd = -1; + } + + self->initial_cap_sys_admin = has_cap_sys_admin(); + self->original_legacy_tiocsti_setting = + get_legacy_tiocsti_setting(_metadata); + + if (self->original_legacy_tiocsti_setting < 0) + SKIP(return, + "legacy_tiocsti sysctl not available (kernel < 6.2)"); + + /* Common skip conditions */ + if (variant->test_type == TEST_PTY_TIOCSTI_BASIC && !self->has_pty) + SKIP(return, "PTY not available for controlling terminal test"); + + if (variant->test_type == TEST_PTY_TIOCSTI_FD_PASSING && + !self->initial_cap_sys_admin) + SKIP(return, "FD Pass tests require CAP_SYS_ADMIN"); + + if (variant->requires_cap && !self->initial_cap_sys_admin) + SKIP(return, "Test requires initial CAP_SYS_ADMIN"); + + /* Test if we can modify the sysctl (requires appropriate privileges) */ + self->can_modify_sysctl = set_legacy_tiocsti_setting( + _metadata, self->original_legacy_tiocsti_setting); + + /* Sysctl setup based on variant */ + if (self->can_modify_sysctl && + self->original_legacy_tiocsti_setting != variant->legacy_tiocsti) { + if (!set_legacy_tiocsti_setting(_metadata, + variant->legacy_tiocsti)) + SKIP(return, "Failed to set legacy_tiocsti sysctl"); + + } else if (!self->can_modify_sysctl && + self->original_legacy_tiocsti_setting != + variant->legacy_tiocsti) + SKIP(return, "legacy_tiocsti setting mismatch"); +} + +FIXTURE_TEARDOWN(tiocsti) +{ + /* + * Backup restoration - + * each test should restore its own sysctl changes + */ + if (self->can_modify_sysctl) { + int current_value = get_legacy_tiocsti_setting(_metadata); + + if (current_value != self->original_legacy_tiocsti_setting) { + TH_LOG("Backup: Restoring legacy_tiocsti from %d to %d", + current_value, + self->original_legacy_tiocsti_setting); + set_legacy_tiocsti_setting( + _metadata, + self->original_legacy_tiocsti_setting); + } + } + + if (self->has_pty) { + if (self->pty_master_fd >= 0) + close(self->pty_master_fd); + if (self->pty_slave_fd >= 0) + close(self->pty_slave_fd); + } +} + +TEST_F(tiocsti, test) +{ + int status; + pid_t child_pid; + + if (variant->test_type == TEST_PTY_TIOCSTI_BASIC) { + /* ===== BASIC TIOCSTI TEST ===== */ + child_pid = fork(); + ASSERT_GE(child_pid, 0); + + /* Perform the actual test in the child process */ + if (child_pid == 0) + run_basic_tiocsti_test(_metadata, self, variant); + + } else { + /* ===== FD PASSING SECURITY TEST ===== */ + int sockpair[2]; + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sockpair), 0); + + child_pid = fork(); + ASSERT_GE(child_pid, 0); + + if (child_pid == 0) { + /* Child process - create PTY and send FD */ + close(sockpair[0]); + run_fdpass_tiocsti_test(_metadata, variant, + sockpair[1]); + } + + /* Parent process - receive FD and test TIOCSTI */ + close(sockpair[1]); + + int received_fd = recv_fd_via_socket(sockpair[0]); + + ASSERT_GE(received_fd, 0); + + bool parent_has_cap = self->initial_cap_sys_admin; + + TH_LOG("=== TIOCSTI FD Passing Test Context ==="); + TH_LOG("legacy_tiocsti: %d, Parent CAP_SYS_ADMIN: %s, Child: %s", + variant->legacy_tiocsti, parent_has_cap ? "yes" : "no", + variant->requires_cap ? "kept" : "dropped"); + + /* SECURITY TEST: Try TIOCSTI with FD opened by child */ + int result = test_tiocsti_injection(_metadata, received_fd); + + /* Log security concern if demonstrated */ + if (result == 0 && !variant->requires_cap) { + TH_LOG("*** SECURITY CONCERN DEMONSTRATED ***"); + TH_LOG("Privileged parent can use TIOCSTI on FD from unprivileged child"); + TH_LOG("This shows current process credentials are used, not opener credentials"); + } + + EXPECT_EQ(result, variant->expected_success) + { + TH_LOG("FD passing: expected error %d, got %d", + variant->expected_success, result); + } + + /* Signal child completion */ + char sync_byte = 'D'; + ssize_t bytes_written = write(sockpair[0], &sync_byte, 1); + + ASSERT_EQ(bytes_written, 1); + + close(received_fd); + close(sockpair[0]); + } + + /* Common child process cleanup for both test types */ + ASSERT_EQ(waitpid(child_pid, &status, 0), child_pid); + + if (WIFSIGNALED(status)) { + TH_LOG("Child terminated by signal %d", WTERMSIG(status)); + ASSERT_FALSE(WIFSIGNALED(status)) + { + TH_LOG("Child process failed assertion"); + } + } else { + EXPECT_EQ(WEXITSTATUS(status), 0); + } +} + +TEST_HARNESS_MAIN --- base-commit: e6b9dce0aeeb91dfc0974ab87f02454e24566182 change-id: 20250618-toicsti-bug-7822b8e94a32 Best regards, -- Abhinav Saxena <xandfury(a)gmail.com>

23 hours, 13 minutes

1
1
0 0

[PATCH RFC 0/4] landlock: add LANDLOCK_SCOPE_MEMFD_EXEC execution

by Abhinav Saxena

This patch series introduces LANDLOCK_SCOPE_MEMFD_EXEC, a new Landlock scoping mechanism that restricts execution of anonymous memory file descriptors (memfd) created via memfd_create(2). This addresses security gaps where processes can bypass W^X policies and execute arbitrary code through anonymous memory objects. Fixes: https://github.com/landlock-lsm/linux/issues/37 SECURITY PROBLEM ================ Current Landlock filesystem restrictions do not cover memfd objects, allowing processes to: 1. Read-to-execute bypass: Create writable memfd, inject code, then execute via mmap(PROT_EXEC) or direct execve() 2. Anonymous execution: Execute code without touching the filesystem via execve("/proc/self/fd/N") where N is a memfd descriptor 3. Cross-domain access violations: Pass memfd between processes to bypass domain restrictions These scenarios can occur in sandboxed environments where filesystem access is restricted but memfd creation remains possible. IMPLEMENTATION ============== The implementation adds hierarchical execution control through domain scoping: Core Components: - is_memfd_file(): Reliable memfd detection via "memfd:" dentry prefix - domain_is_scoped(): Cross-domain hierarchy checking (moved to domain.c) - LSM hooks: mmap_file, file_mprotect, bprm_creds_for_exec - Creation-time restrictions: hook_file_alloc_security Security Matrix: Execution decisions follow domain hierarchy rules preventing both same-domain bypass attempts and cross-domain access violations while preserving legitimate hierarchical access patterns. Domain Hierarchy with LANDLOCK_SCOPE_MEMFD_EXEC: =============================================== Root (no domain) - No restrictions | +-- Domain A [SCOPE_MEMFD_EXEC] Layer 1 | +-- memfd_A (tagged with Domain A as creator) | | | +-- Domain A1 (child) [NO SCOPE] Layer 2 | | +-- Inherits Layer 1 restrictions from parent | | +-- memfd_A1 (can create, inherits restrictions) | | +-- Domain A1a [SCOPE_MEMFD_EXEC] Layer 3 | | +-- memfd_A1a (tagged with Domain A1a) | | | +-- Domain A2 (child) [SCOPE_MEMFD_EXEC] Layer 2 | +-- memfd_A2 (tagged with Domain A2 as creator) | +-- CANNOT access memfd_A1 (different subtree) | +-- Domain B [SCOPE_MEMFD_EXEC] Layer 1 +-- memfd_B (tagged with Domain B as creator) +-- CANNOT access ANY memfd from Domain A subtree Execution Decision Matrix: ======================== Executor-> | A | A1 | A1a | A2 | B | Root Creator | | | | | | ------------|-----|----|-----|----|----|----- Domain A | X | X | X | X | X | Y Domain A1 | Y | X | X | X | X | Y Domain A1a | Y | Y | X | X | X | Y Domain A2 | Y | X | X | X | X | Y Domain B | X | X | X | X | X | Y Root | Y | Y | Y | Y | Y | Y Legend: Y = Execution allowed, X = Execution denied Scenarios Covered: - Direct mmap(PROT_EXEC) on memfd files - Two-stage mmap(PROT_READ) + mprotect(PROT_EXEC) bypass attempts - execve("/proc/self/fd/N") anonymous execution - execveat() and fexecve() file descriptor execution - Cross-process memfd inheritance and IPC passing TESTING ======= All patches have been validated with: - scripts/checkpatch.pl --strict (clean) - Selftests covering same-domain restrictions, cross-domain hierarchy enforcement, and regular file isolation - KUnit tests for memfd detection edge cases DISCLAIMER ========== My understanding of Landlock scoping semantics may be limited, but this implementation reflects my current understanding based on available documentation and code analysis. I welcome feedback and corrections regarding the scoping logic and domain hierarchy enforcement. Signed-off-by: Abhinav Saxena <xandfury(a)gmail.com> --- Abhinav Saxena (4): landlock: add LANDLOCK_SCOPE_MEMFD_EXEC scope landlock: implement memfd detection landlock: add memfd exec LSM hooks and scoping selftests/landlock: add memfd execution tests include/uapi/linux/landlock.h | 5 + security/landlock/.kunitconfig | 1 + security/landlock/audit.c | 4 + security/landlock/audit.h | 1 + security/landlock/cred.c | 14 - security/landlock/domain.c | 67 ++++ security/landlock/domain.h | 4 + security/landlock/fs.c | 405 ++++++++++++++++++++- security/landlock/limits.h | 2 +- security/landlock/task.c | 67 ---- .../selftests/landlock/scoped_memfd_exec_test.c | 325 +++++++++++++++++ 11 files changed, 812 insertions(+), 83 deletions(-) --- base-commit: 5b74b2eff1eeefe43584e5b7b348c8cd3b723d38 change-id: 20250716-memfd-exec-ac0d582018c3 Best regards, -- Abhinav Saxena <xandfury(a)gmail.com>

23 hours, 36 minutes

3
10
0 0

[PATCH v4 1/2] net/tls: support maximum record size limit

by Wilfred Mallawa

From: Wilfred Mallawa <wilfred.mallawa(a)wdc.com> During a handshake, an endpoint may specify a maximum record size limit. Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the maximum record size. Meaning that, the outgoing records from the kernel can exceed a lower size negotiated during the handshake. In such a case, the TLS endpoint must send a fatal "record_overflow" alert [1], and thus the record is discarded. Upcoming Western Digital NVMe-TCP hardware controllers implement TLS support. For these devices, supporting TLS record size negotiation is necessary because the maximum TLS record size supported by the controller is less than the default 16KB currently used by the kernel. This patch adds support for retrieving the negotiated record size limit during a handshake, and enforcing it at the TLS layer such that outgoing records are no larger than the size negotiated. This patch depends on the respective userspace support in tlshd and GnuTLS [2]. [1] https://www.rfc-editor.org/rfc/rfc8449 [2] https://gitlab.com/gnutls/gnutls/-/merge_requests/2005 Signed-off-by: Wilfred Mallawa <wilfred.mallawa(a)wdc.com> --- Changes V3 -> V4: * Added record_size_limit RFC reference to documentation * Always export the record size limit in tls_get_info() * Disallow user space to change the record_size_limit from under us if an open record is pending. * Added record_size_limit minimum size check as per RFC * Allow space for the ContentType byte for TLS 1.3. The expected behaviour is that userspace directly uses the negotiated record_size_limit, kernel will limit the plaintext buffer size appropirately. * New patch to add self-tests. --- Documentation/networking/tls.rst | 12 +++++ include/net/tls.h | 5 +++ include/uapi/linux/tls.h | 2 + net/tls/tls_device.c | 2 +- net/tls/tls_main.c | 75 ++++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 2 +- 6 files changed, 96 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst index 36cc7afc2527..d24bf8911bb8 100644 --- a/Documentation/networking/tls.rst +++ b/Documentation/networking/tls.rst @@ -280,6 +280,18 @@ If the record decrypted turns out to had been padded or is not a data record it will be decrypted again into a kernel buffer without zero copy. Such events are counted in the ``TlsDecryptRetry`` statistic. +TLS_TX_RECORD_SIZE_LIM +~~~~~~~~~~~~~~~~~~~~~~ + +Sets the maximum size for the plaintext of a protected record. + +The provided value should correspond to the limit negotiated during the TLS +handshake via the `record_size_limit` extension (RFC 8449)[1]. When this +option is set, the kernel enforces this limit on all transmitted TLS records, +ensuring no plaintext fragment exceeds the specified size. + +[1] https://datatracker.ietf.org/doc/html/rfc8449 + Statistics ========== diff --git a/include/net/tls.h b/include/net/tls.h index 857340338b69..32f053770ec4 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -53,6 +53,8 @@ struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) +/* Minimum record size limit as per RFC8449 */ +#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE @@ -226,6 +228,9 @@ struct tls_context { u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; + u16 tx_record_size_limit; /* Max plaintext fragment size. For TLS 1.3, + * this excludes the ContentType. + */ int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index b66a800389cc..3add266d5916 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -41,6 +41,7 @@ #define TLS_RX 2 /* Set receive parameters */ #define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */ #define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */ +#define TLS_TX_RECORD_SIZE_LIM 5 /* Maximum record size */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -194,6 +195,7 @@ enum { TLS_INFO_RXCONF, TLS_INFO_ZC_RO_TX, TLS_INFO_RX_NO_PAD, + TLS_INFO_TX_RECORD_SIZE_LIM, __TLS_INFO_MAX, }; #define TLS_INFO_MAX (__TLS_INFO_MAX - 1) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f672a62a9a52..bf16ceb41dde 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -459,7 +459,7 @@ static int tls_push_data(struct sock *sk, /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ - max_open_record_len = TLS_MAX_PAYLOAD_SIZE + + max_open_record_len = tls_ctx->tx_record_size_limit + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a3ccb3135e51..09883d9c6c96 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -544,6 +544,31 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, return 0; } +static int do_tls_getsockopt_tx_record_size(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + int len; + /* TLS 1.3: Record length contains ContentType */ + u16 record_size_limit = ctx->prot_info.version == TLS_1_3_VERSION ? + ctx->tx_record_size_limit + 1 : + ctx->tx_record_size_limit; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < sizeof(record_size_limit)) + return -EINVAL; + + if (put_user(sizeof(record_size_limit), optlen)) + return -EFAULT; + + if (copy_to_user(optval, &record_size_limit, sizeof(record_size_limit))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -563,6 +588,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_getsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_RECORD_SIZE_LIM: + rc = do_tls_getsockopt_tx_record_size(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -812,6 +840,43 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, return rc; } +static int do_tls_setsockopt_tx_record_size(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx); + u16 value; + + if (sw_ctx->open_rec) + return -EBUSY; + + if (sockptr_is_null(optval) || optlen != sizeof(value)) + return -EINVAL; + + if (copy_from_sockptr(&value, optval, sizeof(value))) + return -EFAULT; + + if (value < TLS_MIN_RECORD_SIZE_LIM) + return -EINVAL; + + if (ctx->prot_info.version == TLS_1_2_VERSION && + value > TLS_MAX_PAYLOAD_SIZE) + return -EINVAL; + + if (ctx->prot_info.version == TLS_1_3_VERSION && + value - 1 > TLS_MAX_PAYLOAD_SIZE) + return -EINVAL; + + /* + * For TLS 1.3: 'value' includes one byte for the appended ContentType. + * Adjust the kernel's internal plaintext limit accordingly. + */ + ctx->tx_record_size_limit = ctx->prot_info.version == TLS_1_3_VERSION ? + value - 1 : value; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -833,6 +898,9 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_setsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_RECORD_SIZE_LIM: + rc = do_tls_setsockopt_tx_record_size(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -1022,6 +1090,7 @@ static int tls_init(struct sock *sk) ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; + ctx->tx_record_size_limit = TLS_MAX_PAYLOAD_SIZE; update_sk_prot(sk, ctx); out: write_unlock_bh(&sk->sk_callback_lock); @@ -1111,6 +1180,11 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) goto nla_failure; } + err = nla_put_u16(skb, TLS_INFO_TX_RECORD_SIZE_LIM, + ctx->tx_record_size_limit); + if (err) + goto nla_failure; + rcu_read_unlock(); nla_nest_end(skb, start); return 0; @@ -1132,6 +1206,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin) nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_RECORD_SIZE_LIM */ 0; return size; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bac65d0d4e3e..28fb796573d1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + record_room = tls_ctx->tx_record_size_limit - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; -- 2.51.0

1 day, 1 hour

4
10
0 0

[PATCH v2 2/2] ipv6: Check AF_UNSPEC in ip6_route_multipath_add()

by Maksimilijan Marosevic

This check was removed in commit e6f497955fb6 ("ipv6: Check GATEWAY in rtm_to_fib6_multipath_config().") as part of rt6_qualify_for ecmp(). The author correctly recognises that rt6_qualify_for_ecmp() returns false if fb_nh_gw_family is set to AF_UNSPEC, but then mistakes AF_UNSPEC for AF_INET6 when reasoning that the check is unnecessary. This means certain malformed entries don't get caught in ip6_route_multipath_add(). This patch reintroduces the AF_UNSPEC check while respecting changes of the initial patch. Reported-by: syzbot+a259a17220263c2d73fc(a)syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a259a17220263c2d73fc Fixes: e6f497955fb6 ("ipv6: Check GATEWAY in rtm_to_fib6_multipath_config().") Signed-off-by: Maksimilijan Marosevic <maksimilijan.marosevic(a)proton.me> --- net/ipv6/route.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aee6a10b112a..884bae3fb1b1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5454,6 +5454,14 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + if (rt->fib6_nh->fib_nh_gw_family == AF_UNSPEC) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + fib6_info_release(rt); + goto cleanup; + } + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); -- 2.43.0

1 day, 2 hours

1
0
0 0

[PATCH v2 1/2] selftests/net: add netdevsim.c

by Maksimilijan Marosevic

Tests an edge case in the nsim module where gw_family == AF_UNSPEC. Works by creating a new nsim device and then sending a multipath path message to it and loopback. In unpatched kernels, this triggers a WARN_ON_ONCE in netdevsim/fib.c. Reported-by: syzbot+a259a17220263c2d73fc(a)syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a259a17220263c2d73fc Fixes: e6f497955fb6 ("ipv6: Check GATEWAY in rtm_to_fib6_multipath_config().") Signed-off-by: Maksimilijan Marosevic <maksimilijan.marosevic(a)proton.me> --- tools/testing/selftests/net/netdevsim.c | 391 ++++++++++++++++++++++++ 1 file changed, 391 insertions(+) create mode 100644 tools/testing/selftests/net/netdevsim.c diff --git a/tools/testing/selftests/net/netdevsim.c b/tools/testing/selftests/net/netdevsim.c new file mode 100644 index 000000000000..cdc8ebef4dac --- /dev/null +++ b/tools/testing/selftests/net/netdevsim.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This test creates a new netdevsim device and then sends + * an IPv6 multipath netlink message to it and the loopback + * interface. + * + * This triggers an edge case where the routing table is + * constructed with an entry where gw_family = AF_UNSPEC. + * If not caught, this causes an unexpected nsiblings count + * in netdevsim/fib.c: nsim_fib6_event_init(), raising a + * warning. + * + * NOTE: The warning in question is raised by WARN_ON_ONCE. + * Therefore, this test reports a false negative if the + * warning has already been triggered. + * + */ + +#include <arpa/inet.h> +#include <bits/types/struct_iovec.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <netinet/in.h> +#include <stdio.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include <dirent.h> +#include <stdbool.h> +#include <net/if.h> + +#define RTF_UP 0x0001 // route usable +#define RTF_HOST 0x0004 // host entry (net otherwise) + +#define NSIM_PORTS 1 +#define NETDEVSIM_DEV_DIR "/sys/bus/netdevsim/devices" +#define NSIM_DEV_DIR_BUFFER_SIZE 128 +#define LO_DEV "lo" + +#define BUFSIZE 4096 +#define DST_PREFIX "2001:db8::" +#define GW1 "fe80::1" +#define GW2 "::1" + +#define PID_LEN 16 + +int get_free_idx(void) +{ + int idx = 0; + int tmp = 0; + DIR *nsim_dir = opendir(NETDEVSIM_DEV_DIR); + struct dirent *entry = NULL; + + if (nsim_dir == NULL) { + fprintf(stderr, "Unable to open nsim directory\n"); + return -1; + } + + do { + entry = readdir(nsim_dir); + if (entry != NULL && + sscanf(entry->d_name, "netdevsim%d", &tmp) > 0) { + if (tmp >= idx) + idx = tmp + 1; + } + } while (entry != NULL); + + closedir(nsim_dir); + return idx; +} + +int create_netdevsim_device(int id, int num_ports) +{ + const char *path = "/sys/bus/netdevsim/new_device"; + char buffer[64]; + int fd; + + fd = open(path, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "Failed to open new_device\n"); + return -1; + } + + snprintf(buffer, sizeof(buffer), "%d %d", id, num_ports); + if (write(fd, buffer, strlen(buffer)) < 0) { + fprintf(stderr, "Failed to write to new_device\n"); + close(fd); + return -1; + } + + close(fd); + return 0; +} + +int ensure_nsim_dev_exists(void) +{ + int ret; + int nsim_idx; + + nsim_idx = get_free_idx(); + ret = create_netdevsim_device(nsim_idx, NSIM_PORTS); + if (ret != 0) { + fprintf(stderr, "Failed to create nsim device\n"); + return -1; + } + + return nsim_idx; +} + +char *get_nsim_dev_link(int nsim_idx) +{ + char nsim_dev_dir_buffer[NSIM_DEV_DIR_BUFFER_SIZE]; + DIR *nsim_dev_dir; + struct dirent *entry; + + sprintf(nsim_dev_dir_buffer, "%s/netdevsim%d/%s", NETDEVSIM_DEV_DIR, + nsim_idx, "net"); + + nsim_dev_dir = opendir(nsim_dev_dir_buffer); + + if (nsim_dev_dir == NULL) { + fprintf(stderr, "Unable to open %s\n", nsim_dev_dir_buffer); + return NULL; + } + + do { + entry = readdir(nsim_dev_dir); + if (entry != NULL && entry->d_name[0] != '.') + break; + + } while (entry != NULL); + + if (entry == NULL || entry->d_name[0] == '.') { + fprintf(stderr, "Device has no ports\n"); + return NULL; + } + + closedir(nsim_dev_dir); + + return entry->d_name; +} + +int get_nsim_dev(char **nsim_link) +{ + int nsim_idx; + char *nsim_dev_link; + + nsim_idx = ensure_nsim_dev_exists(); + if (nsim_idx < 0) + return -1; + + nsim_dev_link = get_nsim_dev_link(nsim_idx); + if (nsim_dev_link == NULL) + return -1; + + *nsim_link = nsim_dev_link; + return 0; +} + +int prepare_socket(void) +{ + struct sockaddr_nl sa; + int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + + if (fd < 0) { + fprintf(stderr, "Failed to open socket\n"); + return -1; + } + + sa.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) + fprintf(stderr, "Failed to bind socket\n"); + + return fd; +} + +struct nlmsghdr *construct_header(char **pos) +{ + struct nlmsghdr *nlh = (struct nlmsghdr *)(*pos); + + nlh->nlmsg_type = RTM_NEWROUTE; + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE; + + *pos += NLMSG_HDRLEN; + + return nlh; +} + +void construct_rtmsg(char **pos) +{ + struct rtmsg *rtm = (struct rtmsg *)(*pos); + + rtm->rtm_family = AF_INET6; + rtm->rtm_table = RT_TABLE_MAIN; + rtm->rtm_protocol = RTPROT_STATIC; + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_dst_len = 128; + rtm->rtm_flags |= RTF_HOST | RTF_UP; + + *pos += NLMSG_ALIGN(sizeof(struct rtmsg)); +} + +void construct_dest(char **pos) +{ + struct rtattr *rta_dest = (struct rtattr *)(*pos); + struct in6_addr dst6; + + rta_dest->rta_type = RTA_DST; + rta_dest->rta_len = RTA_LENGTH(sizeof(struct in6_addr)); + inet_pton(AF_INET6, DST_PREFIX, &dst6); + memcpy(RTA_DATA(rta_dest), &dst6, sizeof(dst6)); + *pos += RTA_ALIGN(rta_dest->rta_len); +} + +struct rtattr *construct_multipath_hdr(char **pos) +{ + struct rtattr *rta_mp = (struct rtattr *)(*pos); + + rta_mp->rta_type = RTA_MULTIPATH; + *pos += sizeof(struct rtattr); + + return rta_mp; +} + +void add_nexthop(char **pos, int ifindex, char *gw_addr) +{ + struct rtnexthop *rtnh = (struct rtnexthop *)(*pos); + + rtnh->rtnh_hops = 0; + rtnh->rtnh_ifindex = ifindex; + char *rtnh_pos = (char *)rtnh + RTNH_ALIGN(sizeof(struct rtnexthop)); + + struct rtattr *attr = (struct rtattr *)rtnh_pos; + + attr->rta_type = RTA_GATEWAY; + attr->rta_len = RTA_LENGTH(sizeof(struct in6_addr)); + + struct in6_addr gw; + + inet_pton(AF_INET6, gw_addr, &gw); + memcpy(RTA_DATA(attr), &gw, sizeof(gw)); + + rtnh_pos += RTA_ALIGN(attr->rta_len); + rtnh->rtnh_len = rtnh_pos - (char *)rtnh; + + *pos = rtnh_pos; +} + +struct nlmsghdr *construct_message(char *buf, int nsim_ifindex, int lo_ifindex) +{ + char *pos = buf; + struct nlmsghdr *nlh = construct_header(&pos); + + construct_rtmsg(&pos); + construct_dest(&pos); + + struct rtattr *rta_mp = construct_multipath_hdr(&pos); + + add_nexthop(&pos, nsim_ifindex, GW1); + add_nexthop(&pos, lo_ifindex, GW2); + + rta_mp->rta_len = pos - (char *)rta_mp; + nlh->nlmsg_len = pos - buf; + + return nlh; +} + +int send_nl_msg(struct nlmsghdr *nlh, int socket) +{ + struct iovec iov = { .iov_base = nlh, .iov_len = nlh->nlmsg_len }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + if (sendmsg(socket, (struct msghdr *)&msg, 0) < 0) { + fprintf(stderr, "Failed to send message\n"); + return 1; + } + + return 0; +} + +int open_kmsg(void) +{ + int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK); + + if (fd < 0) { + fprintf(stderr, "Failed to open kmsg\n"); + return -1; + } + + return fd; +} + +int move_cursor_to_end(int fd) +{ + if (lseek(fd, 0, SEEK_END) == -1) { + fprintf(stderr, "Failed to lseek kmsg\n"); + return -1; + } + + return 0; +} + +int look_for_warn(int kmsg_fd) +{ + char buffer[1024]; + int bytes_read; + int pid = getpid(); + char pid_str[PID_LEN]; + + snprintf(pid_str, PID_LEN, "%d", pid); + + while ((bytes_read = read(kmsg_fd, buffer, sizeof(buffer) - 1)) > 0) { + buffer[bytes_read] = '\0'; + if (strstr(buffer, "WARNING") && strstr(buffer, pid_str)) { + printf("Kernel warning detected\n"); + return 1; + } + } + + return 0; +} + +int main(void) +{ + char *nsim_dev; + int if_lo, if_nsim; + int fd; + int kmsg_fd; + struct nlmsghdr *nlh; + char buf[BUFSIZE]; + + if (get_nsim_dev(&nsim_dev) != 0) + return EXIT_FAILURE; + + sleep(1); // Doesn't work without a delay + + if_lo = if_nametoindex(LO_DEV); + if_nsim = if_nametoindex(nsim_dev); + + if (!if_lo || !if_nsim) { + fprintf(stderr, "Failed to get interface index\n"); + return EXIT_FAILURE; + } + + memset(buf, 0, sizeof(buf)); + nlh = construct_message(buf, if_nsim, if_lo); + + fd = prepare_socket(); + if (fd < 0) { + fprintf(stderr, "Failed to open socket\n"); + close(fd); + return EXIT_FAILURE; + } + + kmsg_fd = open_kmsg(); + if (kmsg_fd < 0) { + fprintf(stderr, "Failed to open kmsg\n"); + close(fd); + return EXIT_FAILURE; + } + + if (move_cursor_to_end(kmsg_fd) < 0) { + fprintf(stderr, "Failed to open kmsg\n"); + close(fd); + close(kmsg_fd); + return EXIT_FAILURE; + } + + if (send_nl_msg(nlh, fd) != 0) { + close(fd); + close(kmsg_fd); + return EXIT_FAILURE; + } + + if (look_for_warn(kmsg_fd) != 0) { + close(fd); + close(kmsg_fd); + return EXIT_FAILURE; + } + + close(kmsg_fd); + close(fd); + return EXIT_SUCCESS; +} -- 2.43.0

1 day, 2 hours

1
0
0 0

Hello from France.

by David Santamaria

The Alphabetical layout was... auto-correct. Friendly: Neo (Jesus-Christ a.k.a. King David again). - Miracles proving that I was Jesus, and King David, are in my Facebook. I proposed to explain the Holy-Trinity live on TV. It is since year 2000 that I must be in the news. I do not declare be Jew, but get persecuted anyway by antisemites (antichristians?). Writing still does not pay my bills at all, keeps me almost homeless. While my intellectual property is stolen, my names removed from many Masterpieces. Facebook https://www.facebook.com/profile.php?id=100057121342964 Paypal https://www.paypal.com/paypalme/meDavidSantamaria Email/Teams DavidSantamaria(a)hotmail.fr<mailto:DavidSantamaria@hotmail.fr> RCS/WhatsApp +33 7 67 99 32 37 $ADA addr1qx8chpwdeqv77duf2eutrtgvd5967l4w87fy54fx0022gr8p80z2mq7cmunmrdvy8yn3pzfzpm46zyfjp8usl36vpw2q509hrd $BTC 3FdwVoDzJoUzceogUwEmVu9YxoFvX6c2Rk $WAVES 3PEAeFkwqVsgAiyad8uVQzQxiGDyJNnCCn5 Sent from my Public Address.

2 days, 6 hours

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-kselftest-mirror September 2025