When the first test failed, and the hugetlb test passed, the result would
be pass, but we expect a fail. Fix this issue by returning fail if either
is not KSFT_PASS.
CC: Luiz Capitulino <luizcap(a)redhat.com>
Signed-off-by: Chunyu Hu <chuhu(a)redhat.com>
---
tools/testing/selftests/mm/va_high_addr_switch.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c
index 02f290a69132..51401e081b20 100644
--- a/tools/testing/selftests/mm/va_high_addr_switch.c
+++ b/tools/testing/selftests/mm/va_high_addr_switch.c
@@ -322,7 +322,7 @@ static int supported_arch(void)
int main(int argc, char **argv)
{
- int ret;
+ int ret, hugetlb_ret = KSFT_PASS;
if (!supported_arch())
return KSFT_SKIP;
@@ -331,6 +331,10 @@ int main(int argc, char **argv)
ret = run_test(testcases, sz_testcases);
if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
- ret = run_test(hugetlb_testcases, sz_hugetlb_testcases);
- return ret;
+ hugetlb_ret = run_test(hugetlb_testcases, sz_hugetlb_testcases);
+
+ if (ret == KSFT_PASS && hugetlb_ret == KSFT_PASS)
+ return KSFT_PASS;
+ else
+ return KSFT_FAIL;
}
--
2.49.0
Patch series "Fix va_high_addr_switch.sh test failure - again", v2.
The series address several issues exist for the va_high_addr_switch test:
1) the test return value is ignored in va_high_addr_switch.sh.
2) the va_high_addr_switch test requires 6 hugepages not 5.
3) the reurn value of the first test in va_high_addr_switch.c can be
overridden by the second test.
4) the nr_hugepages setup in run_vmtests.sh for arm64 can be done in
va_high_addr_switch.sh too.
5) update a comment for check_test_requirements.
Changes in v2:
- shorten the comment in for hugepages setup in v1
- add a new patch to fix the return value overridden issue in
va_high_addr_switch.c
- fix a code comment for check_test_requirements.
- update the series summary in patch 1
- add reviewed-by from Luiz Capitulino on patch 1 and patch 3
This patch: (of 5)
The return value should be return value of va_high_addr_switch, otherwise
a test failure would be silently ignored.
Reviewed-by: Luiz Capitulino <luizcap(a)redhat.com>
Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test")
CC: Luiz Capitulino <luizcap(a)redhat.com>
Signed-off-by: Chunyu Hu <chuhu(a)redhat.com>
---
Chunyu Hu (5):
selftests/mm: fix va_high_addr_switch.sh return value
selftests/mm: allocate 6 hugepages in va_high_addr_switch.sh
selftests/mm: remove arm64 nr_hugepages setup for va_high_addr_switch
test
selftests/mm: va_high_addr_switch return fail when either test failed
selftests/mm: fix comment for check_test_requirements
tools/testing/selftests/mm/run_vmtests.sh | 8 --------
tools/testing/selftests/mm/va_high_addr_switch.c | 10 +++++++---
tools/testing/selftests/mm/va_high_addr_switch.sh | 12 +++++++-----
3 files changed, 14 insertions(+), 16 deletions(-)
According to the doc below, I don't add the cover letter, not sure if cover
letter is preferred, and if that's the case, the doc need an update.
https://www.ozlabs.org/~akpm/stuff/tpp.txt
---
tools/testing/selftests/mm/va_high_addr_switch.sh | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index a7d4b02b21dd..f89fe078a8e6 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -114,4 +114,6 @@ save_nr_hugepages
# 4 keep_mapped pages, and one for tmp usage
setup_nr_hugepages 5
./va_high_addr_switch --run-hugetlb
+retcode=$?
restore_nr_hugepages
+exit $retcode
--
2.49.0
Patch series "Fix va_high_addr_switch.sh test failure - again", v1.
There are two issues exist for the va_high_addr_switch test. One issue is
the test return value is ignored in va_high_addr_switch.sh. The second is
the va_high_addr_switch requires 6 hugepages but it requires 5.
Besides that, the nr_hugepages setup in run_vmtests.sh for arm64 can be
done in va_high_addr_switch.sh too.
This patch: (of 3)
The return value should be return value of va_high_addr_switch, otherwise
a test failure would be silently ignored.
Fixes: d9d957bd7b61 ("selftests/mm: alloc hugepages in va_high_addr_switch test")
CC: Luiz Capitulino <luizcap(a)redhat.com>
Signed-off-by: Chunyu Hu <chuhu(a)redhat.com>
---
tools/testing/selftests/mm/va_high_addr_switch.sh | 2 ++
1 file changed, 2 insertions(+)
diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index a7d4b02b21dd..f89fe078a8e6 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -114,4 +114,6 @@ save_nr_hugepages
# 4 keep_mapped pages, and one for tmp usage
setup_nr_hugepages 5
./va_high_addr_switch --run-hugetlb
+retcode=$?
restore_nr_hugepages
+exit $retcode
--
2.49.0
Verify Wacom devices set INPUT_PROP_DIRECT on display devices and
INPUT_PROP_POINTER on opaque devices. Moved test_prop_pointer into
TestOpaqueTablet. Created a DirectTabletTest mixin class for
test_prop_direct that can be inherited by display tablet test classes.
Used DirectTabletTest for TestDTH2452Tablet case.
Signed-off-by: Alex Tran <alex.t.tran(a)gmail.com>
---
Changes in v2:
- Removed the tests from the BaseTest class
- Removed disabling tests for certain subclasses
- Moved test_prop_pointer under TestOpaqueTablet
- Created DirectTabletTest mixin class
- Moved test_prop_direct under TestDTH2452Tablet
.../selftests/hid/tests/test_wacom_generic.py | 30 +++++++++++--------
1 file changed, 17 insertions(+), 13 deletions(-)
diff --git a/tools/testing/selftests/hid/tests/test_wacom_generic.py b/tools/testing/selftests/hid/tests/test_wacom_generic.py
index 2d6d04f0f..9d0b0802d 100644
--- a/tools/testing/selftests/hid/tests/test_wacom_generic.py
+++ b/tools/testing/selftests/hid/tests/test_wacom_generic.py
@@ -598,18 +598,6 @@ class BaseTest:
if unit_set:
assert required[usage].contains(field)
- def test_prop_direct(self):
- """
- Todo: Verify that INPUT_PROP_DIRECT is set on display devices.
- """
- pass
-
- def test_prop_pointer(self):
- """
- Todo: Verify that INPUT_PROP_POINTER is set on opaque devices.
- """
- pass
-
class PenTabletTest(BaseTest.TestTablet):
def assertName(self, uhdev):
@@ -677,6 +665,13 @@ class TestOpaqueTablet(PenTabletTest):
uhdev.event(130, 240, pressure=0), [], auto_syn=False, strict=True
)
+ def test_prop_pointer(self):
+ """
+ Verify that INPUT_PROP_POINTER is set on opaque devices.
+ """
+ evdev = self.uhdev.get_evdev()
+ assert libevdev.INPUT_PROP_POINTER in evdev.properties
+
class TestOpaqueCTLTablet(TestOpaqueTablet):
def create_device(self):
@@ -862,7 +857,16 @@ class TestPTHX60_Pen(TestOpaqueCTLTablet):
)
-class TestDTH2452Tablet(test_multitouch.BaseTest.TestMultitouch, TouchTabletTest):
+class DirectTabletTest():
+ def test_prop_direct(self):
+ """
+ Verify that INPUT_PROP_DIRECT is set on display devices.
+ """
+ evdev = self.uhdev.get_evdev()
+ assert libevdev.INPUT_PROP_DIRECT in evdev.properties
+
+
+class TestDTH2452Tablet(test_multitouch.BaseTest.TestMultitouch, TouchTabletTest, DirectTabletTest):
ContactIds = namedtuple("ContactIds", "contact_id, tracking_id, slot_num")
def create_device(self):
--
2.51.0
Hi Linus,
Please pull the following fixes update for Linux 6.19-rc3.
Drops unused parameter from kunit_device_register_internal and makes
FAULT_TEST default to n when PANIC_ON_OOPS.
Note: Sending this early for 6.19-rc3 (way too late for rc2 anyways)
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 8f0b4cce4481fb22653697cced8d0d04027cb1e8:
Linux 6.19-rc1 (2025-12-14 16:05:07 +1200)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux_kselftest-kunit-fixes-6.19-rc3
for you to fetch changes up to c33b68801fbe9d5ee8a9178beb5747ec65873530:
kunit: make FAULT_TEST default to n when PANIC_ON_OOPS (2025-12-15 09:27:19 -0700)
----------------------------------------------------------------
linux_kselftest-kunit-fixes-6.19-rc3
Drops unused parameter from kunit_device_register_internal and makes
FAULT_TEST default to n when PANIC_ON_OOPS.
----------------------------------------------------------------
Brendan Jackman (1):
kunit: make FAULT_TEST default to n when PANIC_ON_OOPS
Uwe Kleine-König (1):
kunit: Drop unused parameter from kunit_device_register_internal
lib/kunit/Kconfig | 2 +-
lib/kunit/device.c | 7 +++----
2 files changed, 4 insertions(+), 5 deletions(-)
----------------------------------------------------------------
This series fixes a few issues in the hugetlb cgroup charging selftests
(write_to_hugetlbfs.c + charge_reserved_hugetlb.sh) that show up on systems
with large hugepages (e.g. 512MB) and when failures cause the test to wait
indefinitely.
On an aarch64 64k page kernel with 512MB hugepages, the test consistently
fails in write_to_hugetlbfs with ENOMEM and then hangs waiting for the
expected usage values. The root cause is that charge_reserved_hugetlb.sh
mounts hugetlbfs with a fixed size=256M, which is smaller than a single
hugepage, resulting in a mount with size=0 capacity.
In addition, write_to_hugetlbfs previously parsed -s via atoi() into an
int, which can overflow and print negative sizes.
Reproducer / environment:
- Kernel: 6.12.0-xxx.el10.aarch64+64k
- Hugepagesize: 524288 kB (512MB)
- ./charge_reserved_hugetlb.sh -cgroup-v2
- Observed mount: pagesize=512M,size=0 before this series
After applying the series, the test completes successfully on the above setup.
Li Wang (3):
selftests/mm/write_to_hugetlbfs: parse -s with strtoull and use size_t
selftests/mm/charge_reserved_hugetlb.sh: add waits with timeout helper
selftests/mm/charge_reserved_hugetlb: fix hugetlbfs mount size for
large hugepages
.../selftests/mm/charge_reserved_hugetlb.sh | 49 ++++++++++---------
.../testing/selftests/mm/write_to_hugetlbfs.c | 19 +++++--
2 files changed, 43 insertions(+), 25 deletions(-)
--
2.49.0
Currently, x86, Riscv, Loongarch use the Generic Entry which makes
maintainers' work easier and codes more elegant. arm64 has already
successfully switched to the Generic IRQ Entry in commit
b3cf07851b6c ("arm64: entry: Switch to generic IRQ entry"), it is
time to completely convert arm64 to Generic Entry.
The goal is to bring arm64 in line with other architectures that already
use the generic entry infrastructure, reducing duplicated code and
making it easier to share future changes in entry/exit paths, such as
"Syscall User Dispatch".
This patch set is rebased on v6.18-rc7. And the performance was measured
on Kunpeng 920 using "perf bench basic syscall" with "arm64.nopauth
selinux=0 audit=1".
After switch to Generic Entry, the performance are below:
| Metric | W/O Generic Framework | With Generic Framework | Change |
| ---------- | --------------------- | ---------------------- | ------ |
| Total time | 2.130 [sec] | 2.235 [sec] | ↑4.90% |
| usecs/op | 0.213095 | 0.223512 | ↑4.89% |
| ops/sec | 4,692,753 | 4,474,044 | ↓4.89% |
Compared to earlier with arch specific handling, the performance decreased
by approximately 4.9%.
On the basis of optimizing syscall_get_arguments()[1], el0_svc_common()
and syscall_exit_work(), the performance are below:
| Metric | W/O Generic Entry | With Generic Entry opt| Change |
| ---------- | ----------------- | ------------------ | ------ |
| Total time | 2.130 [sec] | 2.134 [sec] | ↑0.18% |
| usecs/op | 0.213095 | 0.213414 | ↑0.15% |
| ops/sec | 4,692,753 | 4,685,737 | ↓0.15% |
Therefore, after the optimization, ARM64 System Call performance remains
almost unchanged.
It was tested ok with following test cases on kunpeng920 and QEMU
virt platform:
- Perf tests.
- Different `dynamic preempt` mode switch.
- Pseudo NMI tests.
- Stress-ng CPU stress test.
- Hackbench stress test.
- MTE test case in Documentation/arch/arm64/memory-tagging-extension.rst
and all test cases in tools/testing/selftests/arm64/mte/*.
- "sud" selftest testcase.
- get_set_sud, get_syscall_info, set_syscall_info, peeksiginfo
in tools/testing/selftests/ptrace.
- breakpoint_test_arm64 in selftests/breakpoints.
- syscall-abi and ptrace in tools/testing/selftests/arm64/abi
- fp-ptrace, sve-ptrace, za-ptrace in selftests/arm64/fp.
- vdso_test_getrandom in tools/testing/selftests/vDSO
- Strace tests.
The test QEMU configuration is as follows:
qemu-system-aarch64 \
-M virt,gic-version=3,virtualization=on,mte=on \
-cpu max,pauth-impdef=on \
-kernel Image \
-smp 8,sockets=1,cores=4,threads=2 \
-m 512m \
-nographic \
-no-reboot \
-device virtio-rng-pci \
-append "root=/dev/vda rw console=ttyAMA0 kgdboc=ttyAMA0,115200 \
earlycon preempt=voluntary irqchip.gicv3_pseudo_nmi=1" \
-drive if=none,file=images/rootfs.ext4,format=raw,id=hd0 \
-device virtio-blk-device,drive=hd0 \
[1]: https://lore.kernel.org/all/20251201120633.1193122-3-ruanjinjie@huawei.com/
Changes in v9:
- Move "Return early for ptrace_report_syscall_entry() error" patch ahead
to make it not introduce a regression.
- Not check _TIF_SECCOMP/SYSCALL_EMU for syscall_exit_work() in
a separate patch.
- Do not report_syscall_exit() for PTRACE_SYSEMU_SINGLESTEP in a separate
patch.
- Add two performance patch to improve the arm64 performance.
- Add Reviewed-by.
- Link to v8: https://lore.kernel.org/all/20251126071446.3234218-1-ruanjinjie@huawei.com/
Changes in v8:
- Rename "report_syscall_enter()" to "report_syscall_entry()".
- Add ptrace_save_reg() to avoid duplication.
- Remove unused _TIF_WORK_MASK in a standalone patch.
- Align syscall_trace_enter() return value with the generic version.
- Use "scno" instead of regs->syscallno in el0_svc_common().
- Move rseq_syscall() ahead in a standalone patch to clarify it clearly.
- Rename "syscall_trace_exit()" to "syscall_exit_work()".
- Keep the goto in el0_svc_common().
- No argument was passed to __secure_computing() and check -1 not -1L.
- Remove "Add has_syscall_work() helper" patch.
- Move "Add syscall_exit_to_user_mode_prepare() helper" patch later.
- Add miss header for asm/entry-common.h.
- Update the implementation of arch_syscall_is_vdso_sigreturn().
- Add "ARCH_SYSCALL_WORK_EXIT" to be defined as "SECCOMP | SYSCALL_EMU"
to keep the behaviour unchanged.
- Add more testcases test.
- Add Reviewed-by.
- Update the commit message.
- Link to v7: https://lore.kernel.org/all/20251117133048.53182-1-ruanjinjie@huawei.com/
Chanegs in v7:
- Support "Syscall User Dispatch" by implementing
arch_syscall_is_vdso_sigreturn() as kemal suggested.
- Add aarch64 support for "sud" selftest testcase, which tested ok with
the patch series.
- Fix the kernel test robot warning for arch_ptrace_report_syscall_entry()
and arch_ptrace_report_syscall_exit() in asm/entry-common.h.
- Add perf syscall performance test.
- Link to v6: https://lore.kernel.org/all/20250916082611.2972008-1-ruanjinjie@huawei.com/
Changes in v6:
- Rebased on v6.17-rc5-next as arm64 generic irq entry has merged.
- Update the commit message.
- Link to v5: https://lore.kernel.org/all/20241206101744.4161990-1-ruanjinjie@huawei.com/
Changes in v5:
- Not change arm32 and keep inerrupts_enabled() macro for gicv3 driver.
- Move irqentry_state definition into arch/arm64/kernel/entry-common.c.
- Avoid removing the __enter_from_*() and __exit_to_*() wrappers.
- Update "irqentry_state_t ret/irq_state" to "state"
to keep it consistently.
- Use generic irq entry header for PREEMPT_DYNAMIC after split
the generic entry.
- Also refactor the ARM64 syscall code.
- Introduce arch_ptrace_report_syscall_entry/exit(), instead of
arch_pre/post_report_syscall_entry/exit() to simplify code.
- Make the syscall patches clear separation.
- Update the commit message.
- Link to v4: https://lore.kernel.org/all/20241025100700.3714552-1-ruanjinjie@huawei.com/
Changes in v4:
- Rework/cleanup split into a few patches as Mark suggested.
- Replace interrupts_enabled() macro with regs_irqs_disabled(), instead
of left it here.
- Remove rcu and lockdep state in pt_regs by using temporary
irqentry_state_t as Mark suggested.
- Remove some unnecessary intermediate functions to make it clear.
- Rework preempt irq and PREEMPT_DYNAMIC code
to make the switch more clear.
- arch_prepare_*_entry/exit() -> arch_pre_*_entry/exit().
- Expand the arch functions comment.
- Make arch functions closer to its caller.
- Declare saved_reg in for block.
- Remove arch_exit_to_kernel_mode_prepare(), arch_enter_from_kernel_mode().
- Adjust "Add few arch functions to use generic entry" patch to be
the penultimate.
- Update the commit message.
- Add suggested-by.
- Link to v3: https://lore.kernel.org/all/20240629085601.470241-1-ruanjinjie@huawei.com/
Changes in v3:
- Test the MTE test cases.
- Handle forget_syscall() in arch_post_report_syscall_entry()
- Make the arch funcs not use __weak as Thomas suggested, so move
the arch funcs to entry-common.h, and make arch_forget_syscall() folded
in arch_post_report_syscall_entry() as suggested.
- Move report_single_step() to thread_info.h for arm64
- Change __always_inline() to inline, add inline for the other arch funcs.
- Remove unused signal.h for entry-common.h.
- Add Suggested-by.
- Update the commit message.
Changes in v2:
- Add tested-by.
- Fix a bug that not call arch_post_report_syscall_entry() in
syscall_trace_enter() if ptrace_report_syscall_entry() return not zero.
- Refactor report_syscall().
- Add comment for arch_prepare_report_syscall_exit().
- Adjust entry-common.h header file inclusion to alphabetical order.
- Update the commit message.
Jinjie Ruan (15):
arm64: Remove unused _TIF_WORK_MASK
arm64/ptrace: Split report_syscall()
arm64/ptrace: Return early for ptrace_report_syscall_entry() error
arm64/ptrace: Refactor syscall_trace_enter/exit()
arm64: ptrace: Move rseq_syscall() before audit_syscall_exit()
arm64: syscall: Rework el0_svc_common()
arm64/ptrace: Not check _TIF_SECCOMP/SYSCALL_EMU for
syscall_exit_work()
arm64/ptrace: Do not report_syscall_exit() for
PTRACE_SYSEMU_SINGLESTEP
arm64/ptrace: Expand secure_computing() in place
arm64/ptrace: Use syscall_get_arguments() helper
entry: Split syscall_exit_to_user_mode_work() for arch reuse
entry: Add arch_ptrace_report_syscall_entry/exit()
arm64: entry: Convert to generic entry
arm64: Inline el0_svc_common()
entry: Inline syscall_exit_work()
kemal (1):
selftests: sud_test: Support aarch64
arch/arm64/Kconfig | 2 +-
arch/arm64/include/asm/entry-common.h | 76 ++++++++++++++
arch/arm64/include/asm/syscall.h | 19 +++-
arch/arm64/include/asm/thread_info.h | 22 +----
arch/arm64/kernel/debug-monitors.c | 7 ++
arch/arm64/kernel/ptrace.c | 94 ------------------
arch/arm64/kernel/signal.c | 2 +-
arch/arm64/kernel/syscall.c | 29 ++----
include/linux/entry-common.h | 98 ++++++++++++++++---
kernel/entry/syscall-common.c | 60 +++++-------
.../syscall_user_dispatch/sud_test.c | 4 +
11 files changed, 220 insertions(+), 193 deletions(-)
--
2.34.1
nolibc currently uses 32-bit types for various APIs. These are
problematic as their reduced value range can lead to truncated values.
Intended for 6.19.
Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net>
---
Changes in v3:
- Only use _Static_assert() where available
- Link to v2: https://lore.kernel.org/r/20251122-nolibc-uapi-types-v2-0-b814a43654f5@weis…
Changes in v2:
- Drop already applied ino_t and off_t patches.
- Also handle 'struct timeval'.
- Make the progression of the series a bit clearer.
- Add compatibility assertions.
- Link to v1: https://lore.kernel.org/r/20251029-nolibc-uapi-types-v1-0-e79de3b215d8@weis…
---
Thomas Weißschuh (14):
tools/nolibc/poll: use kernel types for system call invocations
tools/nolibc/poll: drop __NR_poll fallback
tools/nolibc/select: drop non-pselect based implementations
tools/nolibc/time: drop invocation of gettimeofday system call
tools/nolibc: prefer explicit 64-bit time-related system calls
tools/nolibc/gettimeofday: avoid libgcc 64-bit divisions
tools/nolibc/select: avoid libgcc 64-bit multiplications
tools/nolibc: use custom structs timespec and timeval
tools/nolibc: always use 64-bit time types
selftests/nolibc: test compatibility of nolibc and kernel time types
tools/nolibc: remove time conversions
tools/nolibc: add compiler version detection macros
tools/nolibc: add __nolibc_static_assert()
selftests/nolibc: add static assertions around time types handling
tools/include/nolibc/arch-s390.h | 3 +
tools/include/nolibc/compiler.h | 24 +++++++
tools/include/nolibc/poll.h | 14 ++--
tools/include/nolibc/std.h | 2 +-
tools/include/nolibc/sys/select.h | 25 ++-----
tools/include/nolibc/sys/time.h | 6 +-
tools/include/nolibc/sys/timerfd.h | 32 +++------
tools/include/nolibc/time.h | 102 +++++++++------------------
tools/include/nolibc/types.h | 17 ++++-
tools/testing/selftests/nolibc/nolibc-test.c | 27 +++++++
10 files changed, 129 insertions(+), 123 deletions(-)
---
base-commit: 351ec197a66e47bea17c3d803c5472473640dd0d
change-id: 20251001-nolibc-uapi-types-1c072d10fcc7
Best regards,
--
Thomas Weißschuh <linux(a)weissschuh.net>
Hi,
This series adds missing memory access tags (MEM_RDONLY or MEM_WRITE) to
several bpf helper function prototypes that use ARG_PTR_TO_MEM but lack the
correct type annotation.
Missing memory access tags in helper prototypes can lead to critical
correctness issues when the verifier tries to perform code optimization.
After commit 37cce22dbd51 ("bpf: verifier: Refactor helper access type
tracking"), the verifier relies on the memory access tags, rather than
treating all arguments in helper functions as potentially modifying the
pointed-to memory.
We have already seen several reports regarding this:
- commit ac44dcc788b9 ("bpf: Fix verifier assumptions of bpf_d_path's
output buffer") adds MEM_WRITE to bpf_d_path;
- commit 2eb7648558a7 ("bpf: Specify access type of bpf_sysctl_get_name
args") adds MEM_WRITE to bpf_sysctl_get_name.
This series looks through all prototypes in the kernel and completes the
tags. In addition, this series also adds selftests for some of these
functions.
I marked the series as RFC since the introduced selftests are fragile and
ad hoc (similar to the previously added selftests). The original goal of
these tests is to reproduce the case where the verifier wrongly optimizes
reads after the helper function is called. However, triggering the error
often requires carefully designed code patterns. For example, I had to
explicitly use "if (xx != 0)" in my attached tests, because using memcmp
will not reproduce the issue. This makes the reproduction heavily dependent
on the verifier's internal optimization logic and clutters the selftests
with specific, unnatural patterns.
Some cases are also hard to trigger by selftests. For example, I couldn't
find a triggering pattern for bpf_read_branch_records, since the
execution of program seems to be messed up by wrong tags. For
bpf_skb_fib_lookup, I also failed to reproduce it because the argument
needs content on entry, but the verifier seems to only enable this
optimization for fully empty buffers.
Since adding selftests does not help with existing issues or prevent future
occurrences of similar problems, I believe one way to resolve it is to
statically restrict ARG_PTR_TO_MEM from appearing without memory access
tags. Using ARG_PTR_TO_MEM alone without tags is nonsensical because:
- If the helper does not change the argument, missing MEM_RDONLY causes
the verifier to incorrectly reject a read-only buffer.
- If the helper does change the argument, missing MEM_WRITE causes the
verifier to incorrectly assume the memory is unchanged, leading to
potential errors.
I am still wondering, if we agree on the above, how should we enforce this
restriction? Should we let ARG_PTR_TO_MEM imply MEM_WRITE semantics by
default, and change ARG_PTR_TO_MEM | MEM_RDONLY to ARG_CONST_PTR_TO_MEM? Or
should we add a check in the verifier to ensure ARG_PTR_TO_MEM always comes
with an access tag (though this seems to only catch errors at
runtime/testing)?
Any insights and comments are welcome. If the individual fix patches for
the prototypes look correct, I would also really appreciate it if they
could be merged ahead of the discussion.
Thanks,
Zesen Liu
Signed-off-by: Zesen Liu <ftyghome(a)gmail.com>
---
Zesen Liu (2):
bpf: Fix memory access tags in helper prototypes
selftests/bpf: add regression tests for snprintf and get_stack helpers
kernel/bpf/helpers.c | 2 +-
kernel/trace/bpf_trace.c | 6 +++---
net/core/filter.c | 8 ++++----
tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c | 15 +++++++++++++--
tools/testing/selftests/bpf/prog_tests/snprintf.c | 6 ++++++
tools/testing/selftests/bpf/prog_tests/snprintf_btf.c | 3 +++
tools/testing/selftests/bpf/progs/netif_receive_skb.c | 13 ++++++++++++-
tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c | 11 ++++++++++-
tools/testing/selftests/bpf/progs/test_snprintf.c | 12 ++++++++++++
9 files changed, 64 insertions(+), 12 deletions(-)
---
base-commit: 22cc16c04b7893d8fc22810599f49a305d600b9e
change-id: 20251220-helper_proto-fb6e64182467
Best regards,
--
Zesen Liu <ftyghome(a)gmail.com>
Replace the NULL checks with IS_ERR_OR_NULL() in
KUNIT_BINARY_STR_ASSERTION() to prevent the strcmp() faulting if a
passed pointer is an ERR_PTR.
Commit 7ece381aa72d4 ("kunit: Protect string comparisons against NULL")
added the checks for NULL on both pointers so that asserts would fail,
instead of faulting, if either pointer is NULL. But either pointer
could hold an ERR_PTR value.
This assumes that the assertion is expecting both strings to be valid,
and is asserting the equality of their _content_.
Signed-off-by: Richard Fitzgerald <rf(a)opensource.cirrus.com>
---
include/kunit/test.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/include/kunit/test.h b/include/kunit/test.h
index 5ec5182b5e57..9cd1594ab697 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -906,7 +906,8 @@ do { \
}; \
\
_KUNIT_SAVE_LOC(test); \
- if (likely((__left) && (__right) && (strcmp(__left, __right) op 0))) \
+ if (likely(!IS_ERR_OR_NULL(__left) && !IS_ERR_OR_NULL(__right) && \
+ (strcmp(__left, __right) op 0))) \
break; \
\
\
--
2.47.3
The kunit_run_irq_test() helper allows a function to be run in hardirq
and softirq contexts (in addition to the task context). It does this by
running the user-provided function concurrently in the three contexts,
until either a timeout has expired or a number of iterations have
completed in the normal task context.
However, on setups where the initialisation of the hardirq and softirq
contexts (or, indeed, the scheduling of those tasks) is significantly
slower than the function execution, it's possible for that number of
iterations to be exceeded before any runs in irq contexts actually
occur. This occurs with the polyval.test_polyval_preparekey_in_irqs
test, which runs 20000 iterations of the relatively fast preparekey
function, and therefore fails often under many UML, 32-bit arm, m68k and
other environments.
Instead, ensure that the max_iterations limit counts executions in all
three contexts, and requires at least one of each. This will cause the
test to continue iterating until at least the irq contexts have been
tested, or the 1s wall-clock limit has been exceeded. This causes the
test to pass in all of my environments.
In so doing, we also update the task counters to atomic ints, to better
match both the 'int' max_iterations input, and to ensure they are
correctly updated across contexts.
Finally, we also fix a few potential assertion messages to be
less-specific to the original crypto usecases.
Fixes: b41dc83f0790 ("kunit, lib/crypto: Move run_irq_test() to common header")
Signed-off-by: David Gow <davidgow(a)google.com>
---
Changes since v1:
https://lore.kernel.org/all/20251219080850.921416-1-davidgow@google.com/
- Remove a leftover debug line which forced max_iterations to 1.
include/kunit/run-in-irq-context.h | 39 ++++++++++++++++++++----------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h
index 108e96433ea4..84694f383e37 100644
--- a/include/kunit/run-in-irq-context.h
+++ b/include/kunit/run-in-irq-context.h
@@ -20,8 +20,8 @@ struct kunit_irq_test_state {
bool task_func_reported_failure;
bool hardirq_func_reported_failure;
bool softirq_func_reported_failure;
- unsigned long hardirq_func_calls;
- unsigned long softirq_func_calls;
+ atomic_t hardirq_func_calls;
+ atomic_t softirq_func_calls;
struct hrtimer timer;
struct work_struct bh_work;
};
@@ -32,7 +32,7 @@ static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer)
container_of(timer, typeof(*state), timer);
WARN_ON_ONCE(!in_hardirq());
- state->hardirq_func_calls++;
+ atomic_inc(&state->hardirq_func_calls);
if (!state->func(state->test_specific_state))
state->hardirq_func_reported_failure = true;
@@ -48,7 +48,7 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work)
container_of(work, typeof(*state), bh_work);
WARN_ON_ONCE(!in_serving_softirq());
- state->softirq_func_calls++;
+ atomic_inc(&state->softirq_func_calls);
if (!state->func(state->test_specific_state))
state->softirq_func_reported_failure = true;
@@ -59,7 +59,10 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work)
* hardirq context concurrently, and reports a failure to KUnit if any
* invocation of @func in any context returns false. @func is passed
* @test_specific_state as its argument. At most 3 invocations of @func will
- * run concurrently: one in each of task, softirq, and hardirq context.
+ * run concurrently: one in each of task, softirq, and hardirq context. @func
+ * will continue running until either @max_iterations calls have been made (so
+ * long as at least one each runs in task, softirq, and hardirq contexts), or
+ * one second has passed.
*
* The main purpose of this interrupt context testing is to validate fallback
* code paths that run in contexts where the normal code path cannot be used,
@@ -85,6 +88,8 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
.test_specific_state = test_specific_state,
};
unsigned long end_jiffies;
+ int hardirq_calls, softirq_calls;
+ bool allctx = false;
/*
* Set up a hrtimer (the way we access hardirq context) and a work
@@ -94,14 +99,22 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func);
- /* Run for up to max_iterations or 1 second, whichever comes first. */
+ /* Run for up to max_iterations (including at least one task, softirq,
+ * and hardirq), or 1 second, whichever comes first.
+ */
end_jiffies = jiffies + HZ;
hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL,
HRTIMER_MODE_REL_HARD);
- for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies);
- i++) {
+ for (int task_calls = 0, calls = 0;
+ ((calls < max_iterations) || !allctx) && !time_after(jiffies, end_jiffies);
+ task_calls++) {
if (!func(test_specific_state))
state.task_func_reported_failure = true;
+
+ hardirq_calls = atomic_read(&state.hardirq_func_calls);
+ softirq_calls = atomic_read(&state.softirq_func_calls);
+ calls = task_calls + hardirq_calls + softirq_calls;
+ allctx = (task_calls > 0) && (hardirq_calls > 0) && (softirq_calls > 0);
}
/* Cancel the timer and work. */
@@ -109,21 +122,21 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
flush_work(&state.bh_work);
/* Sanity check: the timer and BH functions should have been run. */
- KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0,
+ KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.hardirq_func_calls), 0,
"Timer function was not called");
- KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0,
+ KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.softirq_func_calls), 0,
"BH work function was not called");
/* Check for incorrect hash values reported from any context. */
KUNIT_EXPECT_FALSE_MSG(
test, state.task_func_reported_failure,
- "Incorrect hash values reported from task context");
+ "Failure reported from task context");
KUNIT_EXPECT_FALSE_MSG(
test, state.hardirq_func_reported_failure,
- "Incorrect hash values reported from hardirq context");
+ "Failure reported from hardirq context");
KUNIT_EXPECT_FALSE_MSG(
test, state.softirq_func_reported_failure,
- "Incorrect hash values reported from softirq context");
+ "Failure reported from softirq context");
}
#endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */
--
2.52.0.322.g1dd061c0dc-goog
The function get_desc64_base() performs a series of bitwise left shifts on
fields of various sizes. More specifically, when performing '<< 24' on
'desc->base2' (which is a u8), 'base2' is promoted to a signed integer
before shifting.
In a scenario where base2 >= 0x80, the shift places a 1 into bit 31,
causing the 32-bit intermediate value to become negative. When this
result is cast to uint64_t or ORed into the return value, sign extension
occurs, corrupting the upper 32 bits of the address (base3).
Example:
Given:
base0 = 0x5000
base1 = 0xd6
base2 = 0xf8
base3 = 0xfffffe7c
Expected return: 0xfffffe7cf8d65000
Actual return: 0xfffffffff8d65000
Fix this by explicitly casting the fields to 'uint64_t' before shifting
to prevent sign extension.
Signed-off-by: MJ Pooladkhay <mj(a)pooladkhay.com>
---
While using get_desc64_base() to set the HOST_TR_BASE value for a custom
educational hypervisor, I observed system freezes, either immediately or
after migrating the guest to a new core. I eventually realized that KVM
uses get_cpu_entry_area() for the TR base. Switching to that fixed my
freezes (which were triple faults on one core followed by soft lockups
on others, waiting on smp_call_function_many_cond) and helped me identify
the sign-extension bug in this helper function that was corrupting the
HOST_TR_BASE value.
Thanks,
MJ Pooladkhay
tools/testing/selftests/kvm/include/x86/processor.h | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 57d62a425..cc2f8fb6f 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -436,8 +436,11 @@ struct kvm_x86_state {
static inline uint64_t get_desc64_base(const struct desc64 *desc)
{
- return ((uint64_t)desc->base3 << 32) |
- (desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+ uint64_t low = (uint64_t)desc->base0 |
+ ((uint64_t)desc->base1 << 16) |
+ ((uint64_t)desc->base2 << 24);
+
+ return (uint64_t)desc->base3 << 32 | low;
}
static inline uint64_t rdtsc(void)
--
2.52.0
Small clean up series to eliminate the extra includes of
<uapi/linux/types.h> from various VFIO selftests files. This include is
not causing any problems now, but it is causing benign typedef
redifinitions. Those redifinitions will become a problem when the VFIO
selftests library is built into KVM selftests, since KVM selftests build
with -std=gnu99.
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: Josh Hilke <jrhilke(a)google.com>
David Matlack (2):
tools include: Add definitions for __aligned_{l,b}e64
vfio: selftests: Drop <uapi/linux/types.h> includes
tools/include/linux/types.h | 8 ++++++++
.../selftests/vfio/lib/include/libvfio/iova_allocator.h | 1 -
tools/testing/selftests/vfio/lib/iommu.c | 1 -
tools/testing/selftests/vfio/lib/iova_allocator.c | 1 -
tools/testing/selftests/vfio/lib/vfio_pci_device.c | 1 -
tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 1 -
tools/testing/selftests/vfio/vfio_iommufd_setup_test.c | 1 -
7 files changed, 8 insertions(+), 6 deletions(-)
base-commit: d721f52e31553a848e0e9947ca15a49c5674aef3
--
2.52.0.322.g1dd061c0dc-goog
While debugging issues related to aarch64 only systems I ran into
speedbumps due to the lack of detail in the results reported when the
guest register read and reset value preservation tests were run, they
generated an immediately fatal assert without indicating which register
was being tested. Update these tests to report a result per register,
making it much easier to see what the problem being reported is.
A similar, though less severe, issue exists with the validation of the
individual bitfields in registers due to the use of immediately fatal
asserts. Update those asserts to be standard kselftest reports.
Finally we have a fix for spurious errors on some NV systems.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v3:
- Rebase onto v6.19-rc1.
- Link to v2: https://patch.msgid.link/20251114-kvm-arm64-set-id-regs-aarch64-v2-0-672f21…
Changes in v2:
- Add a fix for spurious failures with 64 bit only guests.
- Link to v1: https://patch.msgid.link/20251030-kvm-arm64-set-id-regs-aarch64-v1-0-96fe0d…
---
Mark Brown (4):
KVM: selftests: arm64: Report set_id_reg reads of test registers as tests
KVM: selftests: arm64: Report register reset tests individually
KVM: selftests: arm64: Make set_id_regs bitfield validatity checks non-fatal
KVM: selftests: arm64: Skip all 32 bit IDs when set_id_regs is aarch64 only
tools/testing/selftests/kvm/arm64/set_id_regs.c | 150 ++++++++++++++++++------
1 file changed, 111 insertions(+), 39 deletions(-)
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20251028-kvm-arm64-set-id-regs-aarch64-ebb77969401c
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Greetings:
Welcome to v9, see changelog below.
This revision addresses feedback Willem gave on the selftests. No
functional or code changes to the implementation were made and
performance tests were not re-run.
This series introduces a new mechanism, IRQ suspension, which allows
network applications using epoll to mask IRQs during periods of high
traffic while also reducing tail latency (compared to existing
mechanisms, see below) during periods of low traffic. In doing so, this
balances CPU consumption with network processing efficiency.
Martin Karsten (CC'd) and I have been collaborating on this series for
several months and have appreciated the feedback from the community on
our RFC [1]. We've updated the cover letter and kernel documentation in
an attempt to more clearly explain how this mechanism works, how
applications can use it, and how it compares to existing mechanisms in
the kernel.
I briefly mentioned this idea at netdev conf 2024 (for those who were
there) and Martin described this idea in an earlier paper presented at
Sigmetrics 2024 [2].
~ The short explanation (TL;DR)
We propose adding a new napi config parameter: irq_suspend_timeout to
help balance CPU usage and network processing efficiency when using IRQ
deferral and napi busy poll.
If this parameter is set to a non-zero value *and* a user application
has enabled preferred busy poll on a busy poll context (via the
EPIOCSPARAMS ioctl introduced in commit 18e2bf0edf4d ("eventpoll: Add
epoll ioctl for epoll_params")), then application calls to epoll_wait
for that context will cause device IRQs and softirq processing to be
suspended as long as epoll_wait successfully retrieves data from the
NAPI. Each time data is retrieved, the irq_suspend_timeout is deferred.
If/when network traffic subsides and epoll_wait returns no data, IRQ
suspension is immediately reverted back to the existing
napi_defer_hard_irqs and gro_flush_timeout mechanism which was
introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral
feature")).
The irq_suspend_timeout serves as a safety mechanism. If userland takes
a long time processing data, irq_suspend_timeout will fire and restart
normal NAPI processing.
For a more in depth explanation, please continue reading.
~ Comparison with existing mechanisms
Interrupt mitigation can be accomplished in napi software, by setting
napi_defer_hard_irqs and gro_flush_timeout, or via interrupt coalescing
in the NIC. This can be quite efficient, but in both cases, a fixed
timeout (or packet count) needs to be configured. However, a fixed
timeout cannot effectively support both low- and high-load situations:
At low load, an application typically processes a few requests and then
waits to receive more input data. In this scenario, a large timeout will
cause unnecessary latency.
At high load, an application typically processes many requests before
being ready to receive more input data. In this case, a small timeout
will likely fire prematurely and trigger irq/softirq processing, which
interferes with the application's execution. This causes overhead, most
likely due to cache contention.
While NICs attempt to provide adaptive interrupt coalescing schemes,
these cannot properly take into account application-level processing.
An alternative packet delivery mechanism is busy-polling, which results
in perfect alignment of application processing and network polling. It
delivers optimal performance (throughput and latency), but results in
100% cpu utilization and is thus inefficient for below-capacity
workloads.
We propose to add a new packet delivery mode that properly alternates
between busy polling and interrupt-based delivery depending on busy and
idle periods of the application. During a busy period, the system
operates in busy-polling mode, which avoids interference. During an idle
period, the system falls back to interrupt deferral, but with a small
timeout to avoid excessive latencies. This delivery mode can also be
viewed as an extension of basic interrupt deferral, but alternating
between a small and a very large timeout.
This delivery mode is efficient, because it avoids softirq execution
interfering with application processing during busy periods. It can be
used with blocking epoll_wait to conserve cpu cycles during idle
periods. The effect of alternating between busy and idle periods is that
performance (throughput and latency) is very close to full busy polling,
while cpu utilization is lower and very close to interrupt mitigation.
~ Usage details
IRQ suspension is introduced via a per-NAPI configuration parameter that
controls the maximum time that IRQs can be suspended.
Here's how it is intended to work:
- The user application (or system administrator) uses the netdev-genl
netlink interface to set the pre-existing napi_defer_hard_irqs and
gro_flush_timeout NAPI config parameters to enable IRQ deferral.
- The user application (or system administrator) sets the proposed
irq_suspend_timeout parameter via the netdev-genl netlink interface
to a larger value than gro_flush_timeout to enable IRQ suspension.
- The user application issues the existing epoll ioctl to set the
prefer_busy_poll flag on the epoll context.
- The user application then calls epoll_wait to busy poll for network
events, as it normally would.
- If epoll_wait returns events to userland, IRQs are suspended for the
duration of irq_suspend_timeout.
- If epoll_wait finds no events and the thread is about to go to
sleep, IRQ handling using napi_defer_hard_irqs and gro_flush_timeout
is resumed.
As long as epoll_wait is retrieving events, IRQs (and softirq
processing) for the NAPI being polled remain disabled. When network
traffic reduces, eventually a busy poll loop in the kernel will retrieve
no data. When this occurs, regular IRQ deferral using gro_flush_timeout
for the polled NAPI is re-enabled.
Unless IRQ suspension is continued by subsequent calls to epoll_wait, it
automatically times out after the irq_suspend_timeout timer expires.
Regular deferral is also immediately re-enabled when the epoll context
is destroyed.
~ Usage scenario
The target scenario for IRQ suspension as packet delivery mode is a
system that runs a dominant application with substantial network I/O.
The target application can be configured to receive input data up to a
certain batch size (via epoll_wait maxevents parameter) and this batch
size determines the worst-case latency that application requests might
experience. Because packet delivery is suspended during the target
application's processing, the batch size also determines the worst-case
latency of concurrent applications using the same RX queue(s).
gro_flush_timeout should be set as small as possible, but large enough to
make sure that a single request is likely not being interfered with.
irq_suspend_timeout is largely a safety mechanism against misbehaving
applications. It should be set large enough to cover the processing of an
entire application batch, i.e., the factor between gro_flush_timeout and
irq_suspend_timeout should roughly correspond to the maximum batch size
that the target application would process in one go.
~ Important call out in the implementation
- Enabling per epoll-context preferred busy poll will now effectively
lead to a nonblocking iteration through napi_busy_loop, even when
busy_poll_usecs is 0. See patch 4.
~ Benchmark configs & descriptions
The changes were benchmarked with memcached [3] using the benchmarking
tool mutilate [4].
To facilitate benchmarking, a small patch [5] was applied to memcached
1.6.29 to allow setting per-epoll context preferred busy poll and other
settings via environment variables. Another small patch [6] was applied
to libevent to enable full busy-polling.
Multiple scenarios were benchmarked as described below and the scripts
used for producing these results can be found on github [7] (note: all
scenarios use NAPI-based traffic splitting via SO_INCOMING_ID by passing
-N to memcached):
- base:
- no other options enabled
- deferX:
- set defer_hard_irqs to 100
- set gro_flush_timeout to X,000
- napibusy:
- set defer_hard_irqs to 100
- set gro_flush_timeout to 200,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 64,
busy_poll_budget = 64, prefer_busy_poll = true)
- fullbusy:
- set defer_hard_irqs to 100
- set gro_flush_timeout to 5,000,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 1000,
busy_poll_budget = 64, prefer_busy_poll = true)
- change memcached's nonblocking epoll_wait invocation (via
libevent) to using a 1 ms timeout
- suspend0:
- set defer_hard_irqs to 0
- set gro_flush_timeout to 0
- set irq_suspend_timeout to 20,000,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 0,
busy_poll_budget = 64, prefer_busy_poll = true)
- suspendX:
- set defer_hard_irqs to 100
- set gro_flush_timeout to X,000
- set irq_suspend_timeout to 20,000,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 0,
busy_poll_budget = 64, prefer_busy_poll = true)
~ Benchmark results
Tested on:
Single socket AMD EPYC 7662 64-Core Processor
Hyperthreading disabled
4 NUMA Zones (NPS=4)
16 CPUs per NUMA zone (64 cores total)
2 x Dual port 100gbps Mellanox Technologies ConnectX-5 Ex EN NIC
The test machine is configured such that a single interface has 8 RX
queues. The queues' IRQs and memcached are pinned to CPUs that are
NUMA-local to the interface which is under test. The NIC's interrupt
coalescing configuration is left at boot-time defaults.
Results:
Results are shown below. The mechanism added by this series is
represented by the 'suspend' cases. Data presented shows a summary over
nearly 10 runs of each test case [8] using the scripts on github [7].
For latency, the median is shown. For throughput and CPU utilization,
the average is shown.
The results also include cycles-per-query (cpq) and
instruction-per-query (ipq) metrics, following the methodology proposed
in [2], to augment the CPU utilization numbers, which could be skewed
due to frequency scaling. We find that this does not appear to be the
case as CPU utilization and low-level metrics show similar trends.
These results were captured using the scripts on github [7] to
illustrate how this approach compares with other pre-existing
mechanisms. This data is not to be interpreted as scientific data
captured in a fully isolated lab setting, but instead as best effort,
illustrative information comparing and contrasting tradeoffs.
The absolute QPS results shift between submissions, but the
relative differences are equivalent. As patches are rebased,
several factors likely influence overall performance.
Compare:
- Throughput (MAX) and latencies of base vs suspend.
- CPU usage of napibusy and fullbusy during lower load (200K, 400K for
example) vs suspend.
- Latency of the defer variants vs suspend as timeout and load
increases.
- suspend0, which sets defer_hard_irqs and gro_flush_timeout to 0, has
nearly the same performance as the base case (this is FAQ item #1).
The overall takeaway is that the suspend variants provide a superior
combination of high throughput, low latency, and low cpu utilization
compared to all other variants. Each of the suspend variants works very
well, but some fine-tuning between latency and cpu utilization is still
possible by tuning the small timeout (gro_flush_timeout).
Note: we've reorganized the results to make comparison among testcases
with the same load easier.
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 200K 199946 112 239 416 26 12973 11343
defer10 200K 199971 54 124 142 29 19412 17460
defer20 200K 199986 60 130 153 26 15644 14095
defer50 200K 200025 79 144 182 23 12122 11632
defer200 200K 199999 164 254 309 19 8923 9635
fullbusy 200K 199998 46 118 133 100 43658 23133
napibusy 200K 199983 100 237 277 56 24840 24716
suspend0 200K 200020 105 249 432 30 14264 11796
suspend10 200K 199950 53 123 141 32 19518 16903
suspend20 200K 200037 58 126 151 30 16426 14736
suspend50 200K 199961 73 136 177 26 13310 12633
suspend200 200K 199998 149 251 306 21 9566 10203
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 400K 400014 139 269 707 41 9476 9343
defer10 400K 400016 59 133 166 53 13991 12989
defer20 400K 399952 67 140 172 47 12063 11644
defer50 400K 400007 87 162 198 39 9384 9880
defer200 400K 399979 181 274 330 31 7089 8430
fullbusy 400K 399987 50 123 156 100 21827 16037
napibusy 400K 400014 76 222 272 83 18185 16529
suspend0 400K 400015 127 350 776 47 10699 9603
suspend10 400K 400023 57 129 164 54 13758 13178
suspend20 400K 400043 62 135 169 49 12071 11826
suspend50 400K 400071 76 149 186 42 10011 10301
suspend200 400K 399961 154 269 327 34 7827 8774
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 600K 599951 149 266 574 61 9265 8876
defer10 600K 600006 71 147 203 76 11866 10936
defer20 600K 600123 76 152 203 66 10430 10342
defer50 600K 600162 95 172 217 54 8526 9142
defer200 600K 599942 200 301 357 46 6977 8212
fullbusy 600K 599990 55 127 177 100 14551 13983
napibusy 600K 600035 63 160 250 96 13937 14140
suspend0 600K 599903 127 320 732 68 10166 8963
suspend10 600K 599908 63 137 192 69 10902 11100
suspend20 600K 599961 66 141 194 65 9976 10370
suspend50 600K 599973 80 159 204 57 8678 9381
suspend200 600K 600010 157 277 346 48 7133 8381
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 800K 800039 181 300 536 87 9585 8304
defer10 800K 800038 181 530 939 96 10564 8970
defer20 800K 800029 112 225 329 90 10056 8935
defer50 800K 799999 120 208 296 82 9234 8562
defer200 800K 800066 227 338 401 63 7117 8129
fullbusy 800K 800040 61 134 190 100 10913 12608
napibusy 800K 799944 64 141 214 99 10828 12588
suspend0 800K 799911 126 248 509 85 9346 8498
suspend10 800K 800006 69 143 200 83 9410 9845
suspend20 800K 800120 74 150 207 78 8786 9454
suspend50 800K 799989 87 168 224 71 7946 8833
suspend200 800K 799987 160 292 357 62 6923 8229
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 1000K 906879 4079 5751 6216 98 9496 7904
defer10 1000K 860849 3643 6274 6730 99 10040 8676
defer20 1000K 896063 3298 5840 6349 98 9620 8237
defer50 1000K 919782 2962 5513 5807 97 9284 7951
defer200 1000K 970941 3059 5348 5984 95 8593 7959
fullbusy 1000K 999950 70 150 207 100 8732 10777
napibusy 1000K 999996 78 154 223 100 8722 10656
suspend0 1000K 949706 2666 5770 6660 99 9071 8046
suspend10 1000K 1000024 80 160 220 92 8137 9035
suspend20 1000K 1000059 83 165 226 89 7850 8804
suspend50 1000K 999955 95 180 240 84 7411 8459
suspend200 1000K 999914 163 299 366 77 6833 8078
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base MAX 1037654 4184 5453 5810 100 8411 7938
defer10 MAX 905607 4840 6151 6380 100 9639 8431
defer20 MAX 986463 4455 5594 5796 100 8848 8110
defer50 MAX 1077030 4000 5073 5299 100 8104 7920
defer200 MAX 1040728 4152 5385 5765 100 8379 7849
fullbusy MAX 1247536 3518 3935 3984 100 6998 7930
napibusy MAX 1136310 3799 7756 9964 100 7670 7877
suspend0 MAX 1057509 4132 5724 6185 100 8253 7918
suspend10 MAX 1215147 3580 3957 4041 100 7185 7944
suspend20 MAX 1216469 3576 3953 3988 100 7175 7950
suspend50 MAX 1215871 3577 3961 4075 100 7181 7949
suspend200 MAX 1216882 3556 3951 3988 100 7175 7955
~ FAQ
- Why is a new parameter needed? Does irq_suspend_timeout override
gro_flush_timeout?
Using the suspend mechanism causes the system to alternate between
polling mode and irq-driven packet delivery. During busy periods,
irq_suspend_timeout overrides gro_flush_timeout and keeps the system
busy polling, but when epoll finds no events, the setting of
gro_flush_timeout and napi_defer_hard_irqs determine the next step.
There are essentially three possible loops for network processing and
packet delivery:
1) hardirq -> softirq -> napi poll; basic interrupt delivery
2) timer -> softirq -> napi poll; deferred irq processing
3) epoll -> busy-poll -> napi poll; busy looping
Loop 2 can take control from Loop 1, if gro_flush_timeout and
napi_defer_hard_irqs are set.
If gro_flush_timeout and napi_defer_hard_irqs are set, Loops 2 and
3 "wrestle" with each other for control. During busy periods,
irq_suspend_timeout is used as timer in Loop 2, which essentially
tilts this in favour of Loop 3.
If gro_flush_timeout and napi_defer_hard_irqs are not set, Loop 3
cannot take control from Loop 1.
Therefore, setting gro_flush_timeout and napi_defer_hard_irqs is the
recommended usage, because otherwise setting irq_suspend_timeout
might not have any discernible effect.
This is shown in the results above: compare suspend0 with the base
case. Note that the lack of napi_defer_hard_irqs and
gro_flush_timeout produce similar results for both, which encourages
the use of napi_defer_hard_irqs and gro_flush_timeout in addition to
irq_suspend_timeout.
- Can the new timeout value be threaded through the new epoll ioctl ?
It is possible, but presents challenges for userspace. User
applications must ensure that the file descriptors added to epoll
contexts have the same NAPI ID to support busy polling.
An epoll context is not permanently tied to any particular NAPI ID.
So, a user application could decide to clear the file descriptors
from the context and add a new set of file descriptors with a
different NAPI ID to the context. Busy polling would work as
expected, but the meaning of the suspend timeout becomes ambiguous
because IRQs are not inherently associated with epoll contexts, but
rather with the NAPI. The user program would need to reissue the
ioctl to set the irq_suspend_timeout, but the napi_defer_hard_irqs
and gro_flush_timeout settings would come from the NAPI's
napi_config (which are set either by sysfs or by netlink). Such an
interface seems awkard to use from a user perspective.
Further, IRQs are related to NAPIs, which is why they are stored in
the napi_config space. Putting the irq_suspend_timeout in
the epoll context while other IRQ deferral mechanisms remain in the
NAPI's napi_config space seems like an odd design choice.
We've opted to keep all of the IRQ deferral parameters together and
place the irq_suspend_timeout in napi_config. This has nice benefits
for userspace: if a user app were to remove all file descriptors
from an epoll context and add new file descriptors with a new NAPI ID,
the correct suspend timeout for that NAPI ID would be used automatically
without the user application needing to do anything (like re-issuing an
ioctl, for example). All IRQ deferral related parameters are in one
place and can all be set the same way: with netlink.
- Can irq suspend be built by combining NIC coalescing and
gro_flush_timeout ?
No. The problem is that the long timeout must engage if and only if
prefer-busy is active.
When using NIC coalescing for the short timeout (without
napi_defer_hard_irqs/gro_flush_timeout), an interrupt after an idle
period will trigger softirq, which will run napi polling. At this
point, prefer-busy is not active, so NIC interrupts would be
re-enabled. Then it is not possible for the longer timeout to
interject to switch control back to polling. In other words, only by
using the software timer for the short timeout, it is possible to
extend the timeout without having to reprogram the NIC timer or
reach down directly and disable interrupts.
Using gro_flush_timeout for the long timeout also has problems, for
the same underlying reason. In the current napi implementation,
gro_flush_timeout is not tied to prefer-busy. We'd either have to
change that and in the process modify the existing deferral
mechanism, or introduce a state variable to determine whether
gro_flush_timeout is used as long timeout for irq suspend or whether
it is used for its default purpose. In an earlier version, we did
try something similar to the latter and made it work, but it ends up
being a lot more convoluted than our current proposal.
- Isn't it already possible to combine busy looping with irq deferral?
Yes, in fact enabling irq deferral via napi_defer_hard_irqs and
gro_flush_timeout is a precondition for prefer_busy_poll to have an
effect. If the application also uses a tight busy loop with
essentially nonblocking epoll_wait (accomplished with a very short
timeout parameter), this is the fullbusy case shown in the results.
An application using blocking epoll_wait is shown as the napibusy
case in the results. It's a hybrid approach that provides limited
latency benefits compared to the base case and plain irq deferral,
but not as good as fullbusy or suspend.
~ Special thanks
Several people were involved in earlier stages of the development of this
mechanism whom we'd like to thank:
- Peter Cai (CC'd), for the initial kernel patch and his contributions
to the paper.
- Mohammadamin Shafie (CC'd), for testing various versions of the kernel
patch and providing helpful feedback.
Thanks,
Martin and Joe
[1]: https://lore.kernel.org/netdev/20240812125717.413108-1-jdamato@fastly.com/
[2]: https://doi.org/10.1145/3626780
[3]: https://github.com/memcached/memcached/blob/master/doc/napi_ids.txt
[4]: https://github.com/leverich/mutilate
[5]: https://raw.githubusercontent.com/martinkarsten/irqsuspend/main/patches/mem…
[6]: https://raw.githubusercontent.com/martinkarsten/irqsuspend/main/patches/lib…
[7]: https://github.com/martinkarsten/irqsuspend
[8]: https://github.com/martinkarsten/irqsuspend/tree/main/results
v9:
- Addresses Willem's feedback on the selftests in patch 5 by fixing
the SPDX-License-Identifier, moving constants into variables in the
test script, reducing code duplication, shortening long lines, and
renaming variables to be more reader friendly. In the C test file,
added a comment explaining the if def blob and changed a few types
for strtoul.
v8: https://lore.kernel.org/netdev/20241108045337.292905-1-jdamato@fastly.com/
- Update patch 2 to drop the exports, as requested by Jakub.
v7: https://lore.kernel.org/netdev/20241108023912.98416-1-jdamato@fastly.com/
- Jakub noted that patch 2 adds unnecessary complexity by checking the
suspend timeout in the NAPI loop. This makes the code more
complicated and difficult to reason about. He's right; we've dropped
patch 2 which simplifies this series.
- Updated the cover letter with a full re-run of all test cases.
- Updated FAQ #2.
v6: https://lore.kernel.org/netdev/20241104215542.215919-1-jdamato@fastly.com/
- Updated the cover letter with a full re-run of all test cases,
including a new case suspend0, as requested by Sridhar previously.
- Updated the kernel documentation in patch 7 as suggested by Bagas
Sanjaya, which improved the htmldoc output.
v5: https://lore.kernel.org/netdev/20241103052421.518856-1-jdamato@fastly.com/
- Adjusted patch 5 to only suspend IRQs when ep_send_events returns a
positive return value. This issue was pointed out by Hillf Danton.
- Updated the commit message of patch 6 which still mentioned netcat,
despite the code being updated in v4 to replace it with socat and fixed
misspelling of netdevsim.
- Fixed a minor typo in patch 7 and removed an unnecessary paragraph.
- Added Sridhar Samudrala's Reviewed-by to patch 1-5 and 7.
v4: https://lore.kernel.org/netdev/20241102005214.32443-1-jdamato@fastly.com/
- Added a new FAQ item to cover letter.
- Updated patch 6 to use socat instead of nc in busy_poll_test.sh and
updated busy_poller.c to use netlink directly to configure napi
params.
- Updated the kernel documentation in patch 7 to include more details.
- Dropped Stanislav's Acked-by and Bagas' Reviewed-by from patch 7
since the documentation was updated.
v3: https://lore.kernel.org/netdev/20241101004846.32532-1-jdamato@fastly.com/
- Added Stanislav Fomichev's Acked-by to every patch except the newly
added selftest.
- Added Bagas Sanjaya's Reviewed-by to the documentation patch.
- Fixed the commit message of patch 2 to remove a reference to the now
non-existent sysfs setting.
- Added a self test which tests both "regular" busy poll and busy poll
with suspend enabled. This was added as patch 6 as requested by
Paolo. netdevsim was chosen instead of veth due to netdevsim's
pre-existing support for netdev-genl. See the commit message of
patch 6 for more details.
v2: https://lore.kernel.org/bpf/20241021015311.95468-1-jdamato@fastly.com/
- Cover letter updated, including a re-run of test data.
- Patch 1 rewritten to use netdev-genl instead of sysfs.
- Patch 3 updated with a comment added to napi_resume_irqs.
- Patch 4 rebased to apply now that commit b9ca079dd6b0 ("eventpoll:
Annotate data-race of busy_poll_usecs") has been picked up from VFS.
- Patch 6 updated the kernel documentation.
rfc -> v1:
- Cover letter updated to include more details.
- Patch 1 updated to remove the documentation added. This was moved to
patch 6 with the rest of the docs (see below).
- Patch 5 updated to fix an error uncovered by the kernel build robot.
See patch 5's changelog for more details.
- Patch 6 added which updates kernel documentation.
Joe Damato (2):
selftests: net: Add busy_poll_test
docs: networking: Describe irq suspension
Martin Karsten (4):
net: Add napi_struct parameter irq_suspend_timeout
net: Add control functions for irq suspension
eventpoll: Trigger napi_busy_loop, if prefer_busy_poll is set
eventpoll: Control irq suspension for prefer_busy_poll
Documentation/netlink/specs/netdev.yaml | 7 +
Documentation/networking/napi.rst | 170 ++++++++-
fs/eventpoll.c | 36 +-
include/linux/netdevice.h | 2 +
include/net/busy_poll.h | 3 +
include/uapi/linux/netdev.h | 1 +
net/core/dev.c | 39 ++
net/core/dev.h | 25 ++
net/core/netdev-genl-gen.c | 5 +-
net/core/netdev-genl.c | 12 +
tools/include/uapi/linux/netdev.h | 1 +
tools/testing/selftests/net/.gitignore | 1 +
tools/testing/selftests/net/Makefile | 3 +-
tools/testing/selftests/net/busy_poll_test.sh | 165 +++++++++
tools/testing/selftests/net/busy_poller.c | 346 ++++++++++++++++++
15 files changed, 809 insertions(+), 7 deletions(-)
create mode 100755 tools/testing/selftests/net/busy_poll_test.sh
create mode 100644 tools/testing/selftests/net/busy_poller.c
base-commit: dc7c381bb8649e3701ed64f6c3e55316675904d7
--
2.25.1
Currently, the test breaks if the SUT already has a default route
configured for IPv6. Fix by adding "metric 9999" to the `ip -6 ro add`
command, so that multiple default routes can coexist.
Fixes: 4ed591c8ab44 ("net/ipv6: Allow onlink routes to have a device mismatch if it is the default route")
Signed-off-by: Ricardo B. Marlière <rbm(a)suse.com>
---
tools/testing/selftests/net/fib-onlink-tests.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh
index ec2d6ceb1f08..acf6b0617373 100755
--- a/tools/testing/selftests/net/fib-onlink-tests.sh
+++ b/tools/testing/selftests/net/fib-onlink-tests.sh
@@ -207,7 +207,7 @@ setup()
ip -netns ${PEER_NS} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad
done
- ip -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64}
+ ip -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64} metric 9999
ip -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64}
set +e
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20251218-rbm-selftests-net-fib-onlink-873ad01e6884
Best regards,
--
Ricardo B. Marlière <rbm(a)suse.com>
The top level kselftest Makefile supports an option FORCE_TARGETS which
causes any failures during the build to be propagated to the exit status
of the top level make, useful during build testing. Currently the recursion
done by the arm64 selftests ignores this option, meaning arm64 failures are
not reported via this mechanism. Add the logic to implement FORCE_TARGETS
so that it works for arm64.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/arm64/Makefile | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index c4c72ee2ef55..e456f3b62fa1 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -30,13 +30,15 @@ all:
@for DIR in $(ARM64_SUBTARGETS); do \
BUILD_TARGET=$(OUTPUT)/$$DIR; \
mkdir -p $$BUILD_TARGET; \
- make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \
+ make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \
+ $(if $(FORCE_TARGETS),|| exit); \
done
install: all
@for DIR in $(ARM64_SUBTARGETS); do \
BUILD_TARGET=$(OUTPUT)/$$DIR; \
- make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \
+ make OUTPUT=$$BUILD_TARGET -C $$DIR $@ \
+ $(if $(FORCE_TARGETS),|| exit); \
done
run_tests: all
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20251219-kselftest-arm64-force-targets-2ba8cebd1748
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Much work has recently gone into supporting block device integrity data
(sometimes called "metadata") in Linux. Many NVMe devices these days
support metadata transfers and/or automatic protection information
generation and verification. However, ublk devices can't yet advertise
integrity data capabilities. This patch series wires up support for
integrity data in ublk. The ublk feature is referred to as "integrity"
rather than "metadata" to match the block layer's name for it and to
avoid confusion with the existing and unrelated UBLK_IO_F_META.
To advertise support for integrity data, a ublk server fills out the
struct ublk_params's integrity field and sets UBLK_PARAM_TYPE_INTEGRITY.
The struct ublk_param_integrity flags and csum_type fields use the
existing LBMD_PI_* constants from the linux/fs.h UAPI header. The ublk
driver fills out a corresponding struct blk_integrity.
When a request with integrity data is issued to the ublk device, the
ublk driver sets UBLK_IO_F_INTEGRITY in struct ublksrv_io_desc's
op_flags field. This is necessary for a ublk server for which
bi_offload_capable() returns true to distinguish requests with integrity
data from those without.
Integrity data transfers can currently only be performed via the ublk
user copy mechanism. The overhead of zero-copy buffer registration makes
it less appealing for the small transfers typical of integrity data.
Additionally, neither io_uring NVMe passthru nor IORING_RW_ATTR_FLAG_PI
currently allow an io_uring registered buffer for the integrity data.
The ki_pos field of the struct kiocb passed to the user copy
->{read,write}_iter() callback gains a bit UBLKSRV_IO_INTEGRITY_FLAG for
a ublk server to indicate whether to access the request's data or
integrity data.
Not yet supported is an analogue for the IO_INTEGRITY_CHK_*/BIP_CHECK_*
flags to ask the ublk server to verify the guard, reftag, and/or apptag
of a request's protection information. The user copy mechanism currently
forbids a ublk server from reading the data/integrity buffer of a
read-direction request. We could potentially relax this restriction for
integrity data on reads. Alternatively, the ublk driver could verify the
requested fields as part of the user copy operation.
The first 2 commits harden blk_validate_integrity_limits() to reject
nonsensical pi_offset and interval_exp integrity limits.
Caleb Sander Mateos (17):
block: validate pi_offset integrity limit
block: validate interval_exp integrity limit
blk-integrity: take const pointer in blk_integrity_rq()
ublk: move ublk flag check functions earlier
ublk: set UBLK_IO_F_INTEGRITY in ublksrv_io_desc
ublk: add ublk_copy_user_bvec() helper
ublk: split out ublk_user_copy() helper
ublk: inline ublk_check_and_get_req() into ublk_user_copy()
ublk: move offset check out of __ublk_check_and_get_req()
ublk: optimize ublk_user_copy() on daemon task
selftests: ublk: add utility to get block device metadata size
selftests: ublk: add kublk support for integrity params
selftests: ublk: implement integrity user copy in kublk
selftests: ublk: support non-O_DIRECT backing files
selftests: ublk: add integrity data support to loop target
selftests: ublk: add integrity params test
selftests: ublk: add end-to-end integrity test
Stanley Zhang (3):
ublk: add integrity UAPI
ublk: support UBLK_PARAM_TYPE_INTEGRITY in device creation
ublk: implement integrity user copy
block/blk-settings.c | 14 +-
drivers/block/ublk_drv.c | 336 +++++++++++++------
include/linux/blk-integrity.h | 6 +-
include/uapi/linux/ublk_cmd.h | 20 +-
tools/testing/selftests/ublk/Makefile | 6 +-
tools/testing/selftests/ublk/common.c | 4 +-
tools/testing/selftests/ublk/fault_inject.c | 1 +
tools/testing/selftests/ublk/file_backed.c | 61 +++-
tools/testing/selftests/ublk/kublk.c | 85 ++++-
tools/testing/selftests/ublk/kublk.h | 37 +-
tools/testing/selftests/ublk/metadata_size.c | 37 ++
tools/testing/selftests/ublk/null.c | 1 +
tools/testing/selftests/ublk/stripe.c | 6 +-
tools/testing/selftests/ublk/test_common.sh | 10 +
tools/testing/selftests/ublk/test_loop_08.sh | 111 ++++++
tools/testing/selftests/ublk/test_null_04.sh | 166 +++++++++
16 files changed, 765 insertions(+), 136 deletions(-)
create mode 100644 tools/testing/selftests/ublk/metadata_size.c
create mode 100755 tools/testing/selftests/ublk/test_loop_08.sh
create mode 100755 tools/testing/selftests/ublk/test_null_04.sh
--
2.45.2
Note: this requires INPUT_PROP_PRESSUREPAD [1] which is not yet
available in Linus' tree but it is in Dmitry's for-linus tree.
Nicely enough MS defines a button type for a pressurepad touchpad [2]
and it looks like most touchpad vendors fill this in.
The selftests require a bit of prep work (and a hack for the test
itself) - hidtools 0.12 requires python-libevdev 0.13 which in turn
provides constructors for unknown properties.
[1] https://lore.kernel.org/linux-input/20251030011735.GA969565@quokka/T/#m9d9b…
[2] https://learn.microsoft.com/en-us/windows-hardware/design/component-guideli…
Signed-off-by: Peter Hutterer <peter.hutterer(a)who-t.net>
---
Peter Hutterer (3):
selftests/hid: require hidtools 0.12
selftests/hid: use a enum class for the different button types
HID: multitouch: set INPUT_PROP_PRESSUREPAD based on Digitizer/Button Type
drivers/hid/hid-multitouch.c | 12 ++++-
tools/testing/selftests/hid/tests/conftest.py | 14 +++++
.../testing/selftests/hid/tests/test_multitouch.py | 61 +++++++++++++++++-----
3 files changed, 73 insertions(+), 14 deletions(-)
---
base-commit: 2bc4c50a42f8b83f611d0475598dc72740e87640
change-id: 20251111-wip-hid-pressurepad-8a800cdf1813
Best regards,
--
Peter Hutterer <peter.hutterer(a)who-t.net>
From: Gary Guo <gary(a)garyguo.net>
When the `#![allow]` line was added, the doctest line number anchor
isn't updated which causes the line number printed in kunit test to be
off-by-one.
Fixes: ab844cf32058 ("rust: allow `unreachable_pub` for doctests")
Signed-off-by: Gary Guo <gary(a)garyguo.net>
---
scripts/rustdoc_test_gen.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/rustdoc_test_gen.rs b/scripts/rustdoc_test_gen.rs
index be05610496605..6fd9f5c84e2e4 100644
--- a/scripts/rustdoc_test_gen.rs
+++ b/scripts/rustdoc_test_gen.rs
@@ -206,7 +206,7 @@ macro_rules! assert_eq {{
/// The anchor where the test code body starts.
#[allow(unused)]
- static __DOCTEST_ANCHOR: i32 = ::core::line!() as i32 + {body_offset} + 1;
+ static __DOCTEST_ANCHOR: i32 = ::core::line!() as i32 + {body_offset} + 2;
{{
#![allow(unreachable_pub, clippy::disallowed_names)]
{body}
base-commit: 559e608c46553c107dbba19dae0854af7b219400
--
2.51.2
The rust doctests are numbered -- instead of named with the line number
-- in order to keep them moderately consistent even as the source file
changes.
However, the test numbers are generated by sorting the file/line
strings, and so the line numbers were sorted as strings, not integers.
So, for instance, a test on line 7 would sort in-between one on line 65
and one on line 75.
Instead, parse the numbers as an integer, and sort based on that. This
is a bit slower, uglier, and will break things once, but I suspect is
worth it (at least until we have a better solution).
Signed-off-by: David Gow <davidgow(a)google.com>
---
This is a pretty unpolished, likely-unidiomatic patch to work around the
test numbering being horrible.
I have three questions before I decide if this is worth continuing with:
1. Is it worth renumbering all of the tests (hopefully just once), or
would that break too many people's test histories?
2. Is there a better way of doing this in Rust? I can think of ways
which might be nicer if the whole thing is refactored somewhat
seriously, but if there's an easy numeric sort on strings, that'd be
much easier.
3. Should we wait until after all or some of the changes to the test
generation? Does the new --output-format=doctest option make this
easier/harder/different?
Does anyone have opinions/advice on those (or, indeed, on anything
else)?
Cheers,
-- David
---
scripts/rustdoc_test_gen.rs | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/scripts/rustdoc_test_gen.rs b/scripts/rustdoc_test_gen.rs
index be0561049660..60b0bbfb1896 100644
--- a/scripts/rustdoc_test_gen.rs
+++ b/scripts/rustdoc_test_gen.rs
@@ -116,7 +116,19 @@ fn main() {
.collect::<Vec<_>>();
// Sort paths.
- paths.sort();
+ paths.sort_by(|a, b|{
+ let a_name = a.file_name().unwrap().to_str().unwrap().to_string();
+ let (a_file, a_line) = a_name.rsplit_once('_').unwrap().0.rsplit_once('_').unwrap();
+ let a_line_no = a_line.parse::<u64>().unwrap();
+ let b_name = b.file_name().unwrap().to_str().unwrap().to_string();
+ let (b_file, b_line) = b_name.rsplit_once('_').unwrap().0.rsplit_once('_').unwrap();
+ let b_line_no = b_line.parse::<u64>().unwrap();
+
+ match a_file.cmp(b_file) {
+ std::cmp::Ordering::Equal => a_line_no.cmp(&b_line_no),
+ order => order,
+ }
+ });
let mut rust_tests = String::new();
let mut c_test_declarations = String::new();
--
2.52.0.322.g1dd061c0dc-goog
The kunit_run_irq_test() helper allows a function to be run in hardirq
and softirq contexts (in addition to the task context). It does this by
running the user-provided function concurrently in the three contexts,
until either a timeout has expired or a number of iterations have
completed in the normal task context.
However, on setups where the initialisation of the hardirq and softirq
contexts (or, indeed, the scheduling of those tasks) is significantly
slower than the function execution, it's possible for that number of
iterations to be exceeded before any runs in irq contexts actually
occur. This occurs with the polyval.test_polyval_preparekey_in_irqs
test, which runs 20000 iterations of the relatively fast preparekey
function, and therefore fails often under many UML, 32-bit arm, m68k and
other environments.
Instead, ensure that the max_iterations limit counts executions in all
three contexts, and requires at least one of each. This will cause the
test to continue iterating until at least the irq contexts have been
tested, or the 1s wall-clock limit has been exceeded. This causes the
test to pass in all of my environments.
In so doing, we also update the task counters to atomic ints, to better
match both the 'int' max_iterations input, and to ensure they are
correctly updated across contexts.
Finally, we also fix a few potential assertion messages to be
less-specific to the original crypto usecases.
Fixes: b41dc83f0790 ("kunit, lib/crypto: Move run_irq_test() to common header")
Signed-off-by: David Gow <davidgow(a)google.com>
---
include/kunit/run-in-irq-context.h | 41 ++++++++++++++++++++----------
1 file changed, 28 insertions(+), 13 deletions(-)
diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h
index 108e96433ea4..4d25aee0de6e 100644
--- a/include/kunit/run-in-irq-context.h
+++ b/include/kunit/run-in-irq-context.h
@@ -20,8 +20,8 @@ struct kunit_irq_test_state {
bool task_func_reported_failure;
bool hardirq_func_reported_failure;
bool softirq_func_reported_failure;
- unsigned long hardirq_func_calls;
- unsigned long softirq_func_calls;
+ atomic_t hardirq_func_calls;
+ atomic_t softirq_func_calls;
struct hrtimer timer;
struct work_struct bh_work;
};
@@ -32,7 +32,7 @@ static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer)
container_of(timer, typeof(*state), timer);
WARN_ON_ONCE(!in_hardirq());
- state->hardirq_func_calls++;
+ atomic_inc(&state->hardirq_func_calls);
if (!state->func(state->test_specific_state))
state->hardirq_func_reported_failure = true;
@@ -48,7 +48,7 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work)
container_of(work, typeof(*state), bh_work);
WARN_ON_ONCE(!in_serving_softirq());
- state->softirq_func_calls++;
+ atomic_inc(&state->softirq_func_calls);
if (!state->func(state->test_specific_state))
state->softirq_func_reported_failure = true;
@@ -59,7 +59,10 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work)
* hardirq context concurrently, and reports a failure to KUnit if any
* invocation of @func in any context returns false. @func is passed
* @test_specific_state as its argument. At most 3 invocations of @func will
- * run concurrently: one in each of task, softirq, and hardirq context.
+ * run concurrently: one in each of task, softirq, and hardirq context. @func
+ * will continue running until either @max_iterations calls have been made (so
+ * long as at least one each runs in task, softirq, and hardirq contexts), or
+ * one second has passed.
*
* The main purpose of this interrupt context testing is to validate fallback
* code paths that run in contexts where the normal code path cannot be used,
@@ -85,6 +88,10 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
.test_specific_state = test_specific_state,
};
unsigned long end_jiffies;
+ int hardirq_calls, softirq_calls;
+ bool allctx = false;
+
+ max_iterations = 1;
/*
* Set up a hrtimer (the way we access hardirq context) and a work
@@ -94,14 +101,22 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func);
- /* Run for up to max_iterations or 1 second, whichever comes first. */
+ /* Run for up to max_iterations (including at least one task, softirq,
+ * and hardirq), or 1 second, whichever comes first.
+ */
end_jiffies = jiffies + HZ;
hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL,
HRTIMER_MODE_REL_HARD);
- for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies);
- i++) {
+ for (int task_calls = 0, calls = 0;
+ ((calls < max_iterations) || !allctx) && !time_after(jiffies, end_jiffies);
+ task_calls++) {
if (!func(test_specific_state))
state.task_func_reported_failure = true;
+
+ hardirq_calls = atomic_read(&state.hardirq_func_calls);
+ softirq_calls = atomic_read(&state.softirq_func_calls);
+ calls = task_calls + hardirq_calls + softirq_calls;
+ allctx = (task_calls > 0) && (hardirq_calls > 0) && (softirq_calls > 0);
}
/* Cancel the timer and work. */
@@ -109,21 +124,21 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
flush_work(&state.bh_work);
/* Sanity check: the timer and BH functions should have been run. */
- KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0,
+ KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.hardirq_func_calls), 0,
"Timer function was not called");
- KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0,
+ KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.softirq_func_calls), 0,
"BH work function was not called");
/* Check for incorrect hash values reported from any context. */
KUNIT_EXPECT_FALSE_MSG(
test, state.task_func_reported_failure,
- "Incorrect hash values reported from task context");
+ "Failure reported from task context");
KUNIT_EXPECT_FALSE_MSG(
test, state.hardirq_func_reported_failure,
- "Incorrect hash values reported from hardirq context");
+ "Failure reported from hardirq context");
KUNIT_EXPECT_FALSE_MSG(
test, state.softirq_func_reported_failure,
- "Incorrect hash values reported from softirq context");
+ "Failure reported from softirq context");
}
#endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */
--
2.52.0.322.g1dd061c0dc-goog
A few improvements/fixes for the mm kselftests:
- Patch 1-2 extend support for more build configurations: out-of-tree
$KDIR, cross-compilation, etc.
- Patch 3-4 fix issues in the pagemap_ioctl tests, most importantly that
it does not report failures: ./run_kselftests.sh would report OK
even if some pagemap_ioctl tests fail. That's probably why the issue
in patch 3 went unnoticed.
---
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: David Hildenbrand <david(a)kernel.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shuah Khan <shuah(a)kernel.org>
---
Kevin Brodsky (4):
selftests/mm: remove flaky header check
selftests/mm: pass down full CC and CFLAGS to check_config.sh
selftests/mm: fix faulting-in code in pagemap_ioctl test
selftests/mm: fix exit code in pagemap_ioctl
tools/testing/selftests/mm/Makefile | 6 +-----
tools/testing/selftests/mm/check_config.sh | 3 +--
tools/testing/selftests/mm/pagemap_ioctl.c | 12 ++++++------
3 files changed, 8 insertions(+), 13 deletions(-)
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
--
2.51.2
The checksum_32 code was originally written to only handle 2-byte
aligned buffers, but was later extended to support arbitrary alignment.
However, the non-PPro variant doesn't apply the carry before jumping to
the 2- or 4-byte aligned versions, which clear CF.
This causes the new checksum_kunit test to fail, as it runs with a large
number of different possible alignments and both with and without
carries.
For example:
./tools/testing/kunit/kunit.py run --arch i386 --kconfig_add CONFIG_M486=y checksum
Gives:
KTAP version 1
# Subtest: checksum
1..3
ok 1 test_csum_fixed_random_inputs
# test_csum_all_carry_inputs: ASSERTION FAILED at lib/checksum_kunit.c:267
Expected result == expec, but
result == 65281 (0xff01)
expec == 65280 (0xff00)
not ok 2 test_csum_all_carry_inputs
# test_csum_no_carry_inputs: ASSERTION FAILED at lib/checksum_kunit.c:314
Expected result == expec, but
result == 65535 (0xffff)
expec == 65534 (0xfffe)
not ok 3 test_csum_no_carry_inputs
With this patch, it passes.
KTAP version 1
# Subtest: checksum
1..3
ok 1 test_csum_fixed_random_inputs
ok 2 test_csum_all_carry_inputs
ok 3 test_csum_no_carry_inputs
I also tested it on a real 486DX2, with the same results.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: David Gow <davidgow(a)google.com>
---
Re-sending this from [1]. While there's an argument that the whole
32-bit checksum code could do with rewriting, it's:
(a) worth fixing before someone takes the time to rewrite it, and
(b) worth any future rewrite starting from a point where the tests pass
I don't think there should be any downside to this fix: it only affects
ancient computers, and adds a single instruction which isn't in a loop.
Cheers,
-- David
[1]: https://lore.kernel.org/lkml/20230704083206.693155-2-davidgow@google.com/
---
arch/x86/lib/checksum_32.S | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 68f7fa3e1322..a5123b29b403 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -62,6 +62,7 @@ SYM_FUNC_START(csum_partial)
jl 8f
movzbl (%esi), %ebx
adcl %ebx, %eax
+ adcl $0, %eax
roll $8, %eax
inc %esi
testl $2, %esi
--
2.45.2.1089.g2a221341d9-goog
This introduces signal->exec_bprm, which is used to
fix the case when at least one of the sibling threads
is traced, and therefore the trace process may dead-lock
in ptrace_attach, but de_thread will need to wait for the
tracer to continue execution.
The solution is to detect this situation and allow
ptrace_attach to continue by temporarily releasing the
cred_guard_mutex, while de_thread() is still waiting for
traced zombies to be eventually released by the tracer.
In the case of the thread group leader we only have to wait
for the thread to become a zombie, which may also need
co-operation from the tracer due to PTRACE_O_TRACEEXIT.
When a tracer wants to ptrace_attach a task that already
is in execve, we simply retry the ptrace_may_access
check while temporarily installing the new credentials
and dumpability which are about to be used after execve
completes. If the ptrace_attach happens on a thread that
is a sibling-thread of the thread doing execve, it is
sufficient to check against the old credentials, as this
thread will be waited for, before the new credentials are
installed.
Other threads die quickly since the cred_guard_mutex is
released, but a deadly signal is already pending. In case
the mutex_lock_killable misses the signal, the non-zero
current->signal->exec_bprm makes sure they release the
mutex immediately and return with -ERESTARTNOINTR.
This means there is no API change, unlike the previous
version of this patch which was discussed here:
https://lore.kernel.org/lkml/b6537ae6-31b1-5c50-f32b-8b8332ace882@hotmail.d…
See tools/testing/selftests/ptrace/vmaccess.c
for a test case that gets fixed by this change.
Note that since the test case was originally designed to
test the ptrace_attach returning an error in this situation,
the test expectation needed to be adjusted, to allow the
API to succeed at the first attempt.
Signed-off-by: Bernd Edlinger <bernd.edlinger(a)hotmail.de>
---
fs/exec.c | 69 ++++++++++++++++-------
fs/proc/base.c | 6 ++
include/linux/cred.h | 1 +
include/linux/sched/signal.h | 18 ++++++
kernel/cred.c | 28 +++++++--
kernel/ptrace.c | 32 +++++++++++
kernel/seccomp.c | 12 +++-
tools/testing/selftests/ptrace/vmaccess.c | 23 +++++---
8 files changed, 155 insertions(+), 34 deletions(-)
v10: Changes to previous version, make the PTRACE_ATTACH
retun -EAGAIN, instead of execve return -ERESTARTSYS.
Added some lessions learned to the description.
v11: Check old and new credentials in PTRACE_ATTACH again without
changing the API.
Note: I got actually one response from an automatic checker to the v11 patch,
https://lore.kernel.org/lkml/202107121344.wu68hEPF-lkp@intel.com/
which is complaining about:
>> kernel/ptrace.c:425:26: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct cred const *old_cred @@ got struct cred const [noderef] __rcu *real_cred @@
417 struct linux_binprm *bprm = task->signal->exec_bprm;
418 const struct cred *old_cred;
419 struct mm_struct *old_mm;
420
421 retval = down_write_killable(&task->signal->exec_update_lock);
422 if (retval)
423 goto unlock_creds;
424 task_lock(task);
> 425 old_cred = task->real_cred;
v12: Essentially identical to v11.
- Fixed a minor merge conflict in linux v5.17, and fixed the
above mentioned nit by adding __rcu to the declaration.
- re-tested the patch with all linux versions from v5.11 to v6.6
v10 was an alternative approach which did imply an API change.
But I would prefer to avoid such an API change.
The difficult part is getting the right dumpability flags assigned
before de_thread starts, hope you like this version.
If not, the v10 is of course also acceptable.
Thanks
Bernd.
diff --git a/fs/exec.c b/fs/exec.c
index 2f2b0acec4f0..902d3b230485 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1041,11 +1041,13 @@ static int exec_mmap(struct mm_struct *mm)
return 0;
}
-static int de_thread(struct task_struct *tsk)
+static int de_thread(struct task_struct *tsk, struct linux_binprm *bprm)
{
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
+ struct task_struct *t = tsk;
+ bool unsafe_execve_in_progress = false;
if (thread_group_empty(tsk))
goto no_thread_group;
@@ -1068,6 +1070,19 @@ static int de_thread(struct task_struct *tsk)
if (!thread_group_leader(tsk))
sig->notify_count--;
+ while_each_thread(tsk, t) {
+ if (unlikely(t->ptrace)
+ && (t != tsk->group_leader || !t->exit_state))
+ unsafe_execve_in_progress = true;
+ }
+
+ if (unlikely(unsafe_execve_in_progress)) {
+ spin_unlock_irq(lock);
+ sig->exec_bprm = bprm;
+ mutex_unlock(&sig->cred_guard_mutex);
+ spin_lock_irq(lock);
+ }
+
while (sig->notify_count) {
__set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
@@ -1158,6 +1173,11 @@ static int de_thread(struct task_struct *tsk)
release_task(leader);
}
+ if (unlikely(unsafe_execve_in_progress)) {
+ mutex_lock(&sig->cred_guard_mutex);
+ sig->exec_bprm = NULL;
+ }
+
sig->group_exec_task = NULL;
sig->notify_count = 0;
@@ -1169,6 +1189,11 @@ static int de_thread(struct task_struct *tsk)
return 0;
killed:
+ if (unlikely(unsafe_execve_in_progress)) {
+ mutex_lock(&sig->cred_guard_mutex);
+ sig->exec_bprm = NULL;
+ }
+
/* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
sig->group_exec_task = NULL;
@@ -1253,6 +1278,24 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
return retval;
+ /* If the binary is not readable then enforce mm->dumpable=0 */
+ would_dump(bprm, bprm->file);
+ if (bprm->have_execfd)
+ would_dump(bprm, bprm->executable);
+
+ /*
+ * Figure out dumpability. Note that this checking only of current
+ * is wrong, but userspace depends on it. This should be testing
+ * bprm->secureexec instead.
+ */
+ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
+ is_dumpability_changed(current_cred(), bprm->cred) ||
+ !(uid_eq(current_euid(), current_uid()) &&
+ gid_eq(current_egid(), current_gid())))
+ set_dumpable(bprm->mm, suid_dumpable);
+ else
+ set_dumpable(bprm->mm, SUID_DUMP_USER);
+
/*
* Ensure all future errors are fatal.
*/
@@ -1261,7 +1304,7 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Make this the only thread in the thread group.
*/
- retval = de_thread(me);
+ retval = de_thread(me, bprm);
if (retval)
goto out;
@@ -1284,11 +1327,6 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out;
- /* If the binary is not readable then enforce mm->dumpable=0 */
- would_dump(bprm, bprm->file);
- if (bprm->have_execfd)
- would_dump(bprm, bprm->executable);
-
/*
* Release all of the old mmap stuff
*/
@@ -1350,18 +1388,6 @@ int begin_new_exec(struct linux_binprm * bprm)
me->sas_ss_sp = me->sas_ss_size = 0;
- /*
- * Figure out dumpability. Note that this checking only of current
- * is wrong, but userspace depends on it. This should be testing
- * bprm->secureexec instead.
- */
- if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
- !(uid_eq(current_euid(), current_uid()) &&
- gid_eq(current_egid(), current_gid())))
- set_dumpable(current->mm, suid_dumpable);
- else
- set_dumpable(current->mm, SUID_DUMP_USER);
-
perf_event_exec();
__set_task_comm(me, kbasename(bprm->filename), true);
@@ -1480,6 +1506,11 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
return -ERESTARTNOINTR;
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
bprm->cred = prepare_exec_creds();
if (likely(bprm->cred))
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..0da9adfadb48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2788,6 +2788,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ rv = -ERESTARTNOINTR;
+ goto out_free;
+ }
+
rv = security_setprocattr(PROC_I(inode)->op.lsm,
file->f_path.dentry->d_name.name, page,
count);
diff --git a/include/linux/cred.h b/include/linux/cred.h
index f923528d5cc4..b01e309f5686 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -159,6 +159,7 @@ extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
+extern bool is_dumpability_changed(const struct cred *, const struct cred *);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 0014d3adaf84..14df7073a0a8 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -234,9 +234,27 @@ struct signal_struct {
struct mm_struct *oom_mm; /* recorded mm when the thread group got
* killed by the oom killer */
+ struct linux_binprm *exec_bprm; /* Used to check ptrace_may_access
+ * against new credentials while
+ * de_thread is waiting for other
+ * traced threads to terminate.
+ * Set while de_thread is executing.
+ * The cred_guard_mutex is released
+ * after de_thread() has called
+ * zap_other_threads(), therefore
+ * a fatal signal is guaranteed to be
+ * already pending in the unlikely
+ * event, that
+ * current->signal->exec_bprm happens
+ * to be non-zero after the
+ * cred_guard_mutex was acquired.
+ */
+
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace)
+ * Held while execve runs, except when
+ * a sibling thread is being traced.
* Deprecated do not use in new code.
* Use exec_update_lock instead.
*/
diff --git a/kernel/cred.c b/kernel/cred.c
index 98cb4eca23fb..586cb6c7cf6b 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -433,6 +433,28 @@ static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
return false;
}
+/**
+ * is_dumpability_changed - Will changing creds from old to new
+ * affect the dumpability in commit_creds?
+ *
+ * Return: false - dumpability will not be changed in commit_creds.
+ * Return: true - dumpability will be changed to non-dumpable.
+ *
+ * @old: The old credentials
+ * @new: The new credentials
+ */
+bool is_dumpability_changed(const struct cred *old, const struct cred *new)
+{
+ if (!uid_eq(old->euid, new->euid) ||
+ !gid_eq(old->egid, new->egid) ||
+ !uid_eq(old->fsuid, new->fsuid) ||
+ !gid_eq(old->fsgid, new->fsgid) ||
+ !cred_cap_issubset(old, new))
+ return true;
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -467,11 +489,7 @@ int commit_creds(struct cred *new)
get_cred(new); /* we will require a ref for the subj creds too */
/* dumpability changes */
- if (!uid_eq(old->euid, new->euid) ||
- !gid_eq(old->egid, new->egid) ||
- !uid_eq(old->fsuid, new->fsuid) ||
- !gid_eq(old->fsgid, new->fsgid) ||
- !cred_cap_issubset(old, new)) {
+ if (is_dumpability_changed(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 443057bee87c..eb1c450bb7d7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -20,6 +20,7 @@
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/security.h>
+#include <linux/binfmts.h>
#include <linux/signal.h>
#include <linux/uio.h>
#include <linux/audit.h>
@@ -435,6 +436,28 @@ static int ptrace_attach(struct task_struct *task, long request,
if (retval)
goto unlock_creds;
+ if (unlikely(task->in_execve)) {
+ struct linux_binprm *bprm = task->signal->exec_bprm;
+ const struct cred __rcu *old_cred;
+ struct mm_struct *old_mm;
+
+ retval = down_write_killable(&task->signal->exec_update_lock);
+ if (retval)
+ goto unlock_creds;
+ task_lock(task);
+ old_cred = task->real_cred;
+ old_mm = task->mm;
+ rcu_assign_pointer(task->real_cred, bprm->cred);
+ task->mm = bprm->mm;
+ retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
+ rcu_assign_pointer(task->real_cred, old_cred);
+ task->mm = old_mm;
+ task_unlock(task);
+ up_write(&task->signal->exec_update_lock);
+ if (retval)
+ goto unlock_creds;
+ }
+
write_lock_irq(&tasklist_lock);
retval = -EPERM;
if (unlikely(task->exit_state))
@@ -508,6 +531,14 @@ static int ptrace_traceme(void)
{
int ret = -EPERM;
+ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
+ return -ERESTARTNOINTR;
+
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ return -ERESTARTNOINTR;
+ }
+
write_lock_irq(&tasklist_lock);
/* Are we already being traced? */
if (!current->ptrace) {
@@ -523,6 +554,7 @@ static int ptrace_traceme(void)
}
}
write_unlock_irq(&tasklist_lock);
+ mutex_unlock(¤t->signal->cred_guard_mutex);
return ret;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 255999ba9190..b29bbfa0b044 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1955,9 +1955,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
* Make sure we cannot change seccomp or nnp state via TSYNC
* while another thread is in the middle of calling exec.
*/
- if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
- mutex_lock_killable(¤t->signal->cred_guard_mutex))
- goto out_put_fd;
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ if (mutex_lock_killable(¤t->signal->cred_guard_mutex))
+ goto out_put_fd;
+
+ if (unlikely(current->signal->exec_bprm)) {
+ mutex_unlock(¤t->signal->cred_guard_mutex);
+ goto out_put_fd;
+ }
+ }
spin_lock_irq(¤t->sighand->siglock);
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
index 4db327b44586..3b7d81fb99bb 100644
--- a/tools/testing/selftests/ptrace/vmaccess.c
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -39,8 +39,15 @@ TEST(vmaccess)
f = open(mm, O_RDONLY);
ASSERT_GE(f, 0);
close(f);
- f = kill(pid, SIGCONT);
- ASSERT_EQ(f, 0);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_NE(f, -1);
+ ASSERT_NE(f, 0);
+ ASSERT_NE(f, pid);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_EQ(f, pid);
+ f = waitpid(-1, NULL, 0);
+ ASSERT_EQ(f, -1);
+ ASSERT_EQ(errno, ECHILD);
}
TEST(attach)
@@ -57,22 +64,24 @@ TEST(attach)
sleep(1);
k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
- ASSERT_EQ(errno, EAGAIN);
- ASSERT_EQ(k, -1);
+ ASSERT_EQ(k, 0);
k = waitpid(-1, &s, WNOHANG);
ASSERT_NE(k, -1);
ASSERT_NE(k, 0);
ASSERT_NE(k, pid);
ASSERT_EQ(WIFEXITED(s), 1);
ASSERT_EQ(WEXITSTATUS(s), 0);
- sleep(1);
- k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+ k = waitpid(-1, &s, 0);
+ ASSERT_EQ(k, pid);
+ ASSERT_EQ(WIFSTOPPED(s), 1);
+ ASSERT_EQ(WSTOPSIG(s), SIGTRAP);
+ k = ptrace(PTRACE_CONT, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
ASSERT_EQ(WIFSTOPPED(s), 1);
ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
- k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
+ k = ptrace(PTRACE_CONT, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
--
2.39.2
[Joerg, can you put this and vtd in linux-next please. The vtd series is still
good at v3 thanks]
Currently each of the iommu page table formats duplicates all of the logic
to maintain the page table and perform map/unmap/etc operations. There are
several different versions of the algorithms between all the different
formats. The io-pgtable system provides an interface to help isolate the
page table code from the iommu driver, but doesn't provide tools to
implement the common algorithms.
This makes it very hard to improve the state of the pagetable code under
the iommu domains as any proposed improvement needs to alter a large
number of different driver code paths. Combined with a lack of software
based testing this makes improvement in this area very hard.
iommufd wants several new page table operations:
- More efficient map/unmap operations, using iommufd's batching logic
- unmap that returns the physical addresses into a batch as it progresses
- cut that allows splitting areas so large pages can have holes
poked in them dynamically (ie guestmemfd hitless shared/private
transitions)
- More agressive freeing of table memory to avoid waste
- Fragmenting large pages so that dirty tracking can be more granular
- Reassembling large pages so that VMs can run at full IO performance
in migration/dirty tracking error flows
- KHO integration for kernel live upgrade
Together these are algorithmically complex enough to be a very significant
task to go and implement in all the page table formats we support. Just
the "server" focused drivers use almost all the formats (ARMv8 S1&S2 / x86
PAE / AMDv1 / VT-d SS / RISCV)
Instead of doing the duplicated work, this series takes the first step to
consolidate the algorithms into one places. In spirit it is similar to the
work Christoph did a few years back to pull the redundant get_user_pages()
implementations out of the arch code into core MM. This unlocked a great
deal of improvement in that space in the following years. I would like to
see the same benefit in iommu as well.
My first RFC showed a bigger picture with all most all formats and more
algorithms. This series reorganizes that to be narrowly focused on just
enough to convert the AMD driver to use the new mechanism.
kunit tests are provided that allow good testing of the algorithms and all
formats on x86, nothing is arch specific.
AMD is one of the simpler options as the HW is quite uniform with few
different options/bugs while still requiring the complicated contiguous
pages support. The HW also has a very simple range based invalidation
approach that is easy to implement.
The AMD v1 and AMD v2 page table formats are implemented bit for bit
identical to the current code, tested using a compare kunit test that
checks against the io-pgtable version (on github, see below).
Updating the AMD driver to replace the io-pgtable layer with the new stuff
is fairly straightforward now. The layering is fixed up in the new version
so that all the invalidation goes through function pointers.
Several small fixing patches have come out of this as I've been fixing the
problems that the test suite uncovers in the current code, and
implementing the fixed version in iommupt.
On performance, there is a quite wide variety of implementation designs
across all the drivers. Looking at some key performance across
the main formats:
iommu_map():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 53,66 , 51,63 , 19.19 (AMDV1)
256*2^12, 386,1909 , 367,1795 , 79.79
256*2^21, 362,1633 , 355,1556 , 77.77
2^12, 56,62 , 52,59 , 11.11 (AMDv2)
256*2^12, 405,1355 , 357,1292 , 72.72
256*2^21, 393,1160 , 358,1114 , 67.67
2^12, 55,65 , 53,62 , 14.14 (VT-d second stage)
256*2^12, 391,518 , 332,512 , 35.35
256*2^21, 383,635 , 336,624 , 46.46
2^12, 57,65 , 55,63 , 12.12 (ARM 64 bit)
256*2^12, 380,389 , 361,369 , 2.02
256*2^21, 358,419 , 345,400 , 13.13
iommu_unmap():
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 69,88 , 65,85 , 23.23 (AMDv1)
256*2^12, 353,6498 , 331,6029 , 94.94
256*2^21, 373,6014 , 360,5706 , 93.93
2^12, 71,72 , 66,69 , 4.04 (AMDv2)
256*2^12, 228,891 , 206,871 , 76.76
256*2^21, 254,721 , 245,711 , 65.65
2^12, 69,87 , 65,82 , 20.20 (VT-d second stage)
256*2^12, 210,321 , 200,315 , 36.36
256*2^21, 255,349 , 238,342 , 30.30
2^12, 72,77 , 68,74 , 8.08 (ARM 64 bit)
256*2^12, 521,357 , 447,346 , -29.29
256*2^21, 489,358 , 433,345 , -25.25
* Above numbers include additional patches to remove the iommu_pgsize()
overheads. gcc 13.3.0, i7-12700
This version provides fairly consistent performance across formats. ARM
unmap performance is quite different because this version supports
contiguous pages and uses a very different algorithm for unmapping. Though
why it is so worse compared to AMDv1 I haven't figured out yet.
The per-format commits include a more detailed chart.
There is a second branch:
https://github.com/jgunthorpe/linux/commits/iommu_pt_all
Containing supporting work and future steps:
- ARM short descriptor (32 bit), ARM long descriptor (64 bit) formats
- RISCV format and RISCV conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_riscv
- Support for a DMA incoherent HW page table walker
- VT-d second stage format and VT-d conversion
https://github.com/jgunthorpe/linux/commits/iommu_pt_vtd
- DART v1 & v2 format
- Draft of a iommufd 'cut' operation to break down huge pages
- A compare test that checks the iommupt formats against the iopgtable
interface, including updating AMD to have a working iopgtable and patches
to make VT-d have an iopgtable for testing.
- A performance test to micro-benchmark map and unmap against iogptable
My strategy is to go one by one for the drivers:
- AMD driver conversion
- RISCV page table and driver
- Intel VT-d driver and VTDSS page table
- Flushing improvements for RISCV
- ARM SMMUv3
And concurrently work on the algorithm side:
- debugfs content dump, like VT-d has
- Cut support
- Increase/Decrease page size support
- map/unmap batching
- KHO
As we make more algorithm improvements the value to convert the drivers
increases.
This is on github: https://github.com/jgunthorpe/linux/commits/iommu_pt
v8:
- Remove unused to_amdv1pt/common_to_amdv1pt/to_x86_64_pt/common_to_x86_64_pt
- Fix 32 bit udiv compile failure in the kunit
v7: https://patch.msgid.link/r/0-v7-ab019a8791e2+175b8-iommu_pt_jgg@nvidia.com
- Rebase to v6.18-rc2
- Improve comments and documentation
- Add a few missed __sme_sets() for AMD CC
- Rename pt_iommu_flush_ops -> pt_iommu_driver_ops
VT-D -> VT-d
pt_clear_entry -> pt_clear_entries
pt_entry_write_is_dirty -> pt_entry_is_write_dirty
pt_entry_set_write_clean -> pt_entry_make_write_clean
- Tidy some of the map flow into a new function do_map()
- Fix ffz64()
v6: https://patch.msgid.link/r/0-v6-0fb54a1d9850+36b-iommu_pt_jgg@nvidia.com
- Improve comments and documentation
- Rename pt_entry_oa_full -> pt_entry_oa_exact
pt_has_system_page -> pt_has_system_page_size
pt_max_output_address_lg2 -> pt_max_oa_lg2
log2_f*() -> vaf* / oaf* / f*_t
pt_item_fully_covered -> pt_entry_fully_covered
- Fix missed constant propogation causing division
- Consolidate debugging checks to pt_check_install_leaf_args()
- Change collect->ignore_mapped to check_mapped
- Shuffle some hunks around to more appropriate patches
- Two new mini kunit tests
v5: https://patch.msgid.link/r/0-v5-116c4948af3d+68091-iommu_pt_jgg@nvidia.com
- Text grammar updates and kdoc fixes
v4: https://patch.msgid.link/r/0-v4-0d6a6726a372+18959-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc3
- Integrate the HATS/HATDis changes
- Remove 'default n' from kconfig
- Remove unused 'PT_FIXED_TOP_LEVEL'
- Improve comments and documentation
- Fix some compile warnings from kbuild robots
v3: https://patch.msgid.link/r/0-v3-a93aab628dbc+521-iommu_pt_jgg@nvidia.com
- Rebase on v6.16-rc2
- s/PT_ENTRY_WORD_SIZE/PT_ITEM_WORD_SIZE/s to follow the language better
- Comment and documentation updates
- Add PT_TOP_PHYS_MASK to help manage alignment restrictions on the top
pointer
- Add missed force_aperture = true
- Make pt_iommu_deinit() take care of the not-yet-inited error case
internally as AMD/RISCV/VTD all shared this logic
- Change gather_range() into gather_range_pages() so it also deals with
the page list. This makes the following cache flushing series simpler
- Fix missed update of unmap->unmapped in some error cases
- Change clear_contig() to order the gather more logically
- Remove goto from the error handling in __map_range_leaf()
- s/log2_/oalog2_/ in places where the argument is an oaddr_t
- Pass the pts to pt_table_install64/32()
- Do not use SIGN_EXTEND for the AMDv2 page table because of Vasant's
information on how PASID 0 works.
v2: https://patch.msgid.link/r/0-v2-5c26bde5c22d+58b-iommu_pt_jgg@nvidia.com
- AMD driver only, many code changes
RFC: https://lore.kernel.org/all/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/
Cc: Michael Roth <michael.roth(a)amd.com>
Cc: Alexey Kardashevskiy <aik(a)amd.com>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: James Gowans <jgowans(a)amazon.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
Alejandro Jimenez (1):
iommu/amd: Use the generic iommu page table
Jason Gunthorpe (14):
genpt: Generic Page Table base API
genpt: Add Documentation/ files
iommupt: Add the basic structure of the iommu implementation
iommupt: Add the AMD IOMMU v1 page table format
iommupt: Add iova_to_phys op
iommupt: Add unmap_pages op
iommupt: Add map_pages op
iommupt: Add read_and_clear_dirty op
iommupt: Add a kunit test for Generic Page Table
iommupt: Add a mock pagetable format for iommufd selftest to use
iommufd: Change the selftest to use iommupt instead of xarray
iommupt: Add the x86 64 bit page table format
iommu/amd: Remove AMD io_pgtable support
iommupt: Add a kunit test for the IOMMU implementation
.clang-format | 1 +
Documentation/driver-api/generic_pt.rst | 142 ++
Documentation/driver-api/index.rst | 1 +
drivers/iommu/Kconfig | 2 +
drivers/iommu/Makefile | 1 +
drivers/iommu/amd/Kconfig | 5 +-
drivers/iommu/amd/Makefile | 2 +-
drivers/iommu/amd/amd_iommu.h | 1 -
drivers/iommu/amd/amd_iommu_types.h | 110 +-
drivers/iommu/amd/io_pgtable.c | 577 --------
drivers/iommu/amd/io_pgtable_v2.c | 370 ------
drivers/iommu/amd/iommu.c | 538 ++++----
drivers/iommu/generic_pt/.kunitconfig | 13 +
drivers/iommu/generic_pt/Kconfig | 68 +
drivers/iommu/generic_pt/fmt/Makefile | 26 +
drivers/iommu/generic_pt/fmt/amdv1.h | 411 ++++++
drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 +
drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21 +
drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 +
drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 +
drivers/iommu/generic_pt/fmt/iommu_template.h | 48 +
drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 11 +
drivers/iommu/generic_pt/fmt/x86_64.h | 255 ++++
drivers/iommu/generic_pt/iommu_pt.h | 1162 +++++++++++++++++
drivers/iommu/generic_pt/kunit_generic_pt.h | 713 ++++++++++
drivers/iommu/generic_pt/kunit_iommu.h | 183 +++
drivers/iommu/generic_pt/kunit_iommu_pt.h | 487 +++++++
drivers/iommu/generic_pt/pt_common.h | 358 +++++
drivers/iommu/generic_pt/pt_defs.h | 329 +++++
drivers/iommu/generic_pt/pt_fmt_defaults.h | 233 ++++
drivers/iommu/generic_pt/pt_iter.h | 636 +++++++++
drivers/iommu/generic_pt/pt_log2.h | 122 ++
drivers/iommu/io-pgtable.c | 4 -
drivers/iommu/iommufd/Kconfig | 1 +
drivers/iommu/iommufd/iommufd_test.h | 11 +-
drivers/iommu/iommufd/selftest.c | 438 +++----
include/linux/generic_pt/common.h | 167 +++
include/linux/generic_pt/iommu.h | 271 ++++
include/linux/io-pgtable.h | 2 -
include/linux/irqchip/riscv-imsic.h | 3 +-
tools/testing/selftests/iommu/iommufd.c | 60 +-
tools/testing/selftests/iommu/iommufd_utils.h | 12 +
42 files changed, 6229 insertions(+), 1612 deletions(-)
create mode 100644 Documentation/driver-api/generic_pt.rst
delete mode 100644 drivers/iommu/amd/io_pgtable.c
delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c
create mode 100644 drivers/iommu/generic_pt/.kunitconfig
create mode 100644 drivers/iommu/generic_pt/Kconfig
create mode 100644 drivers/iommu/generic_pt/fmt/Makefile
create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h
create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h
create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c
create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h
create mode 100644 drivers/iommu/generic_pt/iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_generic_pt.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu.h
create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h
create mode 100644 drivers/iommu/generic_pt/pt_common.h
create mode 100644 drivers/iommu/generic_pt/pt_defs.h
create mode 100644 drivers/iommu/generic_pt/pt_fmt_defaults.h
create mode 100644 drivers/iommu/generic_pt/pt_iter.h
create mode 100644 drivers/iommu/generic_pt/pt_log2.h
create mode 100644 include/linux/generic_pt/common.h
create mode 100644 include/linux/generic_pt/iommu.h
base-commit: 8440410283bb5533b676574211f31f030a18011b
--
2.43.0
'available_events' is actually not required by
'test.d/event/toplevel-enable.tc' and its Existence has been tested in
'test.d/00basic/basic4.tc'.
So the require of 'available_events' can be dropped and then we can add
'instance' flag to test 'test.d/event/toplevel-enable.tc' for instance.
Test result show as below:
# ./ftracetest test.d/event/toplevel-enable.tc
=== Ftrace unit tests ===
[1] event tracing - enable/disable with top level files [PASS]
[2] (instance) event tracing - enable/disable with top level files [PASS]
# of passed: 2
# of failed: 0
# of unresolved: 0
# of untested: 0
# of unsupported: 0
# of xfailed: 0
# of undefined(test bug): 0
Signed-off-by: Zheng Yejian <zhengyejian1(a)huawei.com>
---
tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc b/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
index 93c10ea42a68..8b8e1aea985b 100644
--- a/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
+++ b/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
@@ -1,7 +1,8 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# description: event tracing - enable/disable with top level files
-# requires: available_events set_event events/enable
+# requires: set_event events/enable
+# flags: instance
do_reset() {
echo > set_event
--
2.25.1
At this point I think everyone in the on the kernel side is happy with
this but there were some questions from the glibc side about the value
of controlling the shadow stack placement and size, especially with the
current inability to reuse the shadow stack for an exited thread. With
support for reuse it would be possible to have a cache of shadow stacks
as is currently supported for the normal stack.
Since the discussion petered out I'm resending this in order to give
people something work with while prototyping. It should be possible to
prototype any potential kernel features to help build out shadow stack
support in userspace by enabling shadow stack writes, as suggested by
Rick Edgecombe this may end up being required anyway for supporting more
exotic scenarios. On all current architectures with the feature writes
to shadow stack require specific instructions so there are still
security benefits even with writes enabled.
I did send a change implementing a feature writing a token on thread
exit to allow reuse:
https://lore.kernel.org/r/20250921-arm64-gcs-exit-token-v1-0-45cf64e648d5@k…
but wasn't planning to refresh it without some indication from the
userspace side that that'd be useful.
Non-process cover letter:
The kernel has added support for shadow stacks, currently x86 only using
their CET feature but both arm64 and RISC-V have equivalent features
(GCS and Zicfiss respectively), I am actively working on GCS[1]. With
shadow stacks the hardware maintains an additional stack containing only
the return addresses for branch instructions which is not generally
writeable by userspace and ensures that any returns are to the recorded
addresses. This provides some protection against ROP attacks and making
it easier to collect call stacks. These shadow stacks are allocated in
the address space of the userspace process.
Our API for shadow stacks does not currently offer userspace any
flexiblity for managing the allocation of shadow stacks for newly
created threads, instead the kernel allocates a new shadow stack with
the same size as the normal stack whenever a thread is created with the
feature enabled. The stacks allocated in this way are freed by the
kernel when the thread exits or shadow stacks are disabled for the
thread. This lack of flexibility and control isn't ideal, in the vast
majority of cases the shadow stack will be over allocated and the
implicit allocation and deallocation is not consistent with other
interfaces. As far as I can tell the interface is done in this manner
mainly because the shadow stack patches were in development since before
clone3() was implemented.
Since clone3() is readily extensible let's add support for specifying a
shadow stack when creating a new thread or process, keeping the current
implicit allocation behaviour if one is not specified either with
clone3() or through the use of clone(). The user must provide a shadow
stack pointer, this must point to memory mapped for use as a shadow
stackby map_shadow_stack() with an architecture specified shadow stack
token at the top of the stack.
Yuri Khrustalev has raised questions from the libc side regarding
discoverability of extended clone3() structure sizes[2], this seems like
a general issue with clone3(). There was a suggestion to add a hwcap on
arm64 which isn't ideal but is doable there, though architecture
specific mechanisms would also be needed for x86 (and RISC-V if it's
support gets merged before this does). The idea has, however, had
strong pushback from the architecture maintainers and it is possible to
detect support for this in clone3() by attempting a call with a
misaligned shadow stack pointer specified so no hwcap has been added.
[1] https://lore.kernel.org/linux-arm-kernel/20241001-arm64-gcs-v13-0-222b78d87…
[2] https://lore.kernel.org/r/aCs65ccRQtJBnZ_5@arm.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v23:
- Rebase onto v6.19-rc1.
- Link to v22: https://lore.kernel.org/r/20251015-clone3-shadow-stack-v22-0-a8c8da011427@k…
Changes in v22:
- Rebase onto v6.18-rc1.
- Cover letter updates.
- Link to v21: https://lore.kernel.org/r/20250916-clone3-shadow-stack-v21-0-910493527013@k…
Changes in v21:
- Rebase onto https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git kernel-6.18.clone3
- Rename shadow_stack_token to shstk_token, since it's a simple rename I've
kept the acks and reviews but I dropped the tested-bys just to be safe.
- Link to v20: https://lore.kernel.org/r/20250902-clone3-shadow-stack-v20-0-4d9fff1c53e7@k…
Changes in v20:
- Comment fixes and clarifications in x86 arch_shstk_validate_clone()
from Rick Edgecombe.
- Spelling fix in documentation.
- Link to v19: https://lore.kernel.org/r/20250819-clone3-shadow-stack-v19-0-bc957075479b@k…
Changes in v19:
- Rebase onto v6.17-rc1.
- Link to v18: https://lore.kernel.org/r/20250702-clone3-shadow-stack-v18-0-7965d2b694db@k…
Changes in v18:
- Rebase onto v6.16-rc3.
- Thanks to pointers from Yuri Khrustalev this version has been tested
on x86 so I have removed the RFT tag.
- Clarify clone3_shadow_stack_valid() comment about the Kconfig check.
- Remove redundant GCSB DSYNCs in arm64 code.
- Fix token validation on x86.
- Link to v17: https://lore.kernel.org/r/20250609-clone3-shadow-stack-v17-0-8840ed97ff6f@k…
Changes in v17:
- Rebase onto v6.16-rc1.
- Link to v16: https://lore.kernel.org/r/20250416-clone3-shadow-stack-v16-0-2ffc9ca3917b@k…
Changes in v16:
- Rebase onto v6.15-rc2.
- Roll in fixes from x86 testing from Rick Edgecombe.
- Rework so that the argument is shadow_stack_token.
- Link to v15: https://lore.kernel.org/r/20250408-clone3-shadow-stack-v15-0-3fa245c6e3be@k…
Changes in v15:
- Rebase onto v6.15-rc1.
- Link to v14: https://lore.kernel.org/r/20250206-clone3-shadow-stack-v14-0-805b53af73b9@k…
Changes in v14:
- Rebase onto v6.14-rc1.
- Link to v13: https://lore.kernel.org/r/20241203-clone3-shadow-stack-v13-0-93b89a81a5ed@k…
Changes in v13:
- Rebase onto v6.13-rc1.
- Link to v12: https://lore.kernel.org/r/20241031-clone3-shadow-stack-v12-0-7183eb8bee17@k…
Changes in v12:
- Add the regular prctl() to the userspace API document since arm64
support is queued in -next.
- Link to v11: https://lore.kernel.org/r/20241005-clone3-shadow-stack-v11-0-2a6a2bd6d651@k…
Changes in v11:
- Rebase onto arm64 for-next/gcs, which is based on v6.12-rc1, and
integrate arm64 support.
- Rework the interface to specify a shadow stack pointer rather than a
base and size like we do for the regular stack.
- Link to v10: https://lore.kernel.org/r/20240821-clone3-shadow-stack-v10-0-06e8797b9445@k…
Changes in v10:
- Integrate fixes & improvements for the x86 implementation from Rick
Edgecombe.
- Require that the shadow stack be VM_WRITE.
- Require that the shadow stack base and size be sizeof(void *) aligned.
- Clean up trailing newline.
- Link to v9: https://lore.kernel.org/r/20240819-clone3-shadow-stack-v9-0-962d74f99464@ke…
Changes in v9:
- Pull token validation earlier and report problems with an error return
to parent rather than signal delivery to the child.
- Verify that the top of the supplied shadow stack is VM_SHADOW_STACK.
- Rework token validation to only do the page mapping once.
- Drop no longer needed support for testing for signals in selftest.
- Fix typo in comments.
- Link to v8: https://lore.kernel.org/r/20240808-clone3-shadow-stack-v8-0-0acf37caf14c@ke…
Changes in v8:
- Fix token verification with user specified shadow stack.
- Don't track user managed shadow stacks for child processes.
- Link to v7: https://lore.kernel.org/r/20240731-clone3-shadow-stack-v7-0-a9532eebfb1d@ke…
Changes in v7:
- Rebase onto v6.11-rc1.
- Typo fixes.
- Link to v6: https://lore.kernel.org/r/20240623-clone3-shadow-stack-v6-0-9ee7783b1fb9@ke…
Changes in v6:
- Rebase onto v6.10-rc3.
- Ensure we don't try to free the parent shadow stack in error paths of
x86 arch code.
- Spelling fixes in userspace API document.
- Additional cleanups and improvements to the clone3() tests to support
the shadow stack tests.
- Link to v5: https://lore.kernel.org/r/20240203-clone3-shadow-stack-v5-0-322c69598e4b@ke…
Changes in v5:
- Rebase onto v6.8-rc2.
- Rework ABI to have the user allocate the shadow stack memory with
map_shadow_stack() and a token.
- Force inlining of the x86 shadow stack enablement.
- Move shadow stack enablement out into a shared header for reuse by
other tests.
- Link to v4: https://lore.kernel.org/r/20231128-clone3-shadow-stack-v4-0-8b28ffe4f676@ke…
Changes in v4:
- Formatting changes.
- Use a define for minimum shadow stack size and move some basic
validation to fork.c.
- Link to v3: https://lore.kernel.org/r/20231120-clone3-shadow-stack-v3-0-a7b8ed3e2acc@ke…
Changes in v3:
- Rebase onto v6.7-rc2.
- Remove stale shadow_stack in internal kargs.
- If a shadow stack is specified unconditionally use it regardless of
CLONE_ parameters.
- Force enable shadow stacks in the selftest.
- Update changelogs for RISC-V feature rename.
- Link to v2: https://lore.kernel.org/r/20231114-clone3-shadow-stack-v2-0-b613f8681155@ke…
Changes in v2:
- Rebase onto v6.7-rc1.
- Remove ability to provide preallocated shadow stack, just specify the
desired size.
- Link to v1: https://lore.kernel.org/r/20231023-clone3-shadow-stack-v1-0-d867d0b5d4d0@ke…
---
Mark Brown (8):
arm64/gcs: Return a success value from gcs_alloc_thread_stack()
Documentation: userspace-api: Add shadow stack API documentation
selftests: Provide helper header for shadow stack testing
fork: Add shadow stack support to clone3()
selftests/clone3: Remove redundant flushes of output streams
selftests/clone3: Factor more of main loop into test_clone3()
selftests/clone3: Allow tests to flag if -E2BIG is a valid error code
selftests/clone3: Test shadow stack support
Documentation/userspace-api/index.rst | 1 +
Documentation/userspace-api/shadow_stack.rst | 44 +++++
arch/arm64/include/asm/gcs.h | 8 +-
arch/arm64/kernel/process.c | 8 +-
arch/arm64/mm/gcs.c | 55 +++++-
arch/x86/include/asm/shstk.h | 11 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/kernel/shstk.c | 53 ++++-
include/asm-generic/cacheflush.h | 11 ++
include/linux/sched/task.h | 17 ++
include/uapi/linux/sched.h | 9 +-
kernel/fork.c | 93 +++++++--
tools/testing/selftests/clone3/clone3.c | 226 ++++++++++++++++++----
tools/testing/selftests/clone3/clone3_selftests.h | 65 ++++++-
tools/testing/selftests/ksft_shstk.h | 98 ++++++++++
15 files changed, 620 insertions(+), 81 deletions(-)
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20231019-clone3-shadow-stack-15d40d2bf536
Best regards,
--
Mark Brown <broonie(a)kernel.org>
The devpts_pts selftest has an ifdef in case an architecture does not
define TIOCGPTPEER, but the handling for this is broken since we need
errno to be set to EINVAL in order to skip the test as we should. Given
that this ioctl() has been defined since v4.15 we may as well just assume
it's there rather than write handling code which will probably never get
used.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v2:
- Rebase onto v6.19-rc1.
- Link to v1: https://patch.msgid.link/20251126-selftests-filesystems-devpts-tiocgptpeer-…
---
tools/testing/selftests/filesystems/devpts_pts.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/tools/testing/selftests/filesystems/devpts_pts.c b/tools/testing/selftests/filesystems/devpts_pts.c
index 54fea349204e..950e8b7f675b 100644
--- a/tools/testing/selftests/filesystems/devpts_pts.c
+++ b/tools/testing/selftests/filesystems/devpts_pts.c
@@ -100,7 +100,7 @@ static int resolve_procfd_symlink(int fd, char *buf, size_t buflen)
static int do_tiocgptpeer(char *ptmx, char *expected_procfd_contents)
{
int ret;
- int master = -1, slave = -1, fret = -1;
+ int master = -1, slave, fret = -1;
master = open(ptmx, O_RDWR | O_NOCTTY | O_CLOEXEC);
if (master < 0) {
@@ -119,9 +119,7 @@ static int do_tiocgptpeer(char *ptmx, char *expected_procfd_contents)
goto do_cleanup;
}
-#ifdef TIOCGPTPEER
slave = ioctl(master, TIOCGPTPEER, O_RDWR | O_NOCTTY | O_CLOEXEC);
-#endif
if (slave < 0) {
if (errno == EINVAL) {
fprintf(stderr, "TIOCGPTPEER is not supported. "
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20251126-selftests-filesystems-devpts-tiocgptpeer-fbd30e579859
Best regards,
--
Mark Brown <broonie(a)kernel.org>
During my testing, I found that guest debugging with 'DR6.BD' does not
work in instruction emulation, as the current code only considers the
guest's DR7. Upon reviewing the code, I also observed that the checks
for the userspace guest debugging feature and the guest's own debugging
feature are repeated in different places during instruction
emulation, but the overall logic is the same. If guest debug
is enabled, it needs to exit to userspace; otherwise, a #DB
exception needs to be injected into the guest. Therefore, as
suggested by Jiangshan Lai, some cleanup has been done for #DB
handling in instruction emulation in this patchset. A new
function named 'kvm_inject_emulated_db()' is introduced to
consolidate all the checking logic. Moreover, I hope we can make
the #DB interception path use the same function as well.
Additionally, when I looked into the single-step #DB handling in
instruction emulation, I noticed that the interrupt shadow is toggled,
but it is not considered in the single-step #DB injection. This
oversight causes VM entry to fail on VMX (due to pending debug
exceptions state checking).
As pointed out by Sean, fault-like code #DBs can be coincident with
trap-like single-step #DBs at the instruction boundary on the hardware.
However it is difficult to emulate this in the emulator, as
kvm_vcpu_check_code_breakpoint() is called at the start of the next
instruction while the single-step #DB for the previous instruction has
already been injected.
v1->v2:
- cleanup in inject_emulated_exception().
- rename 'set_pending_dbg' callback as 'refresh_pending_dbg_exceptions'.
- fold refresh_pending_dbg_exceptions() call into
kvm_vcpu_do_singlestep().
- Split the change to move up kvm_set_rflags() into a single patch.
- Move the #DB and IRQ handler registration after guest debug testcases.
Hou Wenlong (9):
KVM: x86: Capture "struct x86_exception" in
inject_emulated_exception()
KVM: x86: Set guest DR6 by kvm_queue_exception_p() in instruction
emulation
KVM: x86: Check guest debug in DR access instruction emulation
KVM: x86: Only check effective code breakpoint in emulation
KVM: x86: Consolidate KVM_GUESTDBG_SINGLESTEP check into the
kvm_inject_emulated_db()
KVM: x86: Move kvm_set_rflags() up before kvm_vcpu_do_singlestep()
KVM: VMX: Refresh 'PENDING_DBG_EXCEPTIONS.BS' bit during instruction
emulation
KVM: selftests: Verify guest debug DR7.GD checking during instruction
emulation
KVM: selftests: Verify 'BS' bit checking in pending debug exception
during VM entry
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/emulate.c | 14 +--
arch/x86/kvm/kvm_emulate.h | 7 +-
arch/x86/kvm/vmx/main.c | 9 ++
arch/x86/kvm/vmx/vmx.c | 15 ++-
arch/x86/kvm/vmx/x86_ops.h | 1 +
arch/x86/kvm/x86.c | 116 ++++++++++--------
arch/x86/kvm/x86.h | 7 ++
.../selftests/kvm/include/x86/processor.h | 3 +-
tools/testing/selftests/kvm/x86/debug_regs.c | 72 ++++++++++-
11 files changed, 178 insertions(+), 68 deletions(-)
base-commit: 5d3e2d9ba9ed68576c70c127e4f7446d896f2af2
--
2.31.1
During my testing, I found that guest debugging with 'DR6.BD' does not
work in instruction emulation, as the current code only considers the
guest's DR7. Upon reviewing the code, I also observed that the checks
for the userspace guest debugging feature and the guest's own debugging
feature are repeated in different places during instruction
emulation, but the overall logic is the same. If guest debugging
is enabled, it needs to exit to userspace; otherwise, a #DB
exception needs to be injected into the guest. Therefore, as
suggested by Jiangshan Lai, some cleanup has been done for #DB
handling in instruction emulation in this patchset. A new
function named 'kvm_inject_emulated_db()' is introduced to
consolidate all the checking logic. Moreover, I hope we can make
the #DB interception path use the same function as well.
Additionally, when I looked into the single-step #DB handling in
instruction emulation, I noticed that the interrupt shadow is toggled,
but it is not considered in the single-step #DB injection. This
oversight causes VM entry to fail on VMX (due to pending debug
exceptions checking) or breaks the 'MOV SS' suppressed #DB. For the
latter, I have kept the behavior for now in my patchset, as I need some
suggestions.
Hou Wenlong (7):
KVM: x86: Set guest DR6 by kvm_queue_exception_p() in instruction
emulation
KVM: x86: Check guest debug in DR access instruction emulation
KVM: x86: Only check effective code breakpoint in emulation
KVM: x86: Consolidate KVM_GUESTDBG_SINGLESTEP check into the
kvm_inject_emulated_db()
KVM: VMX: Set 'BS' bit in pending debug exceptions during instruction
emulation
KVM: selftests: Verify guest debug DR7.GD checking during instruction
emulation
KVM: selftests: Verify 'BS' bit checking in pending debug exception
during VM entry
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/emulate.c | 14 +--
arch/x86/kvm/kvm_emulate.h | 7 +-
arch/x86/kvm/vmx/main.c | 9 ++
arch/x86/kvm/vmx/vmx.c | 14 ++-
arch/x86/kvm/vmx/x86_ops.h | 1 +
arch/x86/kvm/x86.c | 109 +++++++++++-------
arch/x86/kvm/x86.h | 7 ++
.../selftests/kvm/include/x86/processor.h | 3 +-
tools/testing/selftests/kvm/x86/debug_regs.c | 64 +++++++++-
11 files changed, 167 insertions(+), 63 deletions(-)
base-commit: ecbcc2461839e848970468b44db32282e5059925
--
2.31.1
This series makes the output from the ofdlocks test a bit easier for
tooling to work with, and also ignores the generated file while we're
here.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v3:
- Rebase onto v6.19-rc1.
- Link to v2: https://lore.kernel.org/r/20251015-selftest-filelock-ktap-v2-0-f5fd21b75c3a…
Changes in v2:
- Rebase onto v6.18-rc1.
- Link to v1: https://lore.kernel.org/r/20250818-selftest-filelock-ktap-v1-0-d41af77f1396…
---
Mark Brown (3):
kselftest/filelock: Use ksft_perror()
kselftest/filelock: Report each test in oftlocks separately
kselftest/filelock: Add a .gitignore file
tools/testing/selftests/filelock/.gitignore | 1 +
tools/testing/selftests/filelock/ofdlocks.c | 94 +++++++++++++----------------
2 files changed, 42 insertions(+), 53 deletions(-)
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20250604-selftest-filelock-ktap-f2ae998a0de0
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Hello,
While writing some Kunit tests for ext4 filesystem, I'm encountering an
issue in the way we display the diagnostic logs upon failures, when
using KUNIT_CASE_PARAM() to write the tests.
This can be observed by patching fs/ext4/mballoc-test.c to fail
and print one of the params:
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -350,6 +350,8 @@ static int mbt_kunit_init(struct kunit *test)
struct super_block *sb;
int ret;
+ KUNIT_FAIL(test, "Failed: blocksize_bits=%d", layout->blocksize_bits);
+
sb = mbt_ext4_alloc_super_block();
if (sb == NULL)
return -ENOMEM;
With the above change, we can observe the following output (snipped):
[18:50:25] ============== ext4_mballoc_test (7 subtests) ==============
[18:50:25] ================= test_new_blocks_simple ==================
[18:50:25] [FAILED] block_bits=10 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
[18:50:25] # test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
[18:50:25] Failed: blocksize_bits=12
[18:50:25] [FAILED] block_bits=12 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
[18:50:25] # test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
[18:50:25] Failed: blocksize_bits=16
[18:50:25] [FAILED] block_bits=16 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
[18:50:25] # test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
[18:50:25] Failed: blocksize_bits=10
[18:50:25] # test_new_blocks_simple: pass:0 fail:3 skip:0 total:3
[18:50:25] ============= [FAILED] test_new_blocks_simple ==============
<snip>
Note that the diagnostic logs don't show up correctly. Ideally they
should be before test result but here the first [FAILED] test has no
logs printed above whereas the last "Failed: blocksize_bits=10" print
comes after the last subtest, when it actually corresponds to the first
subtest.
The KTAP file itself seems to have diagnostic logs in the right place:
KTAP version 1
1..2
KTAP version 1
# Subtest: ext4_mballoc_test
# module: ext4
1..7
KTAP version 1
# Subtest: test_new_blocks_simple
# test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
Failed: blocksize_bits=10
not ok 1 block_bits=10 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
# test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
Failed: blocksize_bits=12
not ok 2 block_bits=12 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
# test_new_blocks_simple: EXPECTATION FAILED at fs/ext4/mballoc-test.c:364
Failed: blocksize_bits=16
not ok 3 block_bits=16 cluster_bits=3 blocks_per_group=8192 group_count=4 desc_size=64
# test_new_blocks_simple: pass:0 fail:3 skip:0 total:3
not ok 1 test_new_blocks_simple
<snip>
By tracing kunit_parser.py script, I could see the issue here is in the
parsing of the "Subtest: test_new_blocks_simple". We end up associating
everything below the subtest till "not ok 1 block_bits=10..." as
diagnostic logs of the subtest, while these lons actually belong to the
first of the 3 subtests under this test.
I tired to figure out a way to fix the parsing but fixing one thing
broke something else. Im starting to think that the issue is that there
are 3 subtests under test_new_block_simple (array of 3 params passed to
KUNIT_CASE_PARAM), but instead of creating 3 structured subtests, the
KTAP output dumps the results of all 3 directly under
subtest:test_new_blocks_simple. Which makes it tricky to determine where
the diagnostic log/attributes of test_new_blocks_simple ends and that of its
children begins.
I'm not very familiar with KUnit framework so I though I'd reach out
here for some pointers. I can dedicate some time fixing this but I'd
like to know if this is something we need to somehow fix in parsing or
during generation of the KTAP file itself? Any pointers would be
appreciated.
Thanks,
Ojaswin
From: Li RongQing <lirongqing(a)baidu.com>
The softlockup_panic sysctl is currently a binary option: panic immediately
or never panic on soft lockups.
Panicking on any soft lockup, regardless of duration, can be overly
aggressive for brief stalls that may be caused by legitimate operations.
Conversely, never panicking may allow severe system hangs to persist
undetected.
Extend softlockup_panic to accept an integer threshold, allowing the kernel
to panic only when the normalized lockup duration exceeds N watchdog
threshold periods. This provides finer-grained control to distinguish
between transient delays and persistent system failures.
The accepted values are:
- 0: Don't panic (unchanged)
- 1: Panic when duration >= 1 * threshold (20s default, original behavior)
- N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.)
The original behavior is preserved for values 0 and 1, maintaining full
backward compatibility while allowing systems to tolerate brief lockups
while still catching severe, persistent hangs.
Signed-off-by: Li RongQing <lirongqing(a)baidu.com>
Cc: Eduard Zingerman <eddyz87(a)gmail.com>
Cc: Hao Luo <haoluo(a)google.com>
Cc: Jiri Olsa <jolsa(a)kernel.org>
Cc: John Fastabend <john.fastabend(a)gmail.com>
Cc: KP Singh <kpsingh(a)kernel.org>
Cc: Lance Yang <lance.yang(a)linux.dev>
Cc: Martin KaFai Lau <martin.lau(a)linux.dev>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Song Liu <song(a)kernel.org>
Cc: Stanislav Fomichev <sdf(a)fomichev.me>
Cc: Yonghong Song <yonghong.song(a)linux.dev>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
---
Diff with v1: add a temp variable thresh_count
chang config to 0 in kernel/configs/debug.config
Documentation/admin-guide/kernel-parameters.txt | 10 +++++-----
arch/arm/configs/aspeed_g5_defconfig | 2 +-
arch/arm/configs/pxa3xx_defconfig | 2 +-
arch/openrisc/configs/or1klitex_defconfig | 2 +-
arch/powerpc/configs/skiroot_defconfig | 2 +-
drivers/gpu/drm/ci/arm.config | 2 +-
drivers/gpu/drm/ci/arm64.config | 2 +-
drivers/gpu/drm/ci/x86_64.config | 2 +-
kernel/configs/debug.config | 2 +-
kernel/watchdog.c | 10 ++++++----
lib/Kconfig.debug | 13 +++++++------
tools/testing/selftests/bpf/config | 2 +-
tools/testing/selftests/wireguard/qemu/kernel.config | 2 +-
13 files changed, 28 insertions(+), 25 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a8d0afd..27c5f96 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6934,12 +6934,12 @@ Kernel parameters
softlockup_panic=
[KNL] Should the soft-lockup detector generate panics.
- Format: 0 | 1
+ Format: <int>
- A value of 1 instructs the soft-lockup detector
- to panic the machine when a soft-lockup occurs. It is
- also controlled by the kernel.softlockup_panic sysctl
- and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
+ A value of non-zero instructs the soft-lockup detector
+ to panic the machine when a soft-lockup duration exceeds
+ N thresholds. It is also controlled by the kernel.softlockup_panic
+ sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
respective build-time switch to that functionality.
softlockup_all_cpu_backtrace=
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 2e6ea13..ec558e5 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_WQ_WATCHDOG=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig
index 07d422f..fb272e3 100644
--- a/arch/arm/configs/pxa3xx_defconfig
+++ b/arch/arm/configs/pxa3xx_defconfig
@@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_SHIRQ=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
# CONFIG_SCHED_DEBUG is not set
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig
index fb1eb9a..984b0e3 100644
--- a/arch/openrisc/configs/or1klitex_defconfig
+++ b/arch/openrisc/configs/or1klitex_defconfig
@@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf"
CONFIG_PRINTK_TIME=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BUG_ON_DATA_CORRUPTION=y
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 2b71a6d..a4114fc 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_WQ_WATCHDOG=y
diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config
index 411e814..d7c5167 100644
--- a/drivers/gpu/drm/ci/arm.config
+++ b/drivers/gpu/drm/ci/arm.config
@@ -52,7 +52,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=n
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_FW_LOADER_COMPRESS=y
diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config
index fddfbd4..ea0e307 100644
--- a/drivers/gpu/drm/ci/arm64.config
+++ b/drivers/gpu/drm/ci/arm64.config
@@ -161,7 +161,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config
index 8eaba388..7ac98a7 100644
--- a/drivers/gpu/drm/ci/x86_64.config
+++ b/drivers/gpu/drm/ci/x86_64.config
@@ -47,7 +47,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 9f6ab7d..774702591 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y
# Debug Oops, Lockups and Hangs
#
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
-# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_PANIC_ON_OOPS=y
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 0685e3a..8168e0d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
@@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
unsigned long touch_ts, period_ts, now;
struct pt_regs *regs = get_irq_regs();
- int duration;
int softlockup_all_cpu_backtrace;
+ int duration, thresh_count;
unsigned long flags;
if (!watchdog_enabled)
@@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
- if (softlockup_panic)
+ thresh_count = duration / get_softlockup_thresh();
+
+ if (softlockup_panic && thresh_count >= softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "softlockup_sys_info",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939..17a7a77 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM
the CPU stats and the interrupt counts during the "soft lockups".
config BOOTPARAM_SOFTLOCKUP_PANIC
- bool "Panic (Reboot) On Soft Lockups"
+ int "Panic (Reboot) On Soft Lockups"
depends on SOFTLOCKUP_DETECTOR
+ default 0
help
- Say Y here to enable the kernel to panic on "soft lockups",
- which are bugs that cause the kernel to loop in kernel
- mode for more than 20 seconds (configurable using the watchdog_thresh
- sysctl), without giving other tasks a chance to run.
+ Set to a non-zero value N to enable the kernel to panic on "soft
+ lockups", which are bugs that cause the kernel to loop in kernel
+ mode for more than (N * 20 seconds) (configurable using the
+ watchdog_thresh sysctl), without giving other tasks a chance to run.
The panic can be used in combination with panic_timeout,
to cause the system to reboot automatically after a
@@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC
high-availability systems that have uptime guarantees and
where a lockup must be resolved ASAP.
- Say N if unsure.
+ Say 0 if unsure.
config HAVE_HARDLOCKUP_DETECTOR_BUDDY
bool
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 558839e..2485538 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -1,6 +1,6 @@
CONFIG_BLK_DEV_LOOP=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BPF=y
CONFIG_BPF_EVENTS=y
CONFIG_BPF_JIT=y
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 0504c11..bb89d2d 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_WQ_WATCHDOG=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_PANIC_TIMEOUT=-1
CONFIG_STACKTRACE=y
--
2.9.4
sched_ext tasks can be starved by long-running RT tasks, especially since
RT throttling was replaced by deadline servers to boost only SCHED_NORMAL
tasks.
Several users in the community have reported issues with RT stalling
sched_ext tasks. This is fairly common on distributions or environments
where applications like video compositors, audio services, etc. run as RT
tasks by default.
Example trace (showing a per-CPU kthread stalled due to the sway Wayland
compositor running as an RT task):
runnable task stall (kworker/0:0[106377] failed to run for 5.043s)
...
CPU 0 : nr_run=3 flags=0xd cpu_rel=0 ops_qseq=20646200 pnt_seq=45388738
curr=sway[994] class=rt_sched_class
R kworker/0:0[106377] -5043ms
scx_state/flags=3/0x1 dsq_flags=0x0 ops_state/qseq=0/0
sticky/holding_cpu=-1/-1 dsq_id=0x8000000000000002 dsq_vtime=0 slice=20000000
cpus=01
This is often perceived as a bug in the BPF schedulers, but in reality they
can't do much: RT tasks run outside their control and can potentially
consume 100% of the CPU bandwidth.
Fix this by adding a sched_ext deadline server, so that sched_ext tasks are
also boosted and do not suffer starvation.
Two kselftests are also provided to verify the starvation fixes and
bandwidth allocation is correct.
== Design ==
- The EXT server is initialized at boot time and remains configured
throughout the system's lifetime
- It starts automatically when the first sched_ext task is enqueued
(rq->scx.nr_running == 1)
- The server's pick function (ext_server_pick_task) always selects
sched_ext tasks when active
- Runtime accounting happens in update_curr_scx() during task execution
and update_curr_idle() when idle
- Bandwidth accounting includes both fair and ext servers in root domain
calculations
- A debugfs interface (/sys/kernel/debug/sched/ext_server/) allows runtime
tuning of server parameters
== Highlights in this version ==
As discussed at the sched_ext microconference at LPC Tokyo, the plan is to
start with a simpler approach, avoiding automatically creating or tearing
down the EXT server bandwidth reservation when a BPF scheduler is loaded or
unloaded. Instead, the reservation is kept permanently active. This
significantly simplifies the logic while still addressing the starvation
issue.
Any fine-tuning of the bandwidth reservation is delegated to the system
administrator, who can adjust it via the debugfs interface. In the future,
a more suitable interface can be introduced and automatic removal of the
reservation when the BPF scheduler is unloaded can be revisited.
This patchset is also available in the following git branch:
git://git.kernel.org/pub/scm/linux/kernel/git/arighi/linux.git scx-dl-server
Changes in v11:
- do not create/remove the bandwidth reservation for the ext server when a
BPF scheduler is loaded/unloaded, but keep the reservation bandwdith
always active
- change rt_stall kselftest to validate both FAIR and EXT DL servers
- Link to v10: https://lore.kernel.org/all/20250903095008.162049-1-arighi@nvidia.com/
Changes in v10:
- reordered patches to better isolate sched_ext changes vs sched/deadline
changes (Andrea Righi)
- define ext_server only with CONFIG_SCHED_CLASS_EXT=y (Andrea Righi)
- add WARN_ON_ONCE(!cpus) check in dl_server_apply_params() (Andrea Righi)
- wait for inactive_task_timer to fire before removing the bandwidth
reservation (Juri Lelli)
- remove explicit dl_server_stop() in dequeue_task_scx() to reduce timer
reprogramming overhead (Juri Lelli)
- do not restart pick_task() when invoked by the dl_server (Tejun Heo)
- rename rq_dl_server to dl_server (Peter Zijlstra)
- fixed a missing dl_server start in dl_server_on() (Christian Loehle)
- add a comment to the rt_stall selftest to better explain the 4%
threshold (Emil Tsalapatis)
- Link to v9: https://lore.kernel.org/all/20251017093214.70029-1-arighi@nvidia.com/
Changes in v9:
- Drop the ->balance() logic as its functionality is now integrated into
->pick_task(), allowing dl_server to call pick_task_scx() directly
- Link to v8: https://lore.kernel.org/all/20250903095008.162049-1-arighi@nvidia.com/
Changes in v8:
- Add tj's patch to de-couple balance and pick_task and avoid changing
sched/core callbacks to propagate @rf
- Simplify dl_se->dl_server check (suggested by PeterZ)
- Small coding style fixes in the kselftests
- Link to v7: https://lore.kernel.org/all/20250809184800.129831-1-joelagnelf@nvidia.com/
Changes in v7:
- Rebased to Linus master
- Link to v6: https://lore.kernel.org/all/20250702232944.3221001-1-joelagnelf@nvidia.com/
Changes in v6:
- Added Acks to few patches
- Fixes to few nits suggested by Tejun
- Link to v5: https://lore.kernel.org/all/20250620203234.3349930-1-joelagnelf@nvidia.com/
Changes in v5:
- Added a kselftest (total_bw) to sched_ext to verify bandwidth values
from debugfs
- Address comment from Andrea about redundant rq clock invalidation
- Link to v4: https://lore.kernel.org/all/20250617200523.1261231-1-joelagnelf@nvidia.com/
Changes in v4:
- Fixed issues with hotplugged CPUs having their DL server bandwidth
altered due to loading SCX
- Fixed other issues
- Rebased on Linus master
- All sched_ext kselftests reliably pass now, also verified that the
total_bw in debugfs (CONFIG_SCHED_DEBUG) is conserved with these patches
- Link to v3: https://lore.kernel.org/all/20250613051734.4023260-1-joelagnelf@nvidia.com/
Changes in v3:
- Removed code duplication in debugfs. Made ext interface separate
- Fixed issue where rq_lock_irqsave was not used in the relinquish patch
- Fixed running bw accounting issue in dl_server_remove_params
- Link to v2: https://lore.kernel.org/all/20250602180110.816225-1-joelagnelf@nvidia.com/
Changes in v2:
- Fixed a hang related to using rq_lock instead of rq_lock_irqsave
- Added support to remove BW of DL servers when they are switched to/from EXT
- Link to v1: https://lore.kernel.org/all/20250315022158.2354454-1-joelagnelf@nvidia.com/
Andrea Righi (2):
sched_ext: Add a DL server for sched_ext tasks
selftests/sched_ext: Add test for sched_ext dl_server
Joel Fernandes (5):
sched/deadline: Clear the defer params
sched/debug: Fix updating of ppos on server write ops
sched/debug: Stop and start server based on if it was active
sched/debug: Add support to change sched_ext server params
selftests/sched_ext: Add test for DL server total_bw consistency
kernel/sched/core.c | 6 +
kernel/sched/deadline.c | 87 +++++--
kernel/sched/debug.c | 171 +++++++++++---
kernel/sched/ext.c | 42 ++++
kernel/sched/idle.c | 3 +
kernel/sched/sched.h | 2 +
kernel/sched/topology.c | 5 +
tools/testing/selftests/sched_ext/Makefile | 2 +
tools/testing/selftests/sched_ext/rt_stall.bpf.c | 23 ++
tools/testing/selftests/sched_ext/rt_stall.c | 240 +++++++++++++++++++
tools/testing/selftests/sched_ext/total_bw.c | 281 +++++++++++++++++++++++
11 files changed, 811 insertions(+), 51 deletions(-)
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.bpf.c
create mode 100644 tools/testing/selftests/sched_ext/rt_stall.c
create mode 100644 tools/testing/selftests/sched_ext/total_bw.c
This series creates a new PMU scheme on ARM, a partitioned PMU that
allows reserving a subset of counters for more direct guest access,
significantly reducing overhead. More details, including performance
benchmarks, can be read in the v1 cover letter linked below.
An overview of what this series accomplishes was presented at KVM
Forum 2025. Slides [1] and video [2] are linked below.
The long duration between v4 and v5 is due to time spent on this
project being monopolized preparing this feature for internal
production. As a result, there are too many improvements to fully list
here, but I will cover the notable ones.
v5:
* Rebase onto v6.18-rc7. This required pulling some reorganization
patches from Anish and Sean that were dependencies from previous
versions based on kvm/queue but never made it to upstream.
* Ensure FGTs (fine-grained traps) are correctly programmed at vCPU
load using kvm_vcpu_load_fgt() and helpers introduced by Oliver
Upton.
* Cleanly separate concerns of whether the partitioned PMU is enabled
for the guest and whether FGT should be enabled. This allows that
the capability can be VM-scoped while the implementation detail of
whether FGT and context switching are in effect can remain
vCPU-scoped.
* Shrink the uAPI change. Instead of a cap and corresponding ioctl,
the feature can be controlled by just a cap with an argument. The
cap is now also VM-scoped and enforces ordering that it should be
decided before vCPUs are created. Whether the cap is enabled is now
tracked by the new flag KVM_ARCH_ARM_PARTITIONED_PMU_ENABLED.
* Improve log messages when partitioning in the PMUv3 driver.
* Introduce a global variable armv8pmu_hpmn_max in the PMUv3 driver so
KVM code can read if a value was set before the PMU is probed. This
is needed to properly test if we have the capability before vCPUs
are created.
* Make it possible for a VMM to filter the HPMN0 feature bit.
* Fix event filter problems with PMEVTYPER handling in
writethrough_pmevtyper() and kvm_pmu_apply_event_filter() by using
kvm_pmu_event_mask() in the right spots. And if an event is
filtered, write the physical register with the appropriate exclude
bits set but keep the virtual register exactly what the guest wrote.
* Fix register access problems with the PMU register fast path handler
by lifting some static PMU access checks from sys_regs.c to use them
in the fast path too and make bit masking more strict for better ARM
compliance.
* Fix the readability and logic of programming the MDCR_EL2 register
when entering the guest. Make sure to set the HPME bit to allow host
counters to count guest events. Set TPM and TPMCR by default and
clear them if partitioning is enabled rather than the previous
inverted logic of leaving them clear and setting them if
partitioning is not enabled. Make the HPMN field computation more
clear.
* As part of lazy context switching, do a load when the guest is
switching to physical access to ensure any previous writes that only
reached the virtual registers reach the physical ones as well and
are not clobbered by the next vcpu_put().
* Other fixes and improvements that are too small to mention or left
out from my personal notes.
v4:
https://lore.kernel.org/kvmarm/20250714225917.1396543-1-coltonlewis@google.…
v3:
https://lore.kernel.org/kvm/20250626200459.1153955-1-coltonlewis@google.com/
v2:
https://lore.kernel.org/kvm/20250620221326.1261128-1-coltonlewis@google.com/
v1:
https://lore.kernel.org/kvm/20250602192702.2125115-1-coltonlewis@google.com/
[1] https://gitlab.com/qemu-project/kvm-forum/-/raw/main/_attachments/2025/Opti…
[2] https://www.youtube.com/watch?v=YRzZ8jMIA6M&list=PLW3ep1uCIRfxwmllXTOA2txfD…
Anish Ghulati (1):
KVM: arm64: Move arm_{psci,hypercalls}.h to an internal KVM path
Colton Lewis (20):
arm64: cpufeature: Add cpucap for HPMN0
KVM: arm64: Reorganize PMU functions
perf: arm_pmuv3: Introduce method to partition the PMU
perf: arm_pmuv3: Generalize counter bitmasks
perf: arm_pmuv3: Keep out of guest counter partition
KVM: arm64: Set up FGT for Partitioned PMU
KVM: arm64: Writethrough trapped PMEVTYPER register
KVM: arm64: Use physical PMSELR for PMXEVTYPER if partitioned
KVM: arm64: Writethrough trapped PMOVS register
KVM: arm64: Write fast path PMU register handlers
KVM: arm64: Setup MDCR_EL2 to handle a partitioned PMU
KVM: arm64: Account for partitioning in PMCR_EL0 access
KVM: arm64: Context swap Partitioned PMU guest registers
KVM: arm64: Enforce PMU event filter at vcpu_load()
KVM: arm64: Implement lazy PMU context swaps
perf: arm_pmuv3: Handle IRQs for Partitioned PMU guest counters
KVM: arm64: Inject recorded guest interrupts
KVM: arm64: Add KVM_CAP to partition the PMU
KVM: selftests: Add find_bit to KVM library
KVM: arm64: selftests: Add test case for partitioned PMU
Marc Zyngier (1):
KVM: arm64: Reorganize PMU includes
Sean Christopherson (2):
KVM: arm64: Include KVM headers to get forward declarations
KVM: arm64: Move ARM specific headers in include/kvm to arch directory
Documentation/virt/kvm/api.rst | 24 +
arch/arm/include/asm/arm_pmuv3.h | 28 +
arch/arm64/include/asm/arm_pmuv3.h | 61 +-
.../arm64/include/asm/kvm_arch_timer.h | 2 +
arch/arm64/include/asm/kvm_host.h | 24 +-
.../arm64/include/asm/kvm_pmu.h | 142 ++++
arch/arm64/include/asm/kvm_types.h | 7 +-
.../arm64/include/asm/kvm_vgic.h | 0
arch/arm64/kernel/cpufeature.c | 8 +
arch/arm64/kvm/Makefile | 2 +-
arch/arm64/kvm/arch_timer.c | 5 +-
arch/arm64/kvm/arm.c | 23 +-
{include => arch/arm64}/kvm/arm_hypercalls.h | 0
{include => arch/arm64}/kvm/arm_psci.h | 0
arch/arm64/kvm/config.c | 34 +-
arch/arm64/kvm/debug.c | 31 +-
arch/arm64/kvm/guest.c | 2 +-
arch/arm64/kvm/handle_exit.c | 2 +-
arch/arm64/kvm/hyp/Makefile | 6 +-
arch/arm64/kvm/hyp/include/hyp/switch.h | 211 ++++-
arch/arm64/kvm/hyp/nvhe/switch.c | 4 +-
arch/arm64/kvm/hyp/vhe/switch.c | 4 +-
arch/arm64/kvm/hypercalls.c | 4 +-
arch/arm64/kvm/pmu-direct.c | 464 +++++++++++
arch/arm64/kvm/pmu-emul.c | 678 +---------------
arch/arm64/kvm/pmu.c | 726 ++++++++++++++++++
arch/arm64/kvm/psci.c | 4 +-
arch/arm64/kvm/pvtime.c | 2 +-
arch/arm64/kvm/reset.c | 3 +-
arch/arm64/kvm/sys_regs.c | 110 +--
arch/arm64/kvm/trace_arm.h | 2 +-
arch/arm64/kvm/trng.c | 2 +-
arch/arm64/kvm/vgic/vgic-debug.c | 2 +-
arch/arm64/kvm/vgic/vgic-init.c | 2 +-
arch/arm64/kvm/vgic/vgic-irqfd.c | 2 +-
arch/arm64/kvm/vgic/vgic-kvm-device.c | 2 +-
arch/arm64/kvm/vgic/vgic-mmio-v2.c | 2 +-
arch/arm64/kvm/vgic/vgic-mmio-v3.c | 2 +-
arch/arm64/kvm/vgic/vgic-mmio.c | 4 +-
arch/arm64/kvm/vgic/vgic-v2.c | 2 +-
arch/arm64/kvm/vgic/vgic-v3-nested.c | 3 +-
arch/arm64/kvm/vgic/vgic-v3.c | 2 +-
arch/arm64/kvm/vgic/vgic-v5.c | 2 +-
arch/arm64/tools/cpucaps | 1 +
arch/arm64/tools/sysreg | 6 +-
drivers/perf/arm_pmuv3.c | 137 +++-
include/linux/perf/arm_pmu.h | 1 +
include/linux/perf/arm_pmuv3.h | 14 +-
include/uapi/linux/kvm.h | 1 +
tools/include/uapi/linux/kvm.h | 1 +
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../selftests/kvm/arm64/vpmu_counter_access.c | 77 +-
tools/testing/selftests/kvm/lib/find_bit.c | 1 +
53 files changed, 2049 insertions(+), 831 deletions(-)
rename include/kvm/arm_arch_timer.h => arch/arm64/include/asm/kvm_arch_timer.h (98%)
rename include/kvm/arm_pmu.h => arch/arm64/include/asm/kvm_pmu.h (61%)
rename include/kvm/arm_vgic.h => arch/arm64/include/asm/kvm_vgic.h (100%)
rename {include => arch/arm64}/kvm/arm_hypercalls.h (100%)
rename {include => arch/arm64}/kvm/arm_psci.h (100%)
create mode 100644 arch/arm64/kvm/pmu-direct.c
create mode 100644 tools/testing/selftests/kvm/lib/find_bit.c
base-commit: ac3fd01e4c1efce8f2c054cdeb2ddd2fc0fb150d
--
2.52.0.239.gd5f0c6e74e-goog
The IA32 Emulation support can be either removed from the kernel,
disabled by default or disabled at runtime. Some of x86 selftests
are crashing for all of above thus is_32bit_syscall_supported()
helper is added to skip int80 syscalls if they are not supported.
Slawomir Rosek (2):
selftests/x86/ldt_gdt: Skip int80 if not supported
selftests/x86/ptrace_syscall: Skip int80 if not supported
tools/testing/selftests/x86/ldt_gdt.c | 21 +++++++++++++++++++-
tools/testing/selftests/x86/ptrace_syscall.c | 20 +++++++++++++++++--
2 files changed, 38 insertions(+), 3 deletions(-)
--
2.52.0.305.g3fc767764a-goog
In cgroup v2, a mutual overlap check is required when at least one of two
cpusets is exclusive. However, this check should be relaxed and limited to
cases where both cpusets are exclusive.
This patch ensures that for sibling cpusets A1 (exclusive) and B1
(non-exclusive), change B1 cannot affect A1's exclusivity.
for example. Assume a machine has 4 CPUs (0-3).
root cgroup
/ \
A1 B1
Case 1:
Table 1.1: Before applying the patch
Step | A1's prstate | B1'sprstate |
#1> echo "0-1" > A1/cpuset.cpus | member | member |
#2> echo "root" > A1/cpuset.cpus.partition | root | member |
#3> echo "0" > B1/cpuset.cpus | root invalid | member |
After step #3, A1 changes from "root" to "root invalid" because its CPUs
(0-1) overlap with those requested by B1 (0-3). However, B1 can actually
use CPUs 2-3(from B1's parent), so it would be more reasonable for A1 to
remain as "root."
Table 1.2: After applying the patch
Step | A1's prstate | B1'sprstate |
#1> echo "0-1" > A1/cpuset.cpus | member | member |
#2> echo "root" > A1/cpuset.cpus.partition | root | member |
#3> echo "0" > B1/cpuset.cpus | root | member |
Case 2: (This situation remains unchanged from before)
Table 2.1: Before applying the patch
Step | A1's prstate | B1'sprstate |
#1> echo "0-1" > A1/cpuset.cpus | member | member |
#3> echo "1-2" > B1/cpuset.cpus | member | member |
#2> echo "root" > A1/cpuset.cpus.partition | root invalid | member |
Table 2.2: After applying the patch
Step | A1's prstate | B1'sprstate |
#1> echo "0-1" > A1/cpuset.cpus | member | member |
#3> echo "1-2" > B1/cpuset.cpus | member | member |
#2> echo "root" > A1/cpuset.cpus.partition | root invalid | member |
All other cases remain unaffected. For example, cgroup-v1, both A1 and
B1 are exclusive or non-exlusive.
---
v3 -> v4:
- Adjust the test_cpuset_prt.sh test file to align with the current
behavior.
v2 -> v3:
- Ensure compliance with constraints such as cpuset.cpus.exclusive.
- Link: https://lore.kernel.org/cgroups/20251113131434.606961-1-sunshaojie@kylinos.…
v1 -> v2:
- Keeps the current cgroup v1 behavior unchanged
- Link: https://lore.kernel.org/cgroups/c8e234f4-2c27-4753-8f39-8ae83197efd3@redhat…
---
kernel/cgroup/cpuset-internal.h | 3 ++
kernel/cgroup/cpuset-v1.c | 20 +++++++++
kernel/cgroup/cpuset.c | 43 ++++++++++++++-----
.../selftests/cgroup/test_cpuset_prs.sh | 5 ++-
4 files changed, 58 insertions(+), 13 deletions(-)
--
2.25.1
From: Li RongQing <lirongqing(a)baidu.com>
The softlockup_panic sysctl is currently a binary option: panic immediately
or never panic on soft lockups.
Panicking on any soft lockup, regardless of duration, can be overly
aggressive for brief stalls that may be caused by legitimate operations.
Conversely, never panicking may allow severe system hangs to persist
undetected.
Extend softlockup_panic to accept an integer threshold, allowing the kernel
to panic only when the normalized lockup duration exceeds N watchdog
threshold periods. This provides finer-grained control to distinguish
between transient delays and persistent system failures.
The accepted values are:
- 0: Don't panic (unchanged)
- 1: Panic when duration >= 1 * threshold (20s default, original behavior)
- N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.)
The original behavior is preserved for values 0 and 1, maintaining full
backward compatibility while allowing systems to tolerate brief lockups
while still catching severe, persistent hangs.
Signed-off-by: Li RongQing <lirongqing(a)baidu.com>
---
Documentation/admin-guide/kernel-parameters.txt | 10 +++++-----
arch/arm/configs/aspeed_g5_defconfig | 2 +-
arch/arm/configs/pxa3xx_defconfig | 2 +-
arch/openrisc/configs/or1klitex_defconfig | 2 +-
arch/powerpc/configs/skiroot_defconfig | 2 +-
drivers/gpu/drm/ci/arm.config | 2 +-
drivers/gpu/drm/ci/arm64.config | 2 +-
drivers/gpu/drm/ci/x86_64.config | 2 +-
kernel/watchdog.c | 8 +++++---
lib/Kconfig.debug | 13 +++++++------
tools/testing/selftests/bpf/config | 2 +-
tools/testing/selftests/wireguard/qemu/kernel.config | 2 +-
12 files changed, 26 insertions(+), 23 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a8d0afd..27c5f96 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6934,12 +6934,12 @@ Kernel parameters
softlockup_panic=
[KNL] Should the soft-lockup detector generate panics.
- Format: 0 | 1
+ Format: <int>
- A value of 1 instructs the soft-lockup detector
- to panic the machine when a soft-lockup occurs. It is
- also controlled by the kernel.softlockup_panic sysctl
- and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
+ A value of non-zero instructs the soft-lockup detector
+ to panic the machine when a soft-lockup duration exceeds
+ N thresholds. It is also controlled by the kernel.softlockup_panic
+ sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the
respective build-time switch to that functionality.
softlockup_all_cpu_backtrace=
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 2e6ea13..ec558e5 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_WQ_WATCHDOG=y
# CONFIG_SCHED_DEBUG is not set
diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig
index 07d422f..fb272e3 100644
--- a/arch/arm/configs/pxa3xx_defconfig
+++ b/arch/arm/configs/pxa3xx_defconfig
@@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_KERNEL=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_SHIRQ=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
# CONFIG_SCHED_DEBUG is not set
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig
index fb1eb9a..984b0e3 100644
--- a/arch/openrisc/configs/or1klitex_defconfig
+++ b/arch/openrisc/configs/or1klitex_defconfig
@@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf"
CONFIG_PRINTK_TIME=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BUG_ON_DATA_CORRUPTION=y
diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
index 2b71a6d..a4114fc 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_PANIC_ON_OOPS=y
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
CONFIG_WQ_WATCHDOG=y
diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config
index 411e814..d7c5167 100644
--- a/drivers/gpu/drm/ci/arm.config
+++ b/drivers/gpu/drm/ci/arm.config
@@ -52,7 +52,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=n
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0
CONFIG_FW_LOADER_COMPRESS=y
diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config
index fddfbd4..ea0e307 100644
--- a/drivers/gpu/drm/ci/arm64.config
+++ b/drivers/gpu/drm/ci/arm64.config
@@ -161,7 +161,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config
index 8eaba388..7ac98a7 100644
--- a/drivers/gpu/drm/ci/x86_64.config
+++ b/drivers/gpu/drm/ci/x86_64.config
@@ -47,7 +47,7 @@ CONFIG_TMPFS=y
CONFIG_PROVE_LOCKING=n
CONFIG_DEBUG_LOCKDEP=n
CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 0685e3a..a5fa116 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly;
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);
+ CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC;
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
@@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT);
- if (softlockup_panic)
+ duration = duration / get_softlockup_thresh();
+
+ if (softlockup_panic && duration >= softlockup_panic)
panic("softlockup: hung tasks");
}
@@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "softlockup_sys_info",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ba36939..17a7a77 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM
the CPU stats and the interrupt counts during the "soft lockups".
config BOOTPARAM_SOFTLOCKUP_PANIC
- bool "Panic (Reboot) On Soft Lockups"
+ int "Panic (Reboot) On Soft Lockups"
depends on SOFTLOCKUP_DETECTOR
+ default 0
help
- Say Y here to enable the kernel to panic on "soft lockups",
- which are bugs that cause the kernel to loop in kernel
- mode for more than 20 seconds (configurable using the watchdog_thresh
- sysctl), without giving other tasks a chance to run.
+ Set to a non-zero value N to enable the kernel to panic on "soft
+ lockups", which are bugs that cause the kernel to loop in kernel
+ mode for more than (N * 20 seconds) (configurable using the
+ watchdog_thresh sysctl), without giving other tasks a chance to run.
The panic can be used in combination with panic_timeout,
to cause the system to reboot automatically after a
@@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC
high-availability systems that have uptime guarantees and
where a lockup must be resolved ASAP.
- Say N if unsure.
+ Say 0 if unsure.
config HAVE_HARDLOCKUP_DETECTOR_BUDDY
bool
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 558839e..2485538 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -1,6 +1,6 @@
CONFIG_BLK_DEV_LOOP=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BPF=y
CONFIG_BPF_EVENTS=y
CONFIG_BPF_JIT=y
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
index 0504c11..bb89d2d 100644
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_WQ_WATCHDOG=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
-CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_PANIC_TIMEOUT=-1
CONFIG_STACKTRACE=y
--
2.9.4
The resctrl selftest currently exhibits several failures on Hygon CPUs
due to missing vendor detection and edge-case handling specific to
Hygon's architecture.
This patch series addresses three distinct issues:
1. A division-by-zero crash in SNC detection on some platforms (e.g.,
Hygon).
2. Missing CPU vendor detection, causing the test to fail with
"# Can not get vendor info..." on Hygon CPUs.
3. Incorrect handling of non-contiguous CBM support on Hygon CPUs.
These changes enable resctrl selftest to run successfully on
Hygon CPUs that support Platform QoS features.
Maintainer notes:
-----------------
Patch 1: selftests/resctrl: Fix a division by zero error on Hygon
- This is a candidate for backport with "Fixes:" tag.
Patch 2: selftests/resctrl: Define CPU vendor IDs as bits to match usage
- This is *not* a candidate for backport since it is an enhancement and
preparatory patch for patch 3.
Patch 3: selftests/resctrl: Add CPU vendor detection for Hygon
Patch 4: selftests/resctrl: Fix non-contiguous CBM check for Hygon
- Even though they are fixes they are *not* candidates for backport
since they are based on another patch series (x86/resctrl: Fix
Platform QoS issues for Hygon) which is in process of being added to
resctrl.
-----------------
Changelog:
v5:
- Patch 2:
1. Fix a nit of "reverse fir ordering" of the variable declarations in
detect_vendor() in v4 patch series (Reinette).
2. Add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>.
v4:
- Cover letter: add maintainer notes outlining how these patches to be
handled (Reinette).
- Re-organize the patch series to move original patch 3 to the beginning
of series. The patch order has changed between v3 and v4 (Reinette):
v3 -> v4
patch #3 -> patch #1
patch #1 -> patch #2
patch #2 -> patch #3
patch #4 -> patch #4
- Patch 2:
1. Resolve a conflict against latest upstream kernel (Reinette).
2. Fix a nit to maintain the reverse fir ordering of variables in
detect_vendor() (Reinette).
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 4: move the maintainer note into the cover letter (Reinette).
v3:
- Patch 1:
1. Update the return types of detect_vendor() and get_vendor() from
'int' to 'unsigned int' to align with their usage as bitmask values
and to prevent potentially risky type conversions (Fenghua).
2. Split the code changes of "define CPU vendor IDs as bits to match
usage" from original patch 1 into a separate patch (this patch,
suggested by Fenghua and Reinette).
3. Introduce the flag 'initialized' to simplify the get_vendor() ->
detect_vendor() logic (Reinette).
- Patch 2 (original patch 1):
1. Move the code changes of "define CPU vendor IDs as bits to match
usage" into patch 1.
- Patch 3 (original patch 2):
1. Fix a nit of code comment for affected platforms (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
- Patch 4 (original patch 3):
1. Fix a nit to avoid calling get_vendor() twice (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
v2:
- Patch 1: switch all of the vendor id bitmasks to use BIT() (Reinette)
- Patch 2: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
add a maintainer note to highlight it is not a candidate for
backport (Reinette)
Xiaochen Shen (4):
selftests/resctrl: Fix a division by zero error on Hygon
selftests/resctrl: Define CPU vendor IDs as bits to match usage
selftests/resctrl: Add CPU vendor detection for Hygon
selftests/resctrl: Fix non-contiguous CBM check for Hygon
tools/testing/selftests/resctrl/cat_test.c | 6 ++--
tools/testing/selftests/resctrl/resctrl.h | 8 ++++--
.../testing/selftests/resctrl/resctrl_tests.c | 28 +++++++++++++------
tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++++
4 files changed, 39 insertions(+), 13 deletions(-)
--
2.47.3
The resctrl selftest currently exhibits several failures on Hygon CPUs
due to missing vendor detection and edge-case handling specific to
Hygon's architecture.
This patch series addresses three distinct issues:
1. A division-by-zero crash in SNC detection on some platforms (e.g.,
Hygon).
2. Missing CPU vendor detection, causing the test to fail with
"# Can not get vendor info..." on Hygon CPUs.
3. Incorrect handling of non-contiguous CBM support on Hygon CPUs.
These changes enable resctrl selftest to run successfully on
Hygon CPUs that support Platform QoS features.
Maintainer notes:
-----------------
Patch 1: selftests/resctrl: Fix a division by zero error on Hygon
- This is a candidate for backport with "Fixes:" tag.
Patch 2: selftests/resctrl: Define CPU vendor IDs as bits to match usage
- This is *not* a candidate for backport since it is an enhancement and
preparatory patch for patch 3.
Patch 3: selftests/resctrl: Add CPU vendor detection for Hygon
Patch 4: selftests/resctrl: Fix non-contiguous CBM check for Hygon
- Even though they are fixes they are *not* candidates for backport
since they are based on another patch series (x86/resctrl: Fix
Platform QoS issues for Hygon) which is in process of being added to
resctrl.
-----------------
Changelog:
v4:
- Cover letter: add maintainer notes outlining how these patches to be
handled (Reinette).
- Re-organize the patch series to move original patch 3 to the beginning
of series. The patch order has changed between v3 and v4 (Reinette):
v3 -> v4
patch #3 -> patch #1
patch #1 -> patch #2
patch #2 -> patch #3
patch #4 -> patch #4
- Patch 2:
1. Resolve a conflict against latest upstream kernel (Reinette).
2. Fix a nit to maintain the reverse fir ordering of variables in
detect_vendor() (Reinette).
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 4: move the maintainer note into the cover letter (Reinette).
v3:
- Patch 1:
1. Update the return types of detect_vendor() and get_vendor() from
'int' to 'unsigned int' to align with their usage as bitmask values
and to prevent potentially risky type conversions (Fenghua).
2. Split the code changes of "define CPU vendor IDs as bits to match
usage" from original patch 1 into a separate patch (this patch,
suggested by Fenghua and Reinette).
3. Introduce the flag 'initialized' to simplify the get_vendor() ->
detect_vendor() logic (Reinette).
- Patch 2 (original patch 1):
1. Move the code changes of "define CPU vendor IDs as bits to match
usage" into patch 1.
- Patch 3 (original patch 2):
1. Fix a nit of code comment for affected platforms (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
- Patch 4 (original patch 3):
1. Fix a nit to avoid calling get_vendor() twice (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
v2:
- Patch 1: switch all of the vendor id bitmasks to use BIT() (Reinette)
- Patch 2: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
add a maintainer note to highlight it is not a candidate for
backport (Reinette)
Xiaochen Shen (4):
selftests/resctrl: Fix a division by zero error on Hygon
selftests/resctrl: Define CPU vendor IDs as bits to match usage
selftests/resctrl: Add CPU vendor detection for Hygon
selftests/resctrl: Fix non-contiguous CBM check for Hygon
tools/testing/selftests/resctrl/cat_test.c | 6 ++--
tools/testing/selftests/resctrl/resctrl.h | 8 ++++--
.../testing/selftests/resctrl/resctrl_tests.c | 28 +++++++++++++------
tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++++
4 files changed, 39 insertions(+), 13 deletions(-)
--
2.47.3
This series extends BPF's cryptographic capabilities by adding kfuncs for
SHA hashing and ECDSA signature verification. These functions enable BPF
programs to perform cryptographic operations for use cases such as content
verification, integrity checking, and data authentication.
BPF programs increasingly need to verify data integrity and authenticity in
networking, security, and observability contexts. While BPF already supports
symmetric encryption/decryption, it lacks support for:
1. Cryptographic hashing - needed for content verification, fingerprinting,
and preparing message digests for signature operations
2. Asymmetric signature verification - needed to verify signed data without
requiring the signing key in the datapath
These capabilities enable use cases such as:
- Verifying signed network packets or application data in XDP/TC programs
- Implementing integrity checks in tracing and security monitoring
- Building zero-trust security models where BPF programs verify credentials
- Content-addressed storage and deduplication in BPF-based filesystems
Implementation:
The implementation follows BPF's existing crypto patterns:
1. Uses bpf_dynptr for safe memory access without page fault risks
2. Leverages the kernel's existing crypto library (lib/crypto/sha256.c and
crypto/ecdsa.c) rather than reimplementing algorithms
3. Provides context-based API for ECDSA to enable key reuse and support
multiple program types (syscall, XDP, TC)
4. Includes comprehensive selftests with NIST test vectors
Patch 1: bpf: Extend bpf_crypto_type with hash operations
- Adds hash operation callbacks to bpf_crypto_type structure
- Adds hash() and digestsize() function pointers
- Must come before crypto module to maintain bisectability
Patch 2: crypto: Add BPF hash algorithm type registration module
- Adds bpf_crypto_shash module in crypto/ subsystem
- Registers hash type with BPF crypto infrastructure
- Enables hash algorithm access through unified bpf_crypto_type interface
- Implements callbacks: alloc_tfm, free_tfm, hash, digestsize, get_flags
- Manages shash_desc lifecycle internally
Patch 3: bpf: Add SHA hash kfunc for cryptographic hashing
- Adds bpf_crypto_hash() kfunc for SHA-256/384/512
- Updates bpf_crypto_ctx_create() to support keyless operations
- Protected by CONFIG_CRYPTO_HASH2 guards
- Uses kernel's crypto library implementations
- Fixed u64 types for dynptr sizes to prevent truncation
Patch 4: selftests/bpf: Add tests for bpf_crypto_hash kfunc
- Tests basic functionality with NIST "abc" test vectors
- Validates error handling for invalid parameters (zero-length input)
- Ensures correct hash output for SHA-256, SHA-384, and SHA-512
- Adds CONFIG_CRYPTO_HASH2 and CONFIG_CRYPTO_SHA512 to selftest config
- Refactored test setup code to reduce duplication
Patch 5: bpf: Add ECDSA signature verification kfuncs
- Context-based API: bpf_ecdsa_ctx_create/acquire/release pattern
- Supports NIST curves (P-256, P-384, P-521)
- Adds bpf_ecdsa_verify() for signature verification
- Includes size query functions: keysize, digestsize, maxsize
- Enables use in non-sleepable contexts via pre-allocated contexts
- Uses crypto_sig API with p1363 format (r || s signatures)
Patch 6: selftests/bpf: Add tests for ECDSA signature verification
- Tests valid signature acceptance with RFC 6979 test vectors for P-256
- Tests invalid signature rejection
- Tests size query functions (keysize, digestsize, maxsize)
- Uses well-known NIST test vectors with "sample" message
- Adds CONFIG_CRYPTO_ECDSA to selftest config
v2:
- Fixed redundant __bpf_dynptr_is_rdonly() checks (Vadim)
- Added BPF hash algorithm type registration module in crypto/ subsystem
- Added CONFIG_CRYPTO_HASH2 guards around bpf_crypto_hash() kfunc and its
BTF registration, matching the pattern used for CONFIG_CRYPTO_ECDSA
- Added mandatory digestsize validation for hash operations
v3:
- Fixed patch ordering - header changes now in separate first commit before
crypto module to ensure bisectability (bot+bpf-ci)
- Fixed type mismatch - changed u32 to u64 for dynptr sizes in
bpf_crypto_hash() to match __bpf_dynptr_size() return type (Mykyta)
- Added CONFIG_CRYPTO_ECDSA to selftest config (Song)
- Refactored test code duplication with setup_skel() helper (Song)
- Added copyright notices to all new files
Daniel Hodges (6):
bpf: Extend bpf_crypto_type with hash operations
crypto: Add BPF hash algorithm type registration module
bpf: Add SHA hash kfunc for cryptographic hashing
selftests/bpf: Add tests for bpf_crypto_hash kfunc
bpf: Add ECDSA signature verification kfuncs
selftests/bpf: Add tests for ECDSA signature verification kfuncs
crypto/Makefile | 3 +
crypto/bpf_crypto_shash.c | 95 ++++++
include/linux/bpf_crypto.h | 2 +
kernel/bpf/crypto.c | 306 +++++++++++++++++-
tools/testing/selftests/bpf/config | 3 +
.../selftests/bpf/prog_tests/crypto_hash.c | 147 +++++++++
.../selftests/bpf/prog_tests/ecdsa_verify.c | 75 +++++
.../testing/selftests/bpf/progs/crypto_hash.c | 142 ++++++++
.../selftests/bpf/progs/ecdsa_verify.c | 160 +++++++++
9 files changed, 925 insertions(+), 8 deletions(-)
create mode 100644 crypto/bpf_crypto_shash.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/crypto_hash.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/ecdsa_verify.c
create mode 100644 tools/testing/selftests/bpf/progs/crypto_hash.c
create mode 100644 tools/testing/selftests/bpf/progs/ecdsa_verify.c
--
2.51.0
When the selftest 'tap.c' is compiled with '-D_FORTIFY_SOURCE=3',
the strcpy() in rtattr_add_strsz() is replaced with a checked
version which causes the test to consistently fail when compiled
with toolchains for which this option is enabled by default.
TAP version 13
1..3
# Starting 3 tests from 1 test cases.
# RUN tap.test_packet_valid_udp_gso ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_gso: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_gso
not ok 1 tap.test_packet_valid_udp_gso
# RUN tap.test_packet_valid_udp_csum ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_csum: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_csum
not ok 2 tap.test_packet_valid_udp_csum
# RUN tap.test_packet_crash_tap_invalid_eth_proto ...
*** buffer overflow detected ***: terminated
# test_packet_crash_tap_invalid_eth_proto: Test terminated by assertion
# FAIL tap.test_packet_crash_tap_invalid_eth_proto
not ok 3 tap.test_packet_crash_tap_invalid_eth_proto
# FAILED: 0 / 3 tests passed.
# Totals: pass:0 fail:3 xfail:0 xpass:0 skip:0 error:0
A buffer overflow is detected by the fortified glibc __strcpy_chk()
since the __builtin_object_size() of `RTA_DATA(rta)` is incorrectly
reported as 1, even though there is ample space in its bounding
buffer `req`.
Additionally, given that IFLA_IFNAME also expects a null-terminated
string, callers of rtaddr_add_str{,sz}() could simply use the
rtaddr_add_strsz() variant. (which has been renamed to remove the
trailing `sz`) memset() has been used for this function since it
is unchecked and thus circumvents the issue discussed in the
previous paragraph.
Fixes: 2e64fe4624d1 ("selftests: add few test cases for tap driver")
Signed-off-by: Alice C. Munduruca <alice.munduruca(a)canonical.com>
Reviewed-by: Cengiz Can <cengiz.can(a)canonical.com>
---
tools/testing/selftests/net/tap.c | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/tools/testing/selftests/net/tap.c b/tools/testing/selftests/net/tap.c
index 247c3b3ac1c9..51a209014f1c 100644
--- a/tools/testing/selftests/net/tap.c
+++ b/tools/testing/selftests/net/tap.c
@@ -56,18 +56,12 @@ static void rtattr_end(struct nlmsghdr *nh, struct rtattr *attr)
static struct rtattr *rtattr_add_str(struct nlmsghdr *nh, unsigned short type,
const char *s)
{
- struct rtattr *rta = rtattr_add(nh, type, strlen(s));
+ unsigned int strsz = strlen(s) + 1;
+ struct rtattr *rta;
- memcpy(RTA_DATA(rta), s, strlen(s));
- return rta;
-}
-
-static struct rtattr *rtattr_add_strsz(struct nlmsghdr *nh, unsigned short type,
- const char *s)
-{
- struct rtattr *rta = rtattr_add(nh, type, strlen(s) + 1);
+ rta = rtattr_add(nh, type, strsz);
- strcpy(RTA_DATA(rta), s);
+ memcpy(RTA_DATA(rta), s, strsz);
return rta;
}
@@ -119,7 +113,7 @@ static int dev_create(const char *dev, const char *link_type,
link_info = rtattr_begin(&req.nh, IFLA_LINKINFO);
- rtattr_add_strsz(&req.nh, IFLA_INFO_KIND, link_type);
+ rtattr_add_str(&req.nh, IFLA_INFO_KIND, link_type);
if (fill_info_data) {
info_data = rtattr_begin(&req.nh, IFLA_INFO_DATA);
--
2.48.1
The templated test names in psp.py had a bug that was not exposed
until 80970e0fc07e ("selftests: net: py: extract the case generation
logic") changed the order of test case evaluation and test case name
extraction.
The test cases created in psp_ip_ver_test_builder() and
ipver_test_builder() were only assigning formatted names to the test
cases they returned, when the test itself was run. This series moves
the test case naming to the point where the test function is created.
Using netdevsim psp:
Before:
./tools/testing/selftests/drivers/net/psp.py
TAP version 13
1..28
ok 1 psp.test_case
ok 2 psp.test_case
ok 3 psp.test_case
ok 4 psp.test_case
ok 5 psp.test_case
ok 6 psp.test_case
ok 7 psp.test_case
ok 8 psp.test_case
ok 9 psp.test_case
ok 10 psp.test_case
ok 11 psp.dev_list_devices
...
ok 28 psp.removal_device_bi
# Totals: pass:28 fail:0 xfail:0 xpass:0 skip:0 error:0
#
# Responder logs (0):
# STDERR:
# Set PSP enable on device 3 to 0xf
# Set PSP enable on device 3 to 0x0
After:
./tools/testing/selftests/drivers/net/psp.py
TAP version 13
1..28
ok 1 psp.data_basic_send_v0_ip4
ok 2 psp.data_basic_send_v0_ip6
ok 3 psp.data_basic_send_v1_ip4
ok 4 psp.data_basic_send_v1_ip6
ok 5 psp.data_basic_send_v2_ip4
ok 6 psp.data_basic_send_v2_ip6
ok 7 psp.data_basic_send_v3_ip4
ok 8 psp.data_basic_send_v3_ip6
ok 9 psp.data_mss_adjust_ip4
ok 10 psp.data_mss_adjust_ip6
ok 11 psp.dev_list_devices
...
ok 28 psp.removal_device_bi
# Totals: pass:28 fail:0 xfail:0 xpass:0 skip:0 error:0
#
# Responder logs (0):
# STDERR:
# Set PSP enable on device 3 to 0xf
# Set PSP enable on device 3 to 0x0
Signed-off-by: Daniel Zahka <daniel.zahka(a)gmail.com>
---
Daniel Zahka (2):
selftests: drv-net: psp: fix templated test names in psp_ip_ver_test_builder()
selftests: drv-net: psp: fix test names in ipver_test_builder()
tools/testing/selftests/drivers/net/psp.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
---
base-commit: 885bebac9909994050bbbeed0829c727e42bd1b7
change-id: 20251212-psp-test-fix-f0816c40a2c1
Best regards,
--
Daniel Zahka <daniel.zahka(a)gmail.com>
Make use of empty (NULL-terminated) array instead of NULL pointer to
avoid compiler errors while maintaining the behavior of the function
intact
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/filesystems$ make LLVM=1 W=1
CC devpts_pts
CC file_stressor
CC anon_inode_test
anon_inode_test.c:45:37: warning: null passed to a callee that requires a non-null argument [-Wnonnull]
45 | ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
| ^~~~
/usr/lib/llvm-18/lib/clang/18/include/__stddef_null.h:26:14: note: expanded from macro 'NULL'
26 | #define NULL ((void*)0)
| ^~~~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:535:11: note: expanded from macro 'ASSERT_LT'
535 | __EXPECT(expected, #expected, seen, #seen, <, 1)
| ^~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:758:33: note: expanded from macro '__EXPECT'
758 | __typeof__(_expected) __exp = (_expected); \
| ^~~~~~~~~
1 warning generated.
tools/testing/selftests/filesystems/anon_inode_test.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
index 94c6c81c2..2c4c50500 100644
--- a/tools/testing/selftests/filesystems/anon_inode_test.c
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -42,7 +42,10 @@ TEST(anon_inode_no_exec)
fd_context = sys_fsopen("tmpfs", 0);
ASSERT_GE(fd_context, 0);
- ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
+ char *const empty_argv[] = {NULL};
+ char *const empty_envp[] = {NULL};
+
+ ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0);
ASSERT_EQ(errno, EACCES);
EXPECT_EQ(close(fd_context), 0);
--
2.43.0
Use __builtin_trap() to truly crash the program instead of dereferencing
null pointer which may be optimized by the compiler preventing the crash
from occurring
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/coredump$ make LLVM=1 W=1
CC stackdump_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_protocol_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
index a6f6d5f2a..5c8adee63 100644
--- a/tools/testing/selftests/coredump/coredump_test_helpers.c
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -56,7 +56,7 @@ void crashing_child(void)
pthread_create(&thread, NULL, do_nothing, NULL);
/* crash on purpose */
- i = *(int *)NULL;
+ __builtin_trap();
}
int create_detached_tmpfs(void)
--
2.43.0
Add descriptive message in the _Static_assert to comply with the C11
standard requirement to prevent compiler from throwing out error. The
compiler throws an error when _Static_assert is used without a message as
that is a C23 extension.
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk$ make LLVM=1 W=1
CC kublk
In file included from kublk.c:6:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from null.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from file_backed.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from common.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from stripe.c:3:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
In file included from fault_inject.c:11:
./kublk.h:220:43: error: '_Static_assert' with no message is a C23 extension [-Werror,-Wc23-extensions]
220 | _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
| ^
| , ""
1 error generated.
make: *** [../lib.mk:225: /home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/ublk/kublk] Error 1
tools/testing/selftests/ublk/kublk.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index fe42705c6..e5eb5f762 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -217,7 +217,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op,
unsigned tgt_data, unsigned q_id, unsigned is_target_io)
{
/* we only have 7 bits to encode q_id */
- _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7);
+ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7");
assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
return tag | (op << 16) | (tgt_data << 24) |
--
2.43.0
On the Android arm32 platform, when performing the futex_requeue test, it will
most likely return a failure. The specific reason is detailed in a commit[1]
previously submitted by Edward Liaw. However, this commit cannot perfectly
solve the problem. This is because using a barrier does not guarantee that
the child thread will wait on futex_wait.
This series of patches attempts to solve this problem by checking whether
the child thread is in a sleeping state. This is because when the child thread
goes to sleep, it indicates that it is waiting for the futex lock.
v1->v2:
- Solve the compilation problems found by the kernel test robot
- Cleanup the atomic library code for futex test
Link: https://lore.kernel.org/all/20240918231102.234253-1-edliaw@google.com/
In the thread_state_get() function, the logic to find the thread's state
character was using `sizeof(header) - 1` to calculate the offset from
the "State:\t" string.
The `header` variable is a `const char *` pointer. `sizeof()` on a
pointer returns the size of the pointer itself, not the length of the
string literal it points to. This makes the code's behavior dependent
on the architecture's pointer size.
This bug was identified on a 32-bit ARM build (`gsi_tv_arm`) for
Android, running on an ARMv8-based device, compiled with Clang 19.0.1.
On this 32-bit architecture, `sizeof(char *)` is 4. The expression
`sizeof(header) - 1` resulted in an incorrect offset of 3, causing the
test to read the wrong character from `/proc/[tid]/status` and fail.
On 64-bit architectures, `sizeof(char *)` is 8, so the expression
coincidentally evaluates to 7, which matches the length of "State:\t".
This is why the bug likely remained hidden on 64-bit builds.
To fix this and make the code portable and correct across all
architectures, this patch replaces `sizeof(header) - 1` with
`strlen(header)`. The `strlen()` function correctly calculates the
string's length, ensuring the correct offset is always used.
Signed-off-by: Wake Liu <wakel(a)google.com>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index f4807242c5b2..6f5e404a446c 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1317,7 +1317,7 @@ static thread_state thread_state_get(pid_t tid)
p = strstr(tmp, header);
if (p) {
/* For example, "State:\tD (disk sleep)" */
- c = *(p + sizeof(header) - 1);
+ c = *(p + strlen(header));
return c == 'D' ?
THR_STATE_UNINTERRUPTIBLE : THR_STATE_UNKNOWN;
}
--
2.52.0.223.gf5cc29aaa4-goog
This series adds namespace support to vhost-vsock and loopback. It does
not add namespaces to any of the other guest transports (virtio-vsock,
hyperv, or vmci).
The current revision supports two modes: local and global. Local
mode is complete isolation of namespaces, while global mode is complete
sharing between namespaces of CIDs (the original behavior).
The mode is set using /proc/sys/net/vsock/ns_mode.
Modes are per-netns and write-once. This allows a system to configure
namespaces independently (some may share CIDs, others are completely
isolated). This also supports future possible mixed use cases, where
there may be namespaces in global mode spinning up VMs while there are
mixed mode namespaces that provide services to the VMs, but are not
allowed to allocate from the global CID pool (this mode is not
implemented in this series).
If a socket or VM is created when a namespace is global but the
namespace changes to local, the socket or VM will continue working
normally. That is, the socket or VM assumes the mode behavior of the
namespace at the time the socket/VM was created. The original mode is
captured in vsock_create() and so occurs at the time of socket(2) and
accept(2) for sockets and open(2) on /dev/vhost-vsock for VMs. This
prevents a socket/VM connection from suddenly breaking due to a
namespace mode change. Any new sockets/VMs created after the mode change
will adopt the new mode's behavior.
Additionally, added tests for the new namespace features:
tools/testing/selftests/vsock/vmtest.sh
1..28
ok 1 vm_server_host_client
ok 2 vm_client_host_server
ok 3 vm_loopback
ok 4 ns_host_vsock_ns_mode_ok
ok 5 ns_host_vsock_ns_mode_write_once_ok
ok 6 ns_global_same_cid_fails
ok 7 ns_local_same_cid_ok
ok 8 ns_global_local_same_cid_ok
ok 9 ns_local_global_same_cid_ok
ok 10 ns_diff_global_host_connect_to_global_vm_ok
ok 11 ns_diff_global_host_connect_to_local_vm_fails
ok 12 ns_diff_global_vm_connect_to_global_host_ok
ok 13 ns_diff_global_vm_connect_to_local_host_fails
ok 14 ns_diff_local_host_connect_to_local_vm_fails
ok 15 ns_diff_local_vm_connect_to_local_host_fails
ok 16 ns_diff_global_to_local_loopback_local_fails
ok 17 ns_diff_local_to_global_loopback_fails
ok 18 ns_diff_local_to_local_loopback_fails
ok 19 ns_diff_global_to_global_loopback_ok
ok 20 ns_same_local_loopback_ok
ok 21 ns_same_local_host_connect_to_local_vm_ok
ok 22 ns_same_local_vm_connect_to_local_host_ok
ok 23 ns_mode_change_connection_continue_vm_ok
ok 24 ns_mode_change_connection_continue_host_ok
ok 25 ns_mode_change_connection_continue_both_ok
ok 26 ns_delete_vm_ok
ok 27 ns_delete_host_ok
ok 28 ns_delete_both_ok
SUMMARY: PASS=28 SKIP=0 FAIL=0
Dependent on series:
https://lore.kernel.org/all/20251108-vsock-selftests-fixes-and-improvements…
Thanks again for everyone's help and reviews!
Suggested-by: Sargun Dhillon <sargun(a)sargun.me>
Signed-off-by: Bobby Eshleman <bobbyeshleman(a)gmail.com>
Changes in v12:
- add ns mode checking to _allow() callbacks to reject local mode for
incompatible transports (Stefano)
- flip vhost/loopback to return true for stream_allow() and
seqpacket_allow() in "vsock: add netns support to virtio transports"
(Stefano)
- add VMADDR_CID_ANY + local mode documentation in af_vsock.c (Stefano)
- change "selftests/vsock: add tests for host <-> vm connectivity with
namespaces" to skip test 29 in vsock_test for namespace local
vsock_test calls in a host local-mode namespace. There is a
false-positive edge case for that test encountered with the
->stream_allow() approach. More details in that patch.
- updated cover letter with new test output
- Link to v11: https://lore.kernel.org/r/20251120-vsock-vmtest-v11-0-55cbc80249a7@meta.com
Changes in v11:
- vmtest: add a patch to use ss in wait_for_listener functions and
support vsock, tcp, and unix. Change all patches to use the new
functions.
- vmtest: add a patch to re-use vm dmesg / warn counting functions
- Link to v10: https://lore.kernel.org/r/20251117-vsock-vmtest-v10-0-df08f165bf3e@meta.com
Changes in v10:
- Combine virtio common patches into one (Stefano)
- Resolve vsock_loopback virtio_transport_reset_no_sock() issue
with info->vsk setting. This eliminates the need for skb->cb,
so remove skb->cb patches.
- many line width 80 fixes
- Link to v9: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-0-852787a37bed@meta.com
Changes in v9:
- reorder loopback patch after patch for virtio transport common code
- remove module ordering tests patch because loopback no longer depends
on pernet ops
- major simplifications in vsock_loopback
- added a new patch for blocking local mode for guests, added test case
to check
- add net ref tracking to vsock_loopback patch
- Link to v8: https://lore.kernel.org/r/20251023-vsock-vmtest-v8-0-dea984d02bb0@meta.com
Changes in v8:
- Break generic cleanup/refactoring patches into standalone series,
remove those from this series
- Link to dependency: https://lore.kernel.org/all/20251022-vsock-selftests-fixes-and-improvements…
- Link to v7: https://lore.kernel.org/r/20251021-vsock-vmtest-v7-0-0661b7b6f081@meta.com
Changes in v7:
- fix hv_sock build
- break out vmtest patches into distinct, more well-scoped patches
- change `orig_net_mode` to `net_mode`
- many fixes and style changes in per-patch change sets (see individual
patches for specific changes)
- optimize `virtio_vsock_skb_cb` layout
- update commit messages with more useful descriptions
- vsock_loopback: use orig_net_mode instead of current net mode
- add tests for edge cases (ns deletion, mode changing, loopback module
load ordering)
- Link to v6: https://lore.kernel.org/r/20250916-vsock-vmtest-v6-0-064d2eb0c89d@meta.com
Changes in v6:
- define behavior when mode changes to local while socket/VM is alive
- af_vsock: clarify description of CID behavior
- af_vsock: use stronger langauge around CID rules (dont use "may")
- af_vsock: improve naming of buf/buffer
- af_vsock: improve string length checking on proc writes
- vsock_loopback: add space in struct to clarify lock protection
- vsock_loopback: do proper cleanup/unregister on vsock_loopback_exit()
- vsock_loopback: use virtio_vsock_skb_net() instead of sock_net()
- vsock_loopback: set loopback to NULL after kfree()
- vsock_loopback: use pernet_operations and remove callback mechanism
- vsock_loopback: add macros for "global" and "local"
- vsock_loopback: fix length checking
- vmtest.sh: check for namespace support in vmtest.sh
- Link to v5: https://lore.kernel.org/r/20250827-vsock-vmtest-v5-0-0ba580bede5b@meta.com
Changes in v5:
- /proc/net/vsock_ns_mode -> /proc/sys/net/vsock/ns_mode
- vsock_global_net -> vsock_global_dummy_net
- fix netns lookup in vhost_vsock to respect pid namespaces
- add callbacks for vsock_loopback to avoid circular dependency
- vmtest.sh loads vsock_loopback module
- remove vsock_net_mode_can_set()
- change vsock_net_write_mode() to return true/false based on success
- make vsock_net_mode enum instead of u8
- Link to v4: https://lore.kernel.org/r/20250805-vsock-vmtest-v4-0-059ec51ab111@meta.com
Changes in v4:
- removed RFC tag
- implemented loopback support
- renamed new tests to better reflect behavior
- completed suite of tests with permutations of ns modes and vsock_test
as guest/host
- simplified socat bridging with unix socket instead of tcp + veth
- only use vsock_test for success case, socat for failure case (context
in commit message)
- lots of cleanup
Changes in v3:
- add notion of "modes"
- add procfs /proc/net/vsock_ns_mode
- local and global modes only
- no /dev/vhost-vsock-netns
- vmtest.sh already merged, so new patch just adds new tests for NS
- Link to v2:
https://lore.kernel.org/kvm/20250312-vsock-netns-v2-0-84bffa1aa97a@gmail.com
Changes in v2:
- only support vhost-vsock namespaces
- all g2h namespaces retain old behavior, only common API changes
impacted by vhost-vsock changes
- add /dev/vhost-vsock-netns for "opt-in"
- leave /dev/vhost-vsock to old behavior
- removed netns module param
- Link to v1:
https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
Changes in v1:
- added 'netns' module param to vsock.ko to enable the
network namespace support (disabled by default)
- added 'vsock_net_eq()' to check the "net" assigned to a socket
only when 'netns' support is enabled
- Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
---
Bobby Eshleman (12):
vsock: a per-net vsock NS mode state
vsock: add netns to vsock core
virtio: set skb owner of virtio_transport_reset_no_sock() reply
vsock: add netns support to virtio transports
selftests/vsock: add namespace helpers to vmtest.sh
selftests/vsock: prepare vm management helpers for namespaces
selftests/vsock: add vm_dmesg_{warn,oops}_count() helpers
selftests/vsock: use ss to wait for listeners instead of /proc/net
selftests/vsock: add tests for proc sys vsock ns_mode
selftests/vsock: add namespace tests for CID collisions
selftests/vsock: add tests for host <-> vm connectivity with namespaces
selftests/vsock: add tests for namespace deletion and mode changes
MAINTAINERS | 1 +
drivers/vhost/vsock.c | 59 +-
include/linux/virtio_vsock.h | 12 +-
include/net/af_vsock.h | 57 +-
include/net/net_namespace.h | 4 +
include/net/netns/vsock.h | 17 +
net/vmw_vsock/af_vsock.c | 272 +++++++-
net/vmw_vsock/hyperv_transport.c | 7 +-
net/vmw_vsock/virtio_transport.c | 19 +-
net/vmw_vsock/virtio_transport_common.c | 75 ++-
net/vmw_vsock/vmci_transport.c | 26 +-
net/vmw_vsock/vsock_loopback.c | 23 +-
tools/testing/selftests/vsock/vmtest.sh | 1077 +++++++++++++++++++++++++++++--
13 files changed, 1522 insertions(+), 127 deletions(-)
---
base-commit: 962ac5ca99a5c3e7469215bf47572440402dfd59
change-id: 20250325-vsock-vmtest-b3a21d2102c2
prerequisite-message-id: <20251022-vsock-selftests-fixes-and-improvements-v1-0-edeb179d6463(a)meta.com>
prerequisite-patch-id: a2eecc3851f2509ed40009a7cab6990c6d7cfff5
prerequisite-patch-id: 501db2100636b9c8fcb3b64b8b1df797ccbede85
prerequisite-patch-id: ba1a2f07398a035bc48ef72edda41888614be449
prerequisite-patch-id: fd5cc5445aca9355ce678e6d2bfa89fab8a57e61
prerequisite-patch-id: 795ab4432ffb0843e22b580374782e7e0d99b909
prerequisite-patch-id: 1499d263dc933e75366c09e045d2125ca39f7ddd
prerequisite-patch-id: f92d99bb1d35d99b063f818a19dcda999152d74c
prerequisite-patch-id: e3296f38cdba6d903e061cff2bbb3e7615e8e671
prerequisite-patch-id: bc4662b4710d302d4893f58708820fc2a0624325
prerequisite-patch-id: f8991f2e98c2661a706183fde6b35e2b8d9aedcf
prerequisite-patch-id: 44bf9ed69353586d284e5ee63d6fffa30439a698
prerequisite-patch-id: d50621bc630eeaf608bbaf260370c8dabf6326df
Best regards,
--
Bobby Eshleman <bobbyeshleman(a)meta.com>
This patch series adds __rust_helper to every single rust helper. The
patches do not depend on each other, so maintainers please go ahead and
pick up any patches relevant to your subsystem! Or provide your Acked-by
so that Miguel can pick them up.
These changes were generated by adding __rust_helper and running
ClangFormat. Unrelated formatting changes were removed manually.
Why is __rust_helper needed?
============================
Currently, C helpers cannot be inlined into Rust even when using LTO
because LLVM detects slightly different options on the codegen units.
* LLVM doesn't want to inline functions compiled with
`-fno-delete-null-pointer-checks` with code compiled without. The C
CGUs all have this enabled and Rust CGUs don't. Inlining is okay since
this is one of the hardening features that does not change the ABI,
and we shouldn't have null pointer dereferences in these helpers.
* LLVM doesn't want to inline functions with different list of builtins. C
side has `-fno-builtin-wcslen`; `wcslen` is not a Rust builtin, so
they should be compatible, but LLVM does not perform inlining due to
attributes mismatch.
* clang and Rust doesn't have the exact target string. Clang generates
`+cmov,+cx8,+fxsr` but Rust doesn't enable them (in fact, Rust will
complain if `-Ctarget-feature=+cmov,+cx8,+fxsr` is used). x86-64
always enable these features, so they are in fact the same target
string, but LLVM doesn't understand this and so inlining is inhibited.
This can be bypassed with `--ignore-tti-inline-compatible`, but this
is a hidden option.
(This analysis was written by Gary Guo.)
How is this fixed?
==================
To fix this we need to add __always_inline to all helpers when compiling
with LTO. However, it should not be added when running bindgen as
bindgen will ignore functions marked inline. To achieve this, we are
using a #define called __rust_helper that is defined differently
depending on whether bindgen is running or not.
Note that __rust_helper is currently always #defined to nothing.
Changing it to __always_inline will happen separately in another patch
series.
Signed-off-by: Alice Ryhl <aliceryhl(a)google.com>
---
Alice Ryhl (46):
rust: auxiliary: add __rust_helper to helpers
rust: barrier: add __rust_helper to helpers
rust: binder: add __rust_helper to helpers
rust: bitmap: add __rust_helper to helpers
rust: bitops: add __rust_helper to helpers
rust: blk: add __rust_helper to helpers
rust: bug: add __rust_helper to helpers
rust: clk: add __rust_helper to helpers
rust: completion: add __rust_helper to helpers
rust: cpu: add __rust_helper to helpers
rust: cpufreq: add __rust_helper to helpers
rust: cpumask: add __rust_helper to helpers
rust: cred: add __rust_helper to helpers
rust: device: add __rust_helper to helpers
rust: dma: add __rust_helper to helpers
rust: drm: add __rust_helper to helpers
rust: err: add __rust_helper to helpers
rust: fs: add __rust_helper to helpers
rust: io: add __rust_helper to helpers
rust: irq: add __rust_helper to helpers
rust: jump_label: add __rust_helper to helpers
rust: kunit: add __rust_helper to helpers
rust: maple_tree: add __rust_helper to helpers
rust: mm: add __rust_helper to helpers
rust: of: add __rust_helper to helpers
rust: pci: add __rust_helper to helpers
rust: pid_namespace: add __rust_helper to helpers
rust: platform: add __rust_helper to helpers
rust: poll: add __rust_helper to helpers
rust: processor: add __rust_helper to helpers
rust: property: add __rust_helper to helpers
rust: rbtree: add __rust_helper to helpers
rust: rcu: add __rust_helper to helpers
rust: refcount: add __rust_helper to helpers
rust: regulator: add __rust_helper to helpers
rust: scatterlist: add __rust_helper to helpers
rust: security: add __rust_helper to helpers
rust: slab: add __rust_helper to helpers
rust: sync: add __rust_helper to helpers
rust: task: add __rust_helper to helpers
rust: time: add __rust_helper to helpers
rust: uaccess: add __rust_helper to helpers
rust: usb: add __rust_helper to helpers
rust: wait: add __rust_helper to helpers
rust: workqueue: add __rust_helper to helpers
rust: xarray: add __rust_helper to helpers
rust/helpers/auxiliary.c | 6 +++--
rust/helpers/barrier.c | 6 ++---
rust/helpers/binder.c | 13 ++++-----
rust/helpers/bitmap.c | 6 +++--
rust/helpers/bitops.c | 11 +++++---
rust/helpers/blk.c | 4 +--
rust/helpers/bug.c | 4 +--
rust/helpers/build_bug.c | 2 +-
rust/helpers/clk.c | 24 +++++++++--------
rust/helpers/completion.c | 2 +-
rust/helpers/cpu.c | 2 +-
rust/helpers/cpufreq.c | 3 ++-
rust/helpers/cpumask.c | 32 +++++++++++++---------
rust/helpers/cred.c | 4 +--
rust/helpers/device.c | 16 +++++------
rust/helpers/dma.c | 15 ++++++-----
rust/helpers/drm.c | 7 ++---
rust/helpers/err.c | 6 ++---
rust/helpers/fs.c | 2 +-
rust/helpers/io.c | 64 +++++++++++++++++++++++---------------------
rust/helpers/irq.c | 6 +++--
rust/helpers/jump_label.c | 2 +-
rust/helpers/kunit.c | 2 +-
rust/helpers/maple_tree.c | 3 ++-
rust/helpers/mm.c | 20 +++++++-------
rust/helpers/mutex.c | 13 ++++-----
rust/helpers/of.c | 2 +-
rust/helpers/page.c | 9 ++++---
rust/helpers/pci.c | 13 +++++----
rust/helpers/pid_namespace.c | 8 +++---
rust/helpers/platform.c | 2 +-
rust/helpers/poll.c | 5 ++--
rust/helpers/processor.c | 2 +-
rust/helpers/property.c | 2 +-
rust/helpers/rbtree.c | 5 ++--
rust/helpers/rcu.c | 4 +--
rust/helpers/refcount.c | 10 +++----
rust/helpers/regulator.c | 24 ++++++++++-------
rust/helpers/scatterlist.c | 12 +++++----
rust/helpers/security.c | 26 ++++++++++--------
rust/helpers/signal.c | 2 +-
rust/helpers/slab.c | 14 +++++-----
rust/helpers/spinlock.c | 13 ++++-----
rust/helpers/sync.c | 4 +--
rust/helpers/task.c | 24 ++++++++---------
rust/helpers/time.c | 12 ++++-----
rust/helpers/uaccess.c | 8 +++---
rust/helpers/usb.c | 3 ++-
rust/helpers/vmalloc.c | 7 ++---
rust/helpers/wait.c | 2 +-
rust/helpers/workqueue.c | 8 +++---
rust/helpers/xarray.c | 10 +++----
52 files changed, 280 insertions(+), 226 deletions(-)
---
base-commit: 54e3eae855629702c566bd2e130d9f40e7f35bde
change-id: 20251202-define-rust-helper-f7b531813007
Best regards,
--
Alice Ryhl <aliceryhl(a)google.com>
When the selftest 'tap.c' is compiled with '-D_FORTIFY_SOURCE=3', the
strcpy() in rtattr_add_strsz() is replaced with a checked version which
causes the test to consistently fail when compiled with toolchains for
which this option is enabled by default.
TAP version 13
1..3
# Starting 3 tests from 1 test cases.
# RUN tap.test_packet_valid_udp_gso ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_gso: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_gso
not ok 1 tap.test_packet_valid_udp_gso
# RUN tap.test_packet_valid_udp_csum ...
*** buffer overflow detected ***: terminated
# test_packet_valid_udp_csum: Test terminated by assertion
# FAIL tap.test_packet_valid_udp_csum
not ok 2 tap.test_packet_valid_udp_csum
# RUN tap.test_packet_crash_tap_invalid_eth_proto ...
*** buffer overflow detected ***: terminated
# test_packet_crash_tap_invalid_eth_proto: Test terminated by assertion
# FAIL tap.test_packet_crash_tap_invalid_eth_proto
not ok 3 tap.test_packet_crash_tap_invalid_eth_proto
# FAILED: 0 / 3 tests passed.
# Totals: pass:0 fail:3 xfail:0 xpass:0 skip:0 error:0
A buffer overflow is detected by the fortified glibc __strcpy_chk()
since the __builtin_object_size() of `RTA_DATA(rta)` is incorrectly
reported as 1, even though there is ample space in its bounding buffer
`req`.
Using the unchecked function memcpy() here instead allows us to match
the way rtattr_add_str() is written while avoiding the spurious test
failure.
Fixes: 2e64fe4624d1 ("selftests: add few test cases for tap driver")
Signed-off-by: Alice C. Munduruca <alice.munduruca(a)canonical.com>
---
tools/testing/selftests/net/tap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/tap.c b/tools/testing/selftests/net/tap.c
index 247c3b3ac1c9..dd961b629295 100644
--- a/tools/testing/selftests/net/tap.c
+++ b/tools/testing/selftests/net/tap.c
@@ -67,7 +67,7 @@ static struct rtattr *rtattr_add_strsz(struct nlmsghdr *nh, unsigned short type,
{
struct rtattr *rta = rtattr_add(nh, type, strlen(s) + 1);
- strcpy(RTA_DATA(rta), s);
+ memcpy(RTA_DATA(rta), s, strlen(s) + 1);
return rta;
}
--
2.48.1
From: Jack Thomson <jackabt(a)amazon.com>
This patch series adds ARM64 support for the KVM_PRE_FAULT_MEMORY
feature, which was previously only available on x86 [1]. This allows us
to reduce the number of stage-2 faults during execution. This is of
benefit in post-copy migration scenarios, particularly in memory
intensive applications, where we are experiencing high latencies due to
the stage-2 faults.
Patch Overview:
- The first patch adds support for the KVM_PRE_FAULT_MEMORY ioctl
on arm64.
- The second patch updates the pre_fault_memory_test to support
arm64.
- The last patch extends the pre_fault_memory_test to cover
different vm memory backings.
=== Changes Since v2 [2] ===
- Update fault info synthesize value. Thanks Suzuki
- Remove change to selftests for unaligned mmap allocations. Thanks
Sean
[1]: https://lore.kernel.org/kvm/20240710174031.312055-1-pbonzini@redhat.com
[2]: https://lore.kernel.org/linux-arm-kernel/20251013151502.6679-1-jackabt.amaz…
Jack Thomson (3):
KVM: arm64: Add pre_fault_memory implementation
KVM: selftests: Enable pre_fault_memory_test for arm64
KVM: selftests: Add option for different backing in pre-fault tests
Documentation/virt/kvm/api.rst | 3 +-
arch/arm64/kvm/Kconfig | 1 +
arch/arm64/kvm/arm.c | 1 +
arch/arm64/kvm/mmu.c | 73 +++++++++++-
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../selftests/kvm/pre_fault_memory_test.c | 110 ++++++++++++++----
6 files changed, 159 insertions(+), 30 deletions(-)
base-commit: 8a4821412cf2c1429fffa07c012dd150f2edf78c
--
2.43.0
Since Armv9.6, FEAT_LSUI supplies the load/store instructions for
previleged level to access to access user memory without clearing
PSTATE.PAN bit.
This patchset support FEAT_LSUI and applies in futex atomic operation
and user_swpX emulation where can replace from ldxr/st{l}xr
pair implmentation with clearing PSTATE.PAN bit to correspondant
load/store unprevileged atomic operation without clearing PSTATE.PAN bit.
This patch based on v6.19-rc1
Patch Sequences
================
Patch #1 adds cpufeature for FEAT_LSUI
Patch #2-#3 expose FEAT_LSUI to guest
Patch #4 adds Kconfig for FEAT_LSUI
Patch #5-#6 support futex atomic-op with FEAT_LSUI
Patch #7-#9 support user_swpX emulation with FEAT_LSUI
Patch History
==============
from v10 to v11:
- rebase to v6.19-rc1
- use cast instruction to emulate deprecated swpb instruction
- https://lore.kernel.org/all/20251103163224.818353-1-yeoreum.yun@arm.com/
from v9 to v10:
- apply FEAT_LSUI to user_swpX emulation.
- add test coverage for LSUI bit in ID_AA64ISAR3_EL1
- rebase to v6.18-rc4
- https://lore.kernel.org/all/20250922102244.2068414-1-yeoreum.yun@arm.com/
from v8 to v9:
- refotoring __lsui_cmpxchg64()
- rebase to v6.17-rc7
- https://lore.kernel.org/all/20250917110838.917281-1-yeoreum.yun@arm.com/
from v7 to v8:
- implements futex_atomic_eor() and futex_atomic_cmpxchg() with casalt
with C helper.
- Drop the small optimisation on ll/sc futex_atomic_set operation.
- modify some commit message.
- https://lore.kernel.org/all/20250816151929.197589-1-yeoreum.yun@arm.com/
from v6 to v7:
- wrap FEAT_LSUI with CONFIG_AS_HAS_LSUI in cpufeature
- remove unnecessary addition of indentation.
- remove unnecessary mte_tco_enable()/disable() on LSUI operation.
- https://lore.kernel.org/all/20250811163635.1562145-1-yeoreum.yun@arm.com/
from v5 to v6:
- rebase to v6.17-rc1
- https://lore.kernel.org/all/20250722121956.1509403-1-yeoreum.yun@arm.com/
from v4 to v5:
- remove futex_ll_sc.h futext_lsui and lsui.h and move them to futex.h
- reorganize the patches.
- https://lore.kernel.org/all/20250721083618.2743569-1-yeoreum.yun@arm.com/
from v3 to v4:
- rebase to v6.16-rc7
- modify some patch's title.
- https://lore.kernel.org/all/20250617183635.1266015-1-yeoreum.yun@arm.com/
from v2 to v3:
- expose FEAT_LUSI to guest
- add help section for LUSI Kconfig
- https://lore.kernel.org/all/20250611151154.46362-1-yeoreum.yun@arm.com/
from v1 to v2:
- remove empty v9.6 menu entry
- locate HAS_LUSI in cpucaps in order
- https://lore.kernel.org/all/20250611104916.10636-1-yeoreum.yun@arm.com/
Yeoreum Yun (9):
arm64: cpufeature: add FEAT_LSUI
KVM: arm64: expose FEAT_LSUI to guest
KVM: arm64: kselftest: set_id_regs: add test for FEAT_LSUI
arm64: Kconfig: Detect toolchain support for LSUI
arm64: futex: refactor futex atomic operation
arm64: futex: support futex with FEAT_LSUI
arm64: separate common LSUI definitions into lsui.h
arm64: armv8_deprecated: convert user_swpX to inline function
arm64: armv8_deprecated: apply FEAT_LSUI for swpX emulation.
arch/arm64/Kconfig | 5 +
arch/arm64/include/asm/futex.h | 291 +++++++++++++++---
arch/arm64/include/asm/lsui.h | 25 ++
arch/arm64/kernel/armv8_deprecated.c | 111 +++++--
arch/arm64/kernel/cpufeature.c | 10 +
arch/arm64/kvm/sys_regs.c | 3 +-
arch/arm64/tools/cpucaps | 1 +
.../testing/selftests/kvm/arm64/set_id_regs.c | 1 +
8 files changed, 381 insertions(+), 66 deletions(-)
create mode 100644 arch/arm64/include/asm/lsui.h
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
--
LEVI:{C3F47F37-75D8-414A-A8BA-3980EC8A46D7}
Make use of empty (NULL-terminated) array instead of NULL pointer to
avoid compiler errors while maintaining the behavior of the function
intact
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/filesystems$ make LLVM=1 W=1
CC devpts_pts
CC file_stressor
CC anon_inode_test
anon_inode_test.c:45:37: warning: null passed to a callee that requires a non-null argument [-Wnonnull]
45 | ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
| ^~~~
/usr/lib/llvm-18/lib/clang/18/include/__stddef_null.h:26:14: note: expanded from macro 'NULL'
26 | #define NULL ((void*)0)
| ^~~~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:535:11: note: expanded from macro 'ASSERT_LT'
535 | __EXPECT(expected, #expected, seen, #seen, <, 1)
| ^~~~~~~~
/home/clint/Desktop/kernel-dev/linux-v1/tools/testing/selftests/../../../tools/testing/selftests/kselftest_harness.h:758:33: note: expanded from macro '__EXPECT'
758 | __typeof__(_expected) __exp = (_expected); \
| ^~~~~~~~~
1 warning generated.
tools/testing/selftests/filesystems/anon_inode_test.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
index 94c6c81c2..2c4c50500 100644
--- a/tools/testing/selftests/filesystems/anon_inode_test.c
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -42,7 +42,10 @@ TEST(anon_inode_no_exec)
fd_context = sys_fsopen("tmpfs", 0);
ASSERT_GE(fd_context, 0);
- ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
+ char *const empty_argv[] = {NULL};
+ char *const empty_envp[] = {NULL};
+
+ ASSERT_LT(execveat(fd_context, "", empty_argv, empty_envp, AT_EMPTY_PATH), 0);
ASSERT_EQ(errno, EACCES);
EXPECT_EQ(close(fd_context), 0);
--
2.43.0
Use __builtin_trap() to truly crash the program instead of dereferencing
null pointer which may be optimized by the compiler preventing the crash
from occurring
Signed-off-by: Clint George <clintbgeorge(a)gmail.com>
---
[] Testing:
The diff between before and after of running the kselftest test of the
module shows no regression on system with x86 architecture
Let me know if any more testing is needed to be done
[] Error log:
~/Desktop/kernel-dev/linux-v1/tools/testing/selftests/coredump$ make LLVM=1 W=1
CC stackdump_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
CC coredump_socket_protocol_test
coredump_test_helpers.c:59:6: warning: indirection of non-volatile null pointer will be deleted, not trap [-Wnull-dereference]
59 | i = *(int *)NULL;
| ^~~~~~~~~~~~
coredump_test_helpers.c:59:6: note: consider using __builtin_trap() or qualifying pointer with 'volatile'
1 warning generated.
tools/testing/selftests/coredump/coredump_test_helpers.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
index a6f6d5f2a..5c8adee63 100644
--- a/tools/testing/selftests/coredump/coredump_test_helpers.c
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -56,7 +56,7 @@ void crashing_child(void)
pthread_create(&thread, NULL, do_nothing, NULL);
/* crash on purpose */
- i = *(int *)NULL;
+ __builtin_trap();
}
int create_detached_tmpfs(void)
--
2.43.0
On the Android arm32 platform, when performing the futex_requeue test, it will
most likely return a failure. The specific reason is detailed in a commit[1]
previously submitted by Edward Liaw. However, this commit cannot perfectly
solve the problem. This is because using a barrier does not guarantee that
the child thread will wait on futex_wait.
This series of patches attempts to solve this problem by checking whether
the child thread is in a sleeping state. This is because when the child thread
goes to sleep, it indicates that it is waiting for the futex lock.
Link: https://lore.kernel.org/all/20240918231102.234253-1-edliaw@google.com/
On Thu, 11 Dec 2025, Jiaqi Yan wrote:
> CONFIGs seem alright to me. Do you boot kernel with cmdline options like "default_hugepagesz=1G hugepagesz=1G hugepages=64", or dynamically set up
> huge pages via "echo 64 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"?
Neither of these. When I do the test is skipped:
# echo 64 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
# ./arm64/sea_to_user
Random seed: 0x6b8b4567
# Mapped 0x40000 pages: gva=0x80000000 to gpa=0xff80000000
# Before EINJect: data=0xbaadcafe
# EINJ_GVA=0x81234bad, einj_gpa=0xff81234bad, einj_hva=0xffff41234bad,
einj_hpa=0x80241234bad
ok 1 # SKIP EINJ module probably not loaded?sh: line 1:
/sys/kernel/debug/apei/einj/error_type: No such file or directory
Bail out! Failed to write EINJ entry: No such file or directory (2)
# 1 skipped test(s) detected. Consider enabling relevant config options to
improve coverage.
# Planned tests != run tests (0 != 1)
# Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0
This patch series suggests fixes for several corner cases in the RISC-V
vector ptrace implementation:
- init vector context with proper vlenb, to avoid reading zero vlenb
by an early attached debugger
- follow gdbserver expectations and return ENODATA instead of EINVAL
if vector extension is supported but not yet activated for the
traced process
- validate input vector csr registers in ptrace, to maintain an accurate
view of the tracee's vector context across multiple halt/resume
debug cycles
For detailed description see the appropriate commit messages. A new test
suite validate_v_ptrace is added to the tools/testing/selftests/riscv/vector
to verify some of the vector ptrace functionality and corner cases.
So far tested on the following platforms:
- test in QEMU rv32/rv64
- test on c908 (BananaPi CanMV K230D Zero)
- test on c906 (MangoPi MQ Pro)
Previous versions:
- v4: https://lore.kernel.org/linux-riscv/20251108194207.1257866-1-geomatsi@gmail…
- v3: https://lore.kernel.org/linux-riscv/20251025210655.43099-1-geomatsi@gmail.c…
- v2: https://lore.kernel.org/linux-riscv/20250821173957.563472-1-geomatsi@gmail.…
- v1: https://lore.kernel.org/linux-riscv/20251007115840.2320557-1-geomatsi@gmail…
Changes in v5:
- add support and minimal set of tests for XTheadVector
Changes in v4:
The form 'vsetvli x0, x0, ...' can only be used if VLMAX remains
unchanged, see spec 6.2. This condition was not met by the initial
values in the selftests w.r.t. the initial zeroed context. QEMU accepted
such values, but actual hardware (c908, BananaPi CanMV Zero board) did
not, setting vill. So fix the selftests after testing on hardware:
- replace 'vsetvli x0, x0, ...' by 'vsetvli rd, x0, ...'
- fixed instruction returns VLMAX, so use it in checks as well
- replace fixed vlenb == 16 in the syscall test
Changes in v3:
Address the review comments by Andy Chiu and rework the approach:
- drop forced vector context save entirely
- perform strict validation of vector csr regs in ptrace
Changes in v2:
- add thread_info flag to allow to force vector context save
- force vector context save after vector ptrace to ensure valid vector
context in the next ptrace operations
- force vector context save on the first context switch after vector
context init to get proper vlenb
---
Ilya Mamay (1):
riscv: ptrace: return ENODATA for inactive vector extension
Sergey Matyukevich (8):
riscv: vector: init vector context with proper vlenb
riscv: csr: define vtype register elements
riscv: ptrace: validate input vector csr registers
selftests: riscv: test ptrace vector interface
selftests: riscv: verify initial vector state with ptrace
selftests: riscv: verify syscalls discard vector context
selftests: riscv: verify ptrace rejects invalid vector csr inputs
selftests: riscv: verify ptrace accepts valid vector csr values
arch/riscv/include/asm/csr.h | 17 +
arch/riscv/kernel/ptrace.c | 98 +-
arch/riscv/kernel/vector.c | 12 +-
.../testing/selftests/riscv/vector/.gitignore | 2 +
tools/testing/selftests/riscv/vector/Makefile | 10 +-
.../selftests/riscv/vector/v_helpers.c | 23 +
.../selftests/riscv/vector/v_helpers.h | 2 +
.../riscv/vector/validate_v_ptrace.c | 919 ++++++++++++++++++
8 files changed, 1075 insertions(+), 8 deletions(-)
create mode 100644 tools/testing/selftests/riscv/vector/validate_v_ptrace.c
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
--
2.52.0
Note: it's net/ only bits and doesn't include changes, which shoulf be
merged separately and are posted separately. The full branch for
convenience is at [1], and the patch is here:
https://lore.kernel.org/io-uring/7486ab32e99be1f614b3ef8d0e9bc77015b173f7.1…
Many modern NICs support configurable receive buffer lengths, and zcrx and
memory providers can use buffers larger than 4K/PAGE_SIZE on x86 to improve
performance. When paired with hw-gro larger rx buffer sizes can drastically
reduce the number of buffers traversing the stack and save a lot of processing
time. It also allows to give to users larger contiguous chunks of data. The
idea was first floated around by Saeed during netdev conf 2024 and was
asked about by a few folks.
Single stream benchmarks showed up to ~30% CPU util improvement.
E.g. comparison for 4K vs 32K buffers using a 200Gbit NIC:
packets=23987040 (MB=2745098), rps=199559 (MB/s=22837)
CPU %usr %nice %sys %iowait %irq %soft %idle
0 1.53 0.00 27.78 2.72 1.31 66.45 0.22
packets=24078368 (MB=2755550), rps=200319 (MB/s=22924)
CPU %usr %nice %sys %iowait %irq %soft %idle
0 0.69 0.00 8.26 31.65 1.83 57.00 0.57
This series adds net infrastructure for memory providers configuring
the size and implements it for bnxt. It's an opt-in feature for drivers,
they should advertise support for the parameter in the qops and must check
if the hardware supports the given size. It's limited to memory providers
as it drastically simplifies implementation. It doesn't affect the fast
path zcrx uAPI, and the sizes is defined in zcrx terms, which allows it
to be flexible and adjusted in the future, see Patch 8 for details.
A liburing example can be found at [2]
full branch:
[1] https://github.com/isilence/linux.git zcrx/large-buffers-v7
Liburing example:
[2] https://github.com/isilence/liburing.git zcrx/rx-buf-len
v7: - Add xa_destroy
- Rebase
v6: - Update docs and add a selftest
v5: https://lore.kernel.org/netdev/cover.1760440268.git.asml.silence@gmail.com/
- Remove all unnecessary bits like configuration via netlink, and
multi-stage queue configuration.
v4: https://lore.kernel.org/all/cover.1760364551.git.asml.silence@gmail.com/
- Update fbnic qops
- Propagate max buf len for hns3
- Use configured buf size in __bnxt_alloc_rx_netmem
- Minor stylistic changes
v3: https://lore.kernel.org/all/cover.1755499375.git.asml.silence@gmail.com/
- Rebased, excluded zcrx specific patches
- Set agg_size_fac to 1 on warning
v2: https://lore.kernel.org/all/cover.1754657711.git.asml.silence@gmail.com/
- Add MAX_PAGE_ORDER check on pp init
- Applied comments rewording
- Adjust pp.max_len based on order
- Patch up mlx5 queue callbacks after rebase
- Minor ->queue_mgmt_ops refactoring
- Rebased to account for both fill level and agg_size_fac
- Pass providers buf length in struct pp_memory_provider_params and
apply it in __netdev_queue_confi().
- Use ->supported_ring_params to validate drivers support of set
qcfg parameters.
Jakub Kicinski (1):
eth: bnxt: adjust the fill level of agg queues with larger buffers
Pavel Begunkov (8):
net: page pool: xa init with destroy on pp init
net: page_pool: sanitise allocation order
net: memzero mp params when closing a queue
net: let pp memory provider to specify rx buf len
eth: bnxt: store rx buffer size per queue
eth: bnxt: allow providers to set rx buf size
io_uring/zcrx: document area chunking parameter
selftests: iou-zcrx: test large chunk sizes
Documentation/networking/iou-zcrx.rst | 20 +++
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 118 ++++++++++++++----
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +
drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 6 +-
drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +-
include/net/netdev_queues.h | 9 ++
include/net/page_pool/types.h | 1 +
net/core/netdev_rx_queue.c | 14 ++-
net/core/page_pool.c | 4 +
.../selftests/drivers/net/hw/iou-zcrx.c | 72 +++++++++--
.../selftests/drivers/net/hw/iou-zcrx.py | 37 ++++++
11 files changed, 236 insertions(+), 49 deletions(-)
--
2.52.0
Problem
=======
When host APEI is unable to claim a synchronous external abort (SEA)
during guest abort, today KVM directly injects an asynchronous SError
into the VCPU then resumes it. The injected SError usually results in
unpleasant guest kernel panic.
One of the major situation of guest SEA is when VCPU consumes recoverable
uncorrected memory error (UER), which is not uncommon at all in modern
datacenter servers with large amounts of physical memory. Although SError
and guest panic is sufficient to stop the propagation of corrupted memory,
there is room to recover from an UER in a more graceful manner.
Proposed Solution
=================
The idea is, we can replay the SEA to the faulting VCPU. If the memory
error consumption or the fault that cause SEA is not from guest kernel,
the blast radius can be limited to the poison-consuming guest process,
while the VM can keep running.
In addition, instead of doing under the hood without involving userspace,
there are benefits to redirect the SEA to VMM:
- VM customers care about the disruptions caused by memory errors, and
VMM usually has the responsibility to start the process of notifying
the customers of memory error events in their VMs. For example some
cloud provider emits a critical log in their observability UI [1], and
provides a playbook for customers on how to mitigate disruptions to
their workloads.
- VMM can protect future memory error consumption by unmapping the poisoned
pages from stage-2 page table with KVM userfault [2], or by splitting the
memslot that contains the poisoned pages.
- VMM can keep track of SEA events in the VM. When VMM thinks the status
on the host or the VM is bad enough, e.g. number of distinct SEAs
exceeds a threshold, it can restart the VM on another healthy host.
- Behavior parity with x86 architecture. When machine check exception
(MCE) is caused by VCPU, kernel or KVM signals userspace SIGBUS to
let VMM either recover from the MCE, or terminate itself with VM.
The prior RFC proposes to implement SIGBUS on arm64 as well, but
Marc preferred KVM exit over signal [3]. However, implementation
aside, returning SEA to VMM is on par with returning MCE to VMM.
Once SEA is redirected to VMM, among other actions, VMM is encouraged
to inject external aborts into the faulting VCPU.
New UAPIs
=========
This patchset introduces following userspace-visible changes to empower
VMM to control what happens for SEA on guest memory:
- KVM_CAP_ARM_SEA_TO_USER. While taking SEA, if userspace has enabled
this new capability at VM creation, and the SEA is not owned by kernel
allocated memory, instead of injecting SError, return KVM_EXIT_ARM_SEA
to userspace.
- KVM_EXIT_ARM_SEA. This is the VM exit reason VMM gets. The details
about the SEA is provided in arm_sea as much as possible, including
sanitized ESR value at EL2, faulting guest virtual and physical
addresses if available.
* From v3 [4]
- Rebased on commit 3a8660878839 ("Linux 6.18-rc1").
- In selftest, print a message if GVA or GPA expects to be valid.
* From v2 [5]:
- Rebased on "[PATCH] KVM: arm64: nv: Handle SEAs due to VNCR redirection" [6]
and kvmarm/next commit 7b8346bd9fce6 ("KVM: arm64: Don't attempt vLPI
mappings when vPE allocation is disabled")
- Took the host_owns_sea implementation from Oliver [7, 8].
- Excluded the guest SEA injection patches.
- Updated selftest.
* From v1 [9]:
- Rebased on commit 4d62121ce9b5 ("KVM: arm64: vgic-debug: Avoid
dereferencing NULL ITE pointer").
- Sanitize ESR_EL2 before reporting it to userspace.
- Do not do KVM_EXIT_ARM_SEA when SEA is caused by memory allocated to
stage-2 translation table.
[1] https://cloud.google.com/solutions/sap/docs/manage-host-errors
[2] https://lore.kernel.org/kvm/20250109204929.1106563-1-jthoughton@google.com
[3] https://lore.kernel.org/kvm/86pljbqqh0.wl-maz@kernel.org
[4] https://lore.kernel.org/kvmarm/20250731205844.1346839-1-jiaqiyan@google.com
[5] https://lore.kernel.org/kvm/20250604050902.3944054-1-jiaqiyan@google.com
[6] https://lore.kernel.org/kvmarm/20250729182342.3281742-1-oliver.upton@linux.…
[7] https://lore.kernel.org/kvm/aHFohmTb9qR_JG1E@linux.dev
[8] https://lore.kernel.org/kvm/aHK-DPufhLy5Dtuk@linux.dev
[9] https://lore.kernel.org/kvm/20250505161412.1926643-1-jiaqiyan@google.com
Jiaqi Yan (3):
KVM: arm64: VM exit to userspace to handle SEA
KVM: selftests: Test for KVM_EXIT_ARM_SEA
Documentation: kvm: new UAPI for handling SEA
Documentation/virt/kvm/api.rst | 61 ++++
arch/arm64/include/asm/kvm_host.h | 2 +
arch/arm64/kvm/arm.c | 5 +
arch/arm64/kvm/mmu.c | 68 +++-
include/uapi/linux/kvm.h | 10 +
tools/arch/arm64/include/asm/esr.h | 2 +
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/arm64/sea_to_user.c | 331 ++++++++++++++++++
tools/testing/selftests/kvm/lib/kvm_util.c | 1 +
9 files changed, 480 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/kvm/arm64/sea_to_user.c
--
2.51.0.760.g7b8bcc2412-goog
The resctrl selftest currently exhibits several failures on Hygon CPUs
due to missing vendor detection and edge-case handling specific to
Hygon's architecture.
This patch series addresses three distinct issues:
1. Missing CPU vendor detection, causing the test to fail with
"# Can not get vendor info..." on Hygon CPUs.
2. A division-by-zero crash in SNC detection on Hygon CPUs.
3. Incorrect handling of non-contiguous CBM support on Hygon CPUs.
These changes enable resctrl selftest to run successfully on
Hygon CPUs that support Platform QoS features.
Changelog:
v3:
- Patch 1:
1. Update the return types of detect_vendor() and get_vendor() from
'int' to 'unsigned int' to align with their usage as bitmask values
and to prevent potentially risky type conversions (Fenghua).
2. Split the code changes of "define CPU vendor IDs as bits to match
usage" from original patch 1 into a separate patch (this patch,
suggested by Fenghua and Reinette).
3. Introduce the flag 'initialized' to simplify the get_vendor() ->
detect_vendor() logic (Reinette).
- Patch 2 (original patch 1):
1. Move the code changes of "define CPU vendor IDs as bits to match
usage" into patch 1.
- Patch 3 (original patch 2):
1. Fix a nit of code comment for affected platforms (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
- Patch 4 (original patch 3):
1. Fix a nit to avoid calling get_vendor() twice (Fenghua).
2. Add Reviewed-by: Fenghua Yu <fenghuay(a)nvidia.com>.
v2:
- Patch 1: switch all of the vendor id bitmasks to use BIT() (Reinette)
- Patch 2: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
add a maintainer note to highlight it is not a candidate for
backport (Reinette)
Xiaochen Shen (4):
selftests/resctrl: Define CPU vendor IDs as bits to match usage
selftests/resctrl: Add CPU vendor detection for Hygon
selftests/resctrl: Fix a division by zero error on Hygon
selftests/resctrl: Fix non-contiguous CBM check for Hygon
tools/testing/selftests/resctrl/cat_test.c | 6 ++--
tools/testing/selftests/resctrl/resctrl.h | 8 ++++--
.../testing/selftests/resctrl/resctrl_tests.c | 28 +++++++++++++------
tools/testing/selftests/resctrl/resctrlfs.c | 10 +++++++
4 files changed, 39 insertions(+), 13 deletions(-)
--
2.47.3
v26: CONFIG_RISCV_USER_CFI depends on CONFIG_MMU (dependency of shadow stack
on MMU). Used b4 to pick tags, apparantly it messed up some tag picks. Fixing it
v25: Removal of `riscv_nousercfi` from `cpufeature.c` and instead placing
it as extern in `usercfi.h` was leading to build error whene cfi config
is not selected. Placed `riscv_nousercfi` outside cfi config ifdef block
in `usercfi.h`
v24:
Took PaulW suggestion for fixing commit desc and checkpatch fixes
in patch #21.
"riscv: kernel command line option to opt out of user cfi"
Collected "Tested-by" tags from Andreas and Valentin. Thanks a lot
for testing it.
v23:
fixed some of the "CHECK:" reported on checkpatch --strict.
Accepted Joel's suggestion for kselftest's Makefile.
CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v26:
- fixed S-O-B tag messed up by b4.
- CONFIG_RISCV_USER_CFI should depend on CONFIG_MMU
v25:
- fixed build error when cfi config is not selected due to missing symbol
error on `riscv_nousercfi`.
v24:
- Checkpatch fixes in patch # 21.
- Collected "Tested-by" tags.
v23:
- fixed some of the "CHECK:" reported on checkpatch --strict.
- Accepted Joel's suggestion for kselftest's Makefile.
- CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v26:
- Link to v25: https://lore.kernel.org/r/20251205-v5_user_cfi_series-v25-0-1a07c0127361@ri…
Changes in v25:
- Link to v24: https://lore.kernel.org/r/20251204-v5_user_cfi_series-v24-0-ada7a3ba14dc@ri…
Changes in v24:
- Link to v23: https://lore.kernel.org/r/20251112-v5_user_cfi_series-v23-0-b55691eacf4f@ri…
Changes in v23:
- Link to v22: https://lore.kernel.org/r/20251023-v5_user_cfi_series-v22-0-1935270f7636@ri…
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 23 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 97 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 25 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 2 +
tools/testing/selftests/riscv/cfi/Makefile | 23 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2482 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
This is a follow-up series of [1]. It tries to fix a possible UAF in the
fops of cros_ec_chardev after the underlying protocol device has gone by
using revocable.
The 1st patch introduces the revocable which is an implementation of ideas
from the talk [2].
The 2nd and 3rd patches add test cases for revocable in Kunit and selftest.
The 4th patch converts existing protocol devices to resource providers
of cros_ec_device.
The 5th - 7th are PoC patches for showing the use case of "Replace file
operations" below.
---
I came out with 2 possible usages of revocable.
1. Use primitive APIs
Use the primitive APIs of revocable directly.
The file operations make sure the resources are available when using them.
This is what the series original proposed[3][4]. Even though it has the
finest grain for accessing the resources, it makes the user code verbose.
Per feedback from the community, I'm looking for some subsystem level
helpers so that user code can be simlper.
2. Replace file operations
Replace filp->f_op to revocable-aware warppers.
The warppers make sure the resources are available in the file operations.
The user code needs to provide a callback .try_access() to tell the wrappers
where/how to *save* the pointers of resources.
Known drawback:
- The warppers reserve the resources for all file operations even if they
might be unused.
- The user code still needs to be revocable-aware.
- The whole file operation becomes a SRCU read-side critical section. Are
there any functions can't be called in the critical section? If there is,
the file operations may not be awared of that.
See 5th - 7th patches for an example usage.
[1] https://lore.kernel.org/chrome-platform/20250721044456.2736300-6-tzungbi@ke…
[2] https://lpc.events/event/17/contributions/1627/
[3] https://lore.kernel.org/chrome-platform/20250912081718.3827390-5-tzungbi@ke…
[4] https://lore.kernel.org/chrome-platform/20250912081718.3827390-6-tzungbi@ke…
v5:
- Rebase onto next-20251015.
- Add more context about the PoC.
- Support multiple revocable providers in the PoC.
v4: https://lore.kernel.org/chrome-platform/20250923075302.591026-1-tzungbi@ker…
- Rebase onto next-20250922.
- Remove the 5th patch from v3.
- Add fops replacement PoC in 5th - 7th patches.
v3: https://lore.kernel.org/chrome-platform/20250912081718.3827390-1-tzungbi@ke…
- Rebase onto https://lore.kernel.org/chrome-platform/20250828083601.856083-1-tzungbi@ker…
and next-20250912.
- The 4th patch changed accordingly.
v2: https://lore.kernel.org/chrome-platform/20250820081645.847919-1-tzungbi@ker…
- Rename "ref_proxy" -> "revocable".
- Add test cases in Kunit and selftest.
v1: https://lore.kernel.org/chrome-platform/20250814091020.1302888-1-tzungbi@ke…
Tzung-Bi Shih (7):
revocable: Revocable resource management
revocable: Add Kunit test cases
selftests: revocable: Add kselftest cases
platform/chrome: Protect cros_ec_device lifecycle with revocable
revocable: Add fops replacement
char: misc: Leverage revocable fops replacement
platform/chrome: cros_ec_chardev: Secure cros_ec_device via revocable
.../driver-api/driver-model/index.rst | 1 +
.../driver-api/driver-model/revocable.rst | 87 +++++++
MAINTAINERS | 9 +
drivers/base/Kconfig | 8 +
drivers/base/Makefile | 5 +-
drivers/base/revocable.c | 233 ++++++++++++++++++
drivers/base/revocable_test.c | 110 +++++++++
drivers/char/misc.c | 8 +
drivers/platform/chrome/cros_ec.c | 5 +
drivers/platform/chrome/cros_ec_chardev.c | 22 +-
fs/Makefile | 2 +-
fs/fs_revocable.c | 154 ++++++++++++
include/linux/fs.h | 2 +
include/linux/fs_revocable.h | 21 ++
include/linux/miscdevice.h | 4 +
include/linux/platform_data/cros_ec_proto.h | 4 +
include/linux/revocable.h | 53 ++++
tools/testing/selftests/Makefile | 1 +
.../selftests/drivers/base/revocable/Makefile | 7 +
.../drivers/base/revocable/revocable_test.c | 116 +++++++++
.../drivers/base/revocable/test-revocable.sh | 39 +++
.../base/revocable/test_modules/Makefile | 10 +
.../revocable/test_modules/revocable_test.c | 188 ++++++++++++++
23 files changed, 1086 insertions(+), 3 deletions(-)
create mode 100644 Documentation/driver-api/driver-model/revocable.rst
create mode 100644 drivers/base/revocable.c
create mode 100644 drivers/base/revocable_test.c
create mode 100644 fs/fs_revocable.c
create mode 100644 include/linux/fs_revocable.h
create mode 100644 include/linux/revocable.h
create mode 100644 tools/testing/selftests/drivers/base/revocable/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/revocable_test.c
create mode 100755 tools/testing/selftests/drivers/base/revocable/test-revocable.sh
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/Makefile
create mode 100644 tools/testing/selftests/drivers/base/revocable/test_modules/revocable_test.c
--
2.51.0.788.g6d19910ace-goog
The get_hw_info uses a smaller user buffer on purpose to check how
the kernel updates only the fields that fit in the buffer. The test
created a custom smaller struct for this, but the helper function later
treats the buffer as struct iommu_test_hw_info. This makes the compiler
warn about a possible out-of-bounds access (-Warray-bounds).
This keeps the test behavior the same and removes the warning.
Signed-off-by: Kathara Sasikumar <katharasasikumar007(a)gmail.com>
---
tools/testing/selftests/iommu/iommufd.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 10e051b6f592..f6aceb65313f 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -755,9 +755,7 @@ TEST_F(iommufd_ioas, get_hw_info)
struct iommu_test_hw_info info;
uint64_t trailing_bytes;
} buffer_larger;
- struct iommu_test_hw_info_buffer_smaller {
- __u32 flags;
- } buffer_smaller;
+ struct iommu_test_hw_info buffer_smaller;
if (self->device_id) {
uint8_t max_pasid = 0;
--
2.51.0
GCC gets a bit confused and reports:
In function '_test_cmd_get_hw_info',
inlined from 'iommufd_ioas_get_hw_info' at iommufd.c:779:3,
inlined from 'wrapper_iommufd_ioas_get_hw_info' at iommufd.c:752:1:
>> iommufd_utils.h:804:37: warning: array subscript 'struct iommu_test_hw_info[0]' is partly outside array bounds of 'struct iommu_test_hw_info_buffer_smaller[1]' [-Warray-bounds=]
804 | assert(!info->flags);
| ~~~~^~~~~~~
iommufd.c: In function 'wrapper_iommufd_ioas_get_hw_info':
iommufd.c:761:11: note: object 'buffer_smaller' of size 4
761 | } buffer_smaller;
| ^~~~~~~~~~~~~~
While it is true that "struct iommu_test_hw_info[0]" is partly out of
bounds of the input pointer, it is not true that info->flags is out of
bounds. Unclear why it warns on this.
Reuse an existing properly sized stack buffer and pass a truncated length
instead to test the same thing.
Fixes: af4fde93c319 ("iommufd/selftest: Add coverage for IOMMU_GET_HW_INFO ioctl")
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202512032344.kaAcKFIM-lkp@intel.com/
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
---
tools/testing/selftests/iommu/iommufd.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index 10e051b6f592df..dadad277f4eb2e 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -755,9 +755,6 @@ TEST_F(iommufd_ioas, get_hw_info)
struct iommu_test_hw_info info;
uint64_t trailing_bytes;
} buffer_larger;
- struct iommu_test_hw_info_buffer_smaller {
- __u32 flags;
- } buffer_smaller;
if (self->device_id) {
uint8_t max_pasid = 0;
@@ -789,8 +786,9 @@ TEST_F(iommufd_ioas, get_hw_info)
* the fields within the size range still gets updated.
*/
test_cmd_get_hw_info(self->device_id,
- IOMMU_HW_INFO_TYPE_DEFAULT,
- &buffer_smaller, sizeof(buffer_smaller));
+ IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_exact,
+ offsetofend(struct iommu_test_hw_info,
+ flags));
test_cmd_get_hw_info_pasid(self->device_id, &max_pasid);
ASSERT_EQ(0, max_pasid);
if (variant->pasid_capable) {
base-commit: 93013488dd77dd2ea8bd23355a5587d9e6dac185
--
2.43.0
On Thu, 11 Dec 2025 18:04:48 +0000,
Jiaqi Yan <jiaqiyan(a)google.com> wrote:
>
> [1 <text/plain; UTF-8 (quoted-printable)>]
> Hi Sebastian,
>
> CONFIGs seem alright to me. Do you boot kernel with cmdline options like
> "default_hugepagesz=1G hugepagesz=1G hugepages=64", or dynamically set up
> huge pages via "echo 64 >
> /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"?
I don't think this is irrelevant. The whole thing seems to have some
logic flaws, see the extensive report from Zenghui[1] as a reply to
your series.
M.
[1] https://lore.kernel.org/r/3061f5f8-cef0-b7b1-c4de-f2ceea29af9a@huawei.com
--
Without deviation from the norm, progress is not possible.
Hi,
sea_to_user fails for me with:
Random seed: 0x6b8b4567
==== Test Assertion Failure ====
include/kvm_syscalls.h:58: mem != MAP_FAILED
pid=4923 tid=4923 errno=12 - Cannot allocate memory
1 0x0000000000405afb: __kvm_mmap at kvm_syscalls.h:58 (discriminator 3)
2 (inlined by) kvm_mmap at kvm_syscalls.h:65 (discriminator 3)
3 (inlined by) vm_mem_add at kvm_util.c:1036 (discriminator 3)
4 0x0000000000402377: vm_create_with_sea_handler at sea_to_user.c:278
5 (inlined by) main at sea_to_user.c:324
6 0x0000ffff8d0a621b: ?? ??:0
7 0x0000ffff8d0a62fb: ?? ??:0
8 0x000000000040282f: _start at ??:?
mmap() failed, rc: -1 errno: 12 (Cannot allocate memory)
this could be related to my config?
# CONFIG_CGROUP_HUGETLB is not set
CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y
CONFIG_HAVE_ARCH_HUGE_VMAP=y
CONFIG_HAVE_ARCH_HUGE_VMALLOC=y
CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y
CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y
# CONFIG_PERSISTENT_HUGE_ZERO_FOLIO is not set
CONFIG_TRANSPARENT_HUGEPAGE=y
# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set
CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y
# CONFIG_TRANSPARENT_HUGEPAGE_NEVER is not set
CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER=y
# CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS is not set
# CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE is not set
# CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE is not set
CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER=y
# CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS is not set
# CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE is not set
# CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE is not set
CONFIG_PGTABLE_HAS_HUGE_LEAVES=y
CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP=y
CONFIG_ARCH_SUPPORTS_HUGETLBFS=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING=y
Looking at the code smth like this would skip the test:
get_backing_src_pagesz(VM_MEM_SRC_ANONYMOUS_HUGETLB);
..or is this not worth the effort and my config is too weird?
Thanks,
Sebastian
v25: Removal of `riscv_nousercfi` from `cpufeature.c` and instead placing
it as extern in `usercfi.h` was leading to build error whene cfi config
is not selected. Placed `riscv_nousercfi` outside cfi config ifdef block
in `usercfi.h`
v24:
Took PaulW suggestion for fixing commit desc and checkpatch fixes
in patch #21.
"riscv: kernel command line option to opt out of user cfi"
Collected "Tested-by" tags from Andreas and Valentin. Thanks a lot
for testing it.
v23:
fixed some of the "CHECK:" reported on checkpatch --strict.
Accepted Joel's suggestion for kselftest's Makefile.
CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v25:
- fixed build error when cfi config is not selected due to missing symbol
error on `riscv_nousercfi`.
v24:
- Checkpatch fixes in patch # 21.
- Collected "Tested-by" tags.
v23:
- fixed some of the "CHECK:" reported on checkpatch --strict.
- Accepted Joel's suggestion for kselftest's Makefile.
- CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v25:
- Link to v24: https://lore.kernel.org/r/20251204-v5_user_cfi_series-v24-0-ada7a3ba14dc@ri…
Changes in v24:
- Link to v23: https://lore.kernel.org/r/20251112-v5_user_cfi_series-v23-0-b55691eacf4f@ri…
Changes in v23:
- Link to v22: https://lore.kernel.org/r/20251023-v5_user_cfi_series-v22-0-1935270f7636@ri…
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 97 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 25 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 2 +
tools/testing/selftests/riscv/cfi/Makefile | 23 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2481 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Plesae find the v7 AccECN case handling patch series, which covers
several excpetional case handling of Accurate ECN spec (RFC9768),
adds new identifiers to be used by CC modules, adds ecn_delta into
rate_sample, and keeps the ACE counter for computation, etc.
This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
Best regards,
Chia-Yu
---
v7:
- Update comments in #3 (Paolo Abeni <pabeni(a)redhat.com>)
- Update comments and use synack_type TCP_SYNACK_RETRANS and num_timeout in #9. (Paolo Abeni <pabeni(a)redhat.com>)
v6:
- Update comment in #3 to highlight RX path is only used for virtio-net (Paolo Abeni <pabeni(a)redhat.com>)
- Rename TCP_CONG_WANTS_ECT_1 to TCP_CONG_ECT_1_NEGOTIATION to distiguish from TCP_CONG_ECT_1_ESTABLISH (Paolo Abeni <pabeni(a)redhat.com>)
- Move TCP_CONG_ECT_1_ESTABLISH in #6 to latter patch series (Paolo Abeni <pabeni(a)redhat.com>)
- Add new synack_type instead of moving the increment of num_retran in #9 (Paolo Abeni <pabeni(a)redhat.com>)
- Use new synack_type TCP_SYNACK_RETRANS and num_retrans for SYN/ACK retx fallbackk for AccECN in #10 (Paolo Abeni <pabeni(a)redhat.com>)
- Do not cast const struct into non-const in #11, and set AccECN fail mode after tcp_rtx_synack() (Paolo Abeni <pabeni(a)redhat.com>)
v5:
- Move previous #11 in v4 in latter patch after discussion with RFC author.
- Add #3 to update the comments for SKB_GSO_TCP_ECN and SKB_GSO_TCP_ACCECN. (Parav Pandit <parav(a)nvidia.com>)
- Add gro self-test for TCP CWR flag in #4. (Eric Dumazet <edumazet(a)google.com>)
- Add fixes: tag into #7 (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message of #8 and if condition check (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #13 (Paolo Abeni <pabeni(a)redhat.com>)
v4:
- Add previous #13 in v2 back after dicussion with the RFC author.
- Add TCP_ACCECN_OPTION_PERSIST to tcp_ecn_option sysctl to ignore AccECN fallback policy on sending AccECN option.
v3:
- Add additional min() check if pkts_acked_ewma is not initialized in #1. (Paolo Abeni <pabeni(a)redhat.com>)
- Change TCP_CONG_WANTS_ECT_1 into individual flag add helper function INET_ECN_xmit_wants_ect_1() in #3. (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #4. (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message to fix old AccECN commits in #5. (Paolo Abeni <pabeni(a)redhat.com>)
- Remove unnecessary brackets in #10. (Paolo Abeni <pabeni(a)redhat.com>)
- Move patch #3 in v2 to a later Prague patch serise and remove patch #13 in v2. (Paolo Abeni <pabeni(a)redhat.com>)
---
Chia-Yu Chang (11):
selftests/net: gro: add self-test for TCP CWR flag
tcp: ECT_1_NEGOTIATION and NEEDS_ACCECN identifiers
tcp: disable RFC3168 fallback identifier for CC modules
tcp: accecn: handle unexpected AccECN negotiation feedback
tcp: accecn: retransmit downgraded SYN in AccECN negotiation
tcp: add TCP_SYNACK_RETRANS synack_type
tcp: accecn: retransmit SYN/ACK without AccECN option or non-AccECN
SYN/ACK
tcp: accecn: unset ECT if receive or send ACE=0 in AccECN negotiaion
tcp: accecn: fallback outgoing half link to non-AccECN
tcp: accecn: detect loss ACK w/ AccECN option and add
TCP_ACCECN_OPTION_PERSIST
tcp: accecn: enable AccECN
Ilpo Järvinen (2):
tcp: try to avoid safer when ACKs are thinned
gro: flushing when CWR is set negatively affects AccECN
Documentation/networking/ip-sysctl.rst | 4 +-
.../networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/tcp.h | 4 +-
include/net/inet_ecn.h | 20 +++-
include/net/tcp.h | 32 ++++++-
include/net/tcp_ecn.h | 92 ++++++++++++++-----
net/ipv4/inet_connection_sock.c | 4 +
net/ipv4/sysctl_net_ipv4.c | 4 +-
net/ipv4/tcp.c | 2 +
net/ipv4/tcp_cong.c | 5 +-
net/ipv4/tcp_input.c | 37 +++++++-
net/ipv4/tcp_minisocks.c | 46 +++++++---
net/ipv4/tcp_offload.c | 3 +-
net/ipv4/tcp_output.c | 32 ++++---
net/ipv4/tcp_timer.c | 3 +
tools/testing/selftests/drivers/net/gro.c | 81 +++++++++++-----
16 files changed, 284 insertions(+), 86 deletions(-)
--
2.34.1
Dear friends,
this patch series adds support for nested seccomp listeners. It allows container
runtimes and other sandboxing software to install seccomp listeners on top of
existing ones, which is useful for nested LXC containers and other similar use-cases.
Expecting potential discussions around this patch series, I'm going to present a talk
at LPC 2025 about the design and implementation details of this feature [1].
Git tree (based on for-next/seccomp):
v3: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v3
current: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners
Changelog for version 3:
- almost completely rewritten (no static array on the stack, no nesting limit)
- more testcases
Changelog for version 2:
- add some explanatory comments
- add RWB tags from Tycho Andersen (thanks, Tycho! ;) )
- CC-ed Aleksa as he might be interested in this stuff too
Links to previous versions:
v2: https://lore.kernel.org/all/20251202115200.110646-1-aleksandr.mikhalitsyn@c…
tree: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v2
v1: https://lore.kernel.org/all/20251201122406.105045-1-aleksandr.mikhalitsyn@c…
tree: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v1
Link: https://lpc.events/event/19/contributions/2241/ [1]
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: bpf(a)vger.kernel.org
Cc: Kees Cook <kees(a)kernel.org>
Cc: Andy Lutomirski <luto(a)amacapital.net>
Cc: Will Drewry <wad(a)chromium.org>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Aleksa Sarai <cyphar(a)cyphar.com>
Cc: Tycho Andersen <tycho(a)tycho.pizza>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Stéphane Graber <stgraber(a)stgraber.org>
Alexander Mikhalitsyn (7):
seccomp: remove unused argument from seccomp_do_user_notification
seccomp: use bitfields for boolean flags on seccomp_filter struct
seccomp: keep track of seccomp filters with closed listeners
seccomp: mark first listener in the tree
seccomp: handle multiple listeners case
seccomp: allow nested listeners
tools/testing/selftests/seccomp: test nested listeners
.../userspace-api/seccomp_filter.rst | 6 +
include/linux/seccomp.h | 3 +-
include/uapi/linux/seccomp.h | 13 +-
kernel/seccomp.c | 129 +++++++-
tools/include/uapi/linux/seccomp.h | 13 +-
tools/testing/selftests/seccomp/seccomp_bpf.c | 303 ++++++++++++++++++
6 files changed, 438 insertions(+), 29 deletions(-)
--
2.43.0
Hi,
This series fixes a verifier issue with bpf_d_path() and adds a
regression test to cover its use within a hook function.
Patch 1 updates the bpf_d_path() helper prototype so that the second
argument is marked as MEM_WRITE. This makes it explicit to the verifier
that the helper writes into the provided buffer.
Patch 2 extends the existing d_path selftest to cover incorrect verifier
assumptions caused by an incorrect function prototype. The test program calls
bpf_d_path() and checks if the first character of the path can be read.
It ensures the verifier does not assume the buffer remains unwritten.
Changelog
=========
v5:
- Moved the temporary file for the fallocate test from /tmp to /dev/shm
Since bpf CI's 9P filesystem under /tmp does not support fallocate.
v4:
- Use the fallocate hook instead of an LSM hook to simplify the selftest,
as suggested by Matt and Alexei.
- Add a utility function in test_d_path.c to load the BPF program,
improving code reuse.
v3:
- Switch the pathname prefix loop to use bpf_for() instead of
#pragma unroll, as suggested by Matt.
- Remove /tmp/bpf_d_path_test in the test cleanup path.
- Add the missing Reviewed-by tags.
v2:
- Merge the new test into the existing d_path selftest rather than
creating new files.
- Add PID filtering in the LSM program to avoid nondeterministic failures
due to unrelated processes triggering bprm_check_security.
- Synchronize child execution using a pipe to ensure deterministic
updates to the PID.
Thanks for your time and reviews.
Shuran Liu (2):
bpf: mark bpf_d_path() buffer as writeable
selftests/bpf: add regression test for bpf_d_path()
kernel/trace/bpf_trace.c | 2 +-
.../testing/selftests/bpf/prog_tests/d_path.c | 91 +++++++++++++++----
.../testing/selftests/bpf/progs/test_d_path.c | 23 +++++
3 files changed, 97 insertions(+), 19 deletions(-)
--
2.52.0
As describe in the help string, the user might want to disable these
tests if they don't like to see stacktraces/BUG etc in their kernel log.
However, if they enable PANIC_ON_OOPS, these tests also crash the
machine, which it's safe to assume _almost_ nobody wants.
One might argue that _absolutely_ nobody ever wants their kernel to
crash so this should just be a hard dependency instead of a default.
However, since this is rather special code that's anyway concerned with
deliberately doing "bad" things, the normal rules don't seem to apply,
hence prefer flexibility and allow users to set up a crashing Kconfig if
they so choose.
Signed-off-by: Brendan Jackman <jackmanb(a)google.com>
---
lib/kunit/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig
index 50ecf55d2b9c8a82f2aff7a0b4156bd6179b0a2f..498cc51e493dc9a819e012b8082fb765f25512b9 100644
--- a/lib/kunit/Kconfig
+++ b/lib/kunit/Kconfig
@@ -28,7 +28,7 @@ config KUNIT_FAULT_TEST
bool "Enable KUnit tests which print BUG stacktraces"
depends on KUNIT_TEST
depends on !UML
- default y
+ default !PANIC_ON_OOPS
help
Enables fault handling tests for the KUnit framework. These tests may
trigger a kernel BUG(), and the associated stack trace, even when they
---
base-commit: 7bc16e72ddb993d706f698c2f6cee694e485f557
change-id: 20251207-kunit-fault-no-panic-e9bdce848031
Best regards,
--
Brendan Jackman <jackmanb(a)google.com>
This series improves error propagation in cpumap and adds selftests that
cover the failure cases.
Currently, failures returned from __cpu_map_entry_alloc() are ignored
and always converted to -ENOMEM by cpu_map_update_elem(). This series
ensures the correct error propagation and adds selftests.
Changes:
v2:
- send to bpf-next, not to bpf
- drop Fixes: tag
v1: https://lore.kernel.org/bpf/20251128160504.57844-1-enjuk@amazon.com/
Kohei Enju (2):
bpf: cpumap: propagate underlying error in cpu_map_update_elem()
selftests/bpf: add tests for attaching invalid fd
kernel/bpf/cpumap.c | 21 ++++++++++++-------
.../bpf/prog_tests/xdp_cpumap_attach.c | 19 +++++++++++++++--
2 files changed, 30 insertions(+), 10 deletions(-)
--
2.51.0
If there is a large number (hundreds) of dmabufs allocated, the text
output generated from dmabuf_iter_seq_show can exceed common user buffer
sizes (e.g. PAGE_SIZE) necessitating multiple start/stop cycles to
iterate through all dmabufs. However the dmabuf iterator currently
returns NULL in dmabuf_iter_seq_start for all non-zero pos values, which
results in the truncation of the output before all dmabufs are handled.
After dma_buf_iter_begin / dma_buf_iter_next, the refcount of the buffer
is elevated so that the BPF iterator program can run without holding any
locks. When a stop occurs, instead of immediately dropping the reference
on the buffer, stash a pointer to the buffer in seq->priv until
either start is called or the iterator is released. This also enables
the resumption of iteration without first walking through the list of
dmabufs based on the pos value.
Fixes: 76ea95534995 ("bpf: Add dmabuf iterator")
Signed-off-by: T.J. Mercier <tjmercier(a)google.com>
---
kernel/bpf/dmabuf_iter.c | 56 +++++++++++++++++++++++++++++++++++-----
1 file changed, 49 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
index 4dd7ef7c145c..cd500248abd9 100644
--- a/kernel/bpf/dmabuf_iter.c
+++ b/kernel/bpf/dmabuf_iter.c
@@ -6,10 +6,33 @@
#include <linux/kernel.h>
#include <linux/seq_file.h>
+struct dmabuf_iter_priv {
+ /*
+ * If this pointer is non-NULL, the buffer's refcount is elevated to
+ * prevent destruction between stop/start. If reading is not resumed and
+ * start is never called again, then dmabuf_iter_seq_fini drops the
+ * reference when the iterator is released.
+ */
+ struct dma_buf *dmabuf;
+};
+
static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
{
- if (*pos)
- return NULL;
+ struct dmabuf_iter_priv *p = seq->private;
+
+ if (*pos) {
+ struct dma_buf *dmabuf = p->dmabuf;
+
+ if (!dmabuf)
+ return NULL;
+
+ /*
+ * Always resume from where we stopped, regardless of the value
+ * of pos.
+ */
+ p->dmabuf = NULL;
+ return dmabuf;
+ }
return dma_buf_iter_begin();
}
@@ -54,8 +77,11 @@ static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
{
struct dma_buf *dmabuf = v;
- if (dmabuf)
- dma_buf_put(dmabuf);
+ if (dmabuf) {
+ struct dmabuf_iter_priv *p = seq->private;
+
+ p->dmabuf = dmabuf;
+ }
}
static const struct seq_operations dmabuf_iter_seq_ops = {
@@ -71,11 +97,27 @@ static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
seq_puts(seq, "dmabuf iter\n");
}
+static int dmabuf_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv;
+
+ p->dmabuf = NULL;
+ return 0;
+}
+
+static void dmabuf_iter_seq_fini(void *priv)
+{
+ struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv;
+
+ if (p->dmabuf)
+ dma_buf_put(p->dmabuf);
+}
+
static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
.seq_ops = &dmabuf_iter_seq_ops,
- .init_seq_private = NULL,
- .fini_seq_private = NULL,
- .seq_priv_size = 0,
+ .init_seq_private = dmabuf_iter_seq_init,
+ .fini_seq_private = dmabuf_iter_seq_fini,
+ .seq_priv_size = sizeof(struct dmabuf_iter_priv),
};
static struct bpf_iter_reg bpf_dmabuf_reg_info = {
base-commit: 30f09200cc4aefbd8385b01e41bde2e4565a6f0e
--
2.52.0.177.g9f829587af-goog
The resctrl selftest currently exhibits several failures on Hygon CPUs
due to missing vendor detection and edge-case handling specific to
Hygon's architecture.
This patch series addresses three distinct issues:
1. Missing CPU vendor detection, causing the test to fail with
"# Can not get vendor info..." on Hygon CPUs.
2. A division-by-zero crash in SNC detection on Hygon CPUs.
3. Incorrect handling of non-contiguous CBM support on Hygon CPUs.
These changes enable resctrl selftest to run successfully on
Hygon CPUs that support Platform QoS features.
Changelog:
v2:
- Patch 1: switch all of the vendor id bitmasks to use BIT() (Reinette)
- Patch 2: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
- Patch 3: add Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
add a maintainer note to highlight it is not a candidate for
backport (Reinette)
Xiaochen Shen (3):
selftests/resctrl: Add CPU vendor detection for Hygon
selftests/resctrl: Fix a division by zero error on Hygon
selftests/resctrl: Fix non-contiguous CBM check for Hygon
tools/testing/selftests/resctrl/cat_test.c | 4 ++--
tools/testing/selftests/resctrl/resctrl.h | 6 ++++--
tools/testing/selftests/resctrl/resctrl_tests.c | 2 ++
tools/testing/selftests/resctrl/resctrlfs.c | 10 ++++++++++
4 files changed, 18 insertions(+), 4 deletions(-)
--
2.47.3
This patch series introduces a new configfs attribute that enables sending
messages directly through netconsole without going through the kernel's logging
infrastructure.
This feature allows users to send custom messages, alerts, or status updates
directly to netconsole receivers by writing to
/sys/kernel/config/netconsole/<target>/send_msg, without poluting kernel
buffers, and sending msgs to the serial, which could be slow.
At Meta this is currently used in two cases right now (through printk by
now):
a) When a new workload enters or leave the machine.
b) From time to time, as a "ping" to make sure the netconsole/machine
is alive.
The implementation reuses the existing message transmission functions
(send_msg_udp() and send_ext_msg_udp()) to handle both basic and extended
message formats.
Regarding code organization, this version uses forward declarations for
send_msg_udp() and send_ext_msg_udp() functions rather than relocating them
within the file. While forward declarations do add a small amount of
redundancy, they avoid the larger churn that would result from moving entire
function definitions.
---
Breno Leitao (4):
netconsole: extract message fragmentation into send_msg_udp()
netconsole: Add configfs attribute for direct message sending
selftests/netconsole: Switch to configfs send_msg interface
Documentation: netconsole: Document send_msg configfs attribute
Documentation/networking/netconsole.rst | 40 +++++++++++++++
drivers/net/netconsole.c | 59 ++++++++++++++++++----
.../selftests/drivers/net/netcons_sysdata.sh | 2 +-
3 files changed, 91 insertions(+), 10 deletions(-)
---
base-commit: ab084f0b8d6d2ee4b1c6a28f39a2a7430bdfa7f0
change-id: 20251127-netconsole_send_msg-89813956dc23
Best regards,
--
Breno Leitao <leitao(a)debian.org>
This series improves the CPU cost of RX token management by adding an
attribute to NETDEV_CMD_BIND_RX that configures sockets using the
binding to avoid the xarray allocator and instead use a per-binding niov
array and a uref field in niov.
Improvement is ~13% cpu util per RX user thread.
Using kperf, the following results were observed:
Before:
Average RX worker idle %: 13.13, flows 4, test runs 11
After:
Average RX worker idle %: 26.32, flows 4, test runs 11
Two other approaches were tested, but with no improvement. Namely, 1)
using a hashmap for tokens and 2) keeping an xarray of atomic counters
but using RCU so that the hotpath could be mostly lockless. Neither of
these approaches proved better than the simple array in terms of CPU.
The attribute NETDEV_A_DMABUF_AUTORELEASE is added to toggle the
optimization. It is an optional attribute and defaults to 0 (i.e.,
optimization on).
To: David S. Miller <davem(a)davemloft.net>
To: Eric Dumazet <edumazet(a)google.com>
To: Jakub Kicinski <kuba(a)kernel.org>
To: Paolo Abeni <pabeni(a)redhat.com>
To: Simon Horman <horms(a)kernel.org>
To: Kuniyuki Iwashima <kuniyu(a)google.com>
To: Willem de Bruijn <willemb(a)google.com>
To: Neal Cardwell <ncardwell(a)google.com>
To: David Ahern <dsahern(a)kernel.org>
To: Mina Almasry <almasrymina(a)google.com>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Andrew Lunn <andrew+netdev(a)lunn.ch>
To: Shuah Khan <shuah(a)kernel.org>
Cc: Stanislav Fomichev <sdf(a)fomichev.me>
Cc: netdev(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Signed-off-by: Bobby Eshleman <bobbyeshleman(a)meta.com>
Changes in v7:
- use netlink instead of sockopt (Stan)
- restrict system to only one mode, dmabuf bindings can not co-exist
with different modes (Stan)
- use static branching to enforce single system-wide mode (Stan)
- Link to v6: https://lore.kernel.org/r/20251104-scratch-bobbyeshleman-devmem-tcp-token-u…
Changes in v6:
- renamed 'net: devmem: use niov array for token management' to refer to
optionality of new config
- added documentation and tests
- make autorelease flag per-socket sockopt instead of binding
field / sysctl
- many per-patch changes (see Changes sections per-patch)
- Link to v5: https://lore.kernel.org/r/20251023-scratch-bobbyeshleman-devmem-tcp-token-u…
Changes in v5:
- add sysctl to opt-out of performance benefit, back to old token release
- Link to v4: https://lore.kernel.org/all/20250926-scratch-bobbyeshleman-devmem-tcp-token…
Changes in v4:
- rebase to net-next
- Link to v3: https://lore.kernel.org/r/20250926-scratch-bobbyeshleman-devmem-tcp-token-u…
Changes in v3:
- make urefs per-binding instead of per-socket, reducing memory
footprint
- fallback to cleaning up references in dmabuf unbind if socket
leaked tokens
- drop ethtool patch
- Link to v2: https://lore.kernel.org/r/20250911-scratch-bobbyeshleman-devmem-tcp-token-u…
Changes in v2:
- net: ethtool: prevent user from breaking devmem single-binding rule
(Mina)
- pre-assign niovs in binding->vec for RX case (Mina)
- remove WARNs on invalid user input (Mina)
- remove extraneous binding ref get (Mina)
- remove WARN for changed binding (Mina)
- always use GFP_ZERO for binding->vec (Mina)
- fix length of alloc for urefs
- use atomic_set(, 0) to initialize sk_user_frags.urefs
- Link to v1: https://lore.kernel.org/r/20250902-scratch-bobbyeshleman-devmem-tcp-token-u…
---
Bobby Eshleman (5):
net: devmem: rename tx_vec to vec in dmabuf binding
net: devmem: refactor sock_devmem_dontneed for autorelease split
net: devmem: implement autorelease token management
net: devmem: document NETDEV_A_DMABUF_AUTORELEASE netlink attribute
selftests: drv-net: devmem: add autorelease tests
Documentation/netlink/specs/netdev.yaml | 12 +++
Documentation/networking/devmem.rst | 70 +++++++++++++
include/net/netmem.h | 1 +
include/net/sock.h | 7 +-
include/uapi/linux/netdev.h | 1 +
net/core/devmem.c | 121 ++++++++++++++++++----
net/core/devmem.h | 13 ++-
net/core/netdev-genl-gen.c | 5 +-
net/core/netdev-genl.c | 13 ++-
net/core/sock.c | 103 ++++++++++++++----
net/ipv4/tcp.c | 78 +++++++++++---
net/ipv4/tcp_ipv4.c | 13 ++-
net/ipv4/tcp_minisocks.c | 3 +-
tools/include/uapi/linux/netdev.h | 1 +
tools/testing/selftests/drivers/net/hw/devmem.py | 22 +++-
tools/testing/selftests/drivers/net/hw/ncdevmem.c | 19 ++--
16 files changed, 401 insertions(+), 81 deletions(-)
---
base-commit: 4c52142904b33b41c3ff7ee58670b4e3b3bf1120
change-id: 20250829-scratch-bobbyeshleman-devmem-tcp-token-upstream-292be174d503
Best regards,
--
Bobby Eshleman <bobbyeshleman(a)meta.com>
This series introduces NUMA-aware memory placement support for KVM guests
with guest_memfd memory backends. It builds upon Fuad Tabba's work (V17)
that enabled host-mapping for guest_memfd memory [1] and can be applied
directly applied on KVM tree [2] (branch kvm-next, base commit: a6ad5413,
Merge branch 'guest-memfd-mmap' into HEAD)
== Background ==
KVM's guest-memfd memory backend currently lacks support for NUMA policy
enforcement, causing guest memory allocations to be distributed across host
nodes according to kernel's default behavior, irrespective of any policy
specified by the VMM. This limitation arises because conventional userspace
NUMA control mechanisms like mbind(2) don't work since the memory isn't
directly mapped to userspace when allocations occur.
Fuad's work [1] provides the necessary mmap capability, and this series
leverages it to enable mbind(2).
== Implementation ==
This series implements proper NUMA policy support for guest-memfd by:
1. Adding mempolicy-aware allocation APIs to the filemap layer.
2. Introducing custom inodes (via a dedicated slab-allocated inode cache,
kvm_gmem_inode_info) to store NUMA policy and metadata for guest memory.
3. Implementing get/set_policy vm_ops in guest_memfd to support NUMA
policy.
With these changes, VMMs can now control guest memory placement by mapping
guest_memfd file descriptor and using mbind(2) to specify:
- Policy modes: default, bind, interleave, or preferred
- Host NUMA nodes: List of target nodes for memory allocation
These Policies affect only future allocations and do not migrate existing
memory. This matches mbind(2)'s default behavior which affects only new
allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags (Not
supported for guest_memfd as it is unmovable by design).
== Upstream Plan ==
Phased approach as per David's guest_memfd extension overview [3] and
community calls [4]:
Phase 1 (this series):
1. Focuses on shared guest_memfd support (non-CoCo VMs).
2. Builds on Fuad's host-mapping work [1].
Phase2 (future work):
1. NUMA support for private guest_memfd (CoCo VMs).
2. Depends on SNP in-place conversion support [5].
This series provides a clean integration path for NUMA-aware memory
management for guest_memfd and lays the groundwork for future confidential
computing NUMA capabilities.
Thanks,
Shivank
== Changelog ==
- v1,v2: Extended the KVM_CREATE_GUEST_MEMFD IOCTL to pass mempolicy.
- v3: Introduced fbind() syscall for VMM memory-placement configuration.
- v4-v6: Current approach using shared_policy support and vm_ops (based on
suggestions from David [6] and guest_memfd bi-weekly upstream
call discussion [7]).
- v7: Use inodes to store NUMA policy instead of file [8].
- v8: Rebase on top of Fuad's V12: Host mmaping for guest_memfd memory.
- v9: Rebase on top of Fuad's V13 and incorporate review comments
- V10: Rebase on top of Fuad's V17. Use latest guest_memfd inode patch
from Ackerley (with David's review comments). Use newer kmem_cache_create()
API variant with arg parameter (Vlastimil)
- V11: Rebase on kvm-next, remove RFC tag, use Ackerley's latest patch
and fix a rcu race bug during kvm module unload.
[1] https://lore.kernel.org/all/20250729225455.670324-1-seanjc@google.com
[2] https://git.kernel.org/pub/scm/virt/kvm/kvm.git/log/?h=next
[3] https://lore.kernel.org/all/c1c9591d-218a-495c-957b-ba356c8f8e09@redhat.com
[4] https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAo…
[5] https://lore.kernel.org/all/20250613005400.3694904-1-michael.roth@amd.com
[6] https://lore.kernel.org/all/6fbef654-36e2-4be5-906e-2a648a845278@redhat.com
[7] https://lore.kernel.org/all/2b77e055-98ac-43a1-a7ad-9f9065d7f38f@amd.com
[8] https://lore.kernel.org/all/diqzbjumm167.fsf@ackerleytng-ctop.c.googlers.com
Ackerley Tng (1):
KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes
Matthew Wilcox (Oracle) (2):
mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio()
mm/filemap: Extend __filemap_get_folio() to support NUMA memory
policies
Shivank Garg (4):
mm/mempolicy: Export memory policy symbols
KVM: guest_memfd: Add slab-allocated inode cache
KVM: guest_memfd: Enforce NUMA mempolicy using shared policy
KVM: guest_memfd: selftests: Add tests for mmap and NUMA policy
support
fs/bcachefs/fs-io-buffered.c | 2 +-
fs/btrfs/compression.c | 4 +-
fs/btrfs/verity.c | 2 +-
fs/erofs/zdata.c | 2 +-
fs/f2fs/compress.c | 2 +-
include/linux/pagemap.h | 18 +-
include/uapi/linux/magic.h | 1 +
mm/filemap.c | 23 +-
mm/mempolicy.c | 6 +
mm/readahead.c | 2 +-
tools/testing/selftests/kvm/Makefile.kvm | 1 +
.../testing/selftests/kvm/guest_memfd_test.c | 121 ++++++++
virt/kvm/guest_memfd.c | 262 ++++++++++++++++--
virt/kvm/kvm_main.c | 7 +-
virt/kvm/kvm_mm.h | 9 +-
15 files changed, 412 insertions(+), 50 deletions(-)
--
2.43.0
---
== Earlier Postings ==
v10: https://lore.kernel.org/all/20250811090605.16057-2-shivankg@amd.com
v9: https://lore.kernel.org/all/20250713174339.13981-2-shivankg@amd.com
v8: https://lore.kernel.org/all/20250618112935.7629-1-shivankg@amd.com
v7: https://lore.kernel.org/all/20250408112402.181574-1-shivankg@amd.com
v6: https://lore.kernel.org/all/20250226082549.6034-1-shivankg@amd.com
v5: https://lore.kernel.org/all/20250219101559.414878-1-shivankg@amd.com
v4: https://lore.kernel.org/all/20250210063227.41125-1-shivankg@amd.com
v3: https://lore.kernel.org/all/20241105164549.154700-1-shivankg@amd.com
v2: https://lore.kernel.org/all/20240919094438.10987-1-shivankg@amd.com
v1: https://lore.kernel.org/all/20240916165743.201087-1-shivankg@amd.com
Hi all,
This patch series introduces improvements to the cgroup selftests by adding
a helper function to better handle asynchronous updates in cgroup statistics.
These changes are especially useful for managing cgroup stats like
memory.stat and cgroup.stat, which can be affected by delays (e.g., RCU
callbacks and asynchronous rstat flushing).
Patch 1/3 adds cg_read_key_long_poll(), a generic helper to poll a numeric
key in a cgroup file until it reaches an expected value or a retry limit is
hit. Patches 2/3 and 3/3 convert existing tests to use this helper, making
them more robust on busy systems.
v5:
- Drop the "/* 3s total */" comment from MEMCG_SOCKSTAT_WAIT_RETRIES as
suggested by Shakeel, so it does not become stale if the wait interval
changes.
- Elaborate in the commit message of patch 3/3 on the rationale behind the
8s timeout in test_kmem_dead_cgroups(), and add a comment next to
KMEM_DEAD_WAIT_RETRIES explaining that it is a generous upper bound
derived from stress testing and not tied to a specific kernel constant.
- Add Reviewed-by: Shakeel Butt <shakeel.butt(a)linux.dev> to this series.
- Link to v4: https://lore.kernel.org/all/20251124123816.486164-1-zhangguopeng@kylinos.cn/
v4:
- Patch 1/3: Add the cg_read_key_long_poll() helper to poll cgroup keys
with retries and configurable intervals.
- Patch 2/3: Update test_memcg_sock() to use cg_read_key_long_poll() for
handling delayed "sock " counter updates in memory.stat.
- Patch 3/3: Replace the sleep-and-retry logic in test_kmem_dead_cgroups()
with cg_read_key_long_poll() for waiting on nr_dying_descendants.
- Link to v3: https://lore.kernel.org/all/p655qedqjaakrnqpytc6dltejfluxo6jrffcltfz2ivonmk…
v3:
- Move MEMCG_SOCKSTAT_WAIT_* defines after the #include block as suggested.
- Link to v2: https://lore.kernel.org/all/5ad2b75f-748a-4e93-8d11-63295bda0cbf@linux.dev/
v2:
- Clarify the rationale for the 3s timeout and mention the periodic rstat
flush interval (FLUSH_TIME = 2 * HZ) in the comment.
- Replace hardcoded retry count and wait interval with macros to avoid
magic numbers and make the timeout calculation explicit.
- Link to v1: https://lore.kernel.org/all/20251119122758.85610-1-ioworker0@gmail.com/
Thanks to Michal Koutný for the suggestion, and to Lance Yang and Shakeel Butt for their reviews and feedback.
Guopeng Zhang (3):
selftests: cgroup: Add cg_read_key_long_poll() to poll a cgroup key
with retries
selftests: cgroup: make test_memcg_sock robust against delayed sock
stats
selftests: cgroup: Replace sleep with cg_read_key_long_poll() for
waiting on nr_dying_descendants
.../selftests/cgroup/lib/cgroup_util.c | 21 ++++++++++++
.../cgroup/lib/include/cgroup_util.h | 5 +++
tools/testing/selftests/cgroup/test_kmem.c | 33 +++++++++----------
.../selftests/cgroup/test_memcontrol.c | 20 ++++++++++-
4 files changed, 60 insertions(+), 19 deletions(-)
--
2.25.1
[ based on kvm/next ]
Unmapping virtual machine guest memory from the host kernel's direct map
is a successful mitigation against Spectre-style transient execution
issues: if the kernel page tables do not contain entries pointing to
guest memory, then any attempted speculative read through the direct map
will necessarily be blocked by the MMU before any observable
microarchitectural side-effects happen. This means that Spectre-gadgets
and similar cannot be used to target virtual machine memory. Roughly
60% of speculative execution issues fall into this category [1, Table
1].
This patch series extends guest_memfd with the ability to remove its
memory from the host kernel's direct map, to be able to attain the above
protection for KVM guests running inside guest_memfd.
Additionally, a Firecracker branch with support for these VMs can be
found on GitHub [2].
For more details, please refer to the v5 cover letter. No substantial
changes in design have taken place since.
See also related write() syscall support in guest_memfd [3] where
the interoperation between the two features is described.
Changes since v7:
- David: separate patches for adding x86 and ARM support
- Dave/Will: drop support for disabling TLB flushes
v7: https://lore.kernel.org/kvm/20250924151101.2225820-1-patrick.roy@campus.lmu…
v6: https://lore.kernel.org/kvm/20250912091708.17502-1-roypat@amazon.co.uk
v5: https://lore.kernel.org/kvm/20250828093902.2719-1-roypat@amazon.co.uk
v4: https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk
RFCv3: https://lore.kernel.org/kvm/20241030134912.515725-1-roypat@amazon.co.uk
RFCv2: https://lore.kernel.org/kvm/20240910163038.1298452-1-roypat@amazon.co.uk
RFCv1: https://lore.kernel.org/kvm/20240709132041.3625501-1-roypat@amazon.co.uk
[1] https://download.vusec.net/papers/quarantine_raid23.pdf
[2] https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hidi…
[3] https://lore.kernel.org/kvm/20251114151828.98165-1-kalyazin@amazon.com
Patrick Roy (13):
x86: export set_direct_map_valid_noflush to KVM module
x86/tlb: export flush_tlb_kernel_range to KVM module
mm: introduce AS_NO_DIRECT_MAP
KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate
KVM: guest_memfd: Add flag to remove from direct map
KVM: x86: define kvm_arch_gmem_supports_no_direct_map()
KVM: arm64: define kvm_arch_gmem_supports_no_direct_map()
KVM: selftests: load elf via bounce buffer
KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd
!= -1
KVM: selftests: Add guest_memfd based vm_mem_backing_src_types
KVM: selftests: cover GUEST_MEMFD_FLAG_NO_DIRECT_MAP in existing
selftests
KVM: selftests: stuff vm_mem_backing_src_type into vm_shape
KVM: selftests: Test guest execution from direct map removed gmem
Documentation/virt/kvm/api.rst | 22 ++++---
arch/arm64/include/asm/kvm_host.h | 13 ++++
arch/x86/include/asm/kvm_host.h | 9 +++
arch/x86/include/asm/tlbflush.h | 3 +-
arch/x86/mm/pat/set_memory.c | 1 +
arch/x86/mm/tlb.c | 1 +
include/linux/kvm_host.h | 14 ++++
include/linux/pagemap.h | 16 +++++
include/linux/secretmem.h | 18 ------
include/uapi/linux/kvm.h | 1 +
lib/buildid.c | 4 +-
mm/gup.c | 19 ++----
mm/mlock.c | 2 +-
mm/secretmem.c | 8 +--
.../testing/selftests/kvm/guest_memfd_test.c | 17 ++++-
.../testing/selftests/kvm/include/kvm_util.h | 37 ++++++++---
.../testing/selftests/kvm/include/test_util.h | 8 +++
tools/testing/selftests/kvm/lib/elf.c | 8 +--
tools/testing/selftests/kvm/lib/io.c | 23 +++++++
tools/testing/selftests/kvm/lib/kvm_util.c | 59 +++++++++--------
tools/testing/selftests/kvm/lib/test_util.c | 8 +++
tools/testing/selftests/kvm/lib/x86/sev.c | 1 +
.../selftests/kvm/pre_fault_memory_test.c | 1 +
.../selftests/kvm/set_memory_region_test.c | 52 +++++++++++++--
.../kvm/x86/private_mem_conversions_test.c | 7 +-
virt/kvm/guest_memfd.c | 64 +++++++++++++++++--
26 files changed, 314 insertions(+), 102 deletions(-)
base-commit: e0c26d47def7382d7dbd9cad58bc653aed75737a
--
2.50.1
Hi,
the kernel selftests do not currently support glibc v2.35 and older.
This is primarily because glibc v2.36 added support for syscall API
functions such as fsopen, move_mount, fsmount, open_tree, and fsconfig,
and those API functions are now used in several tests.
As result, glibc v2.35 and older are no longer supported, at least not
for affected tests.
Historically the selftest framework implemented such functions with
wrappers named sys_<syscall>. This means that sys_<syscall> and <syscall>
functions are now used in parallel.
I see a number of possibilities to solve the problem.
1) Do nothing. Document somewhere that glibc v2.35 and older is not
supported by the kernel selftests.
2) Document that glibc v2.35 and older is not supported, drop
all wrappers implementing the sys_<syscall> functions, and rename
the calling code to <syscall>.
3) Rename all calls to sys_<syscall> to <syscall> and modify the
wrapper code to only implement those functions for glibc v2.35 and
older.
4) Add defines to the existing wrapper code to also define <syscall>
in addition to sys_<syscall>, and leave the code otherwise alone.
What would be the preferred solution ? My personal preference would be 3),
followed by 4).
Thanks,
Guenter
Hi,
This series fixes a verifier issue with bpf_d_path() and adds a
regression test to cover its use within a hook function.
Patch 1 updates the bpf_d_path() helper prototype so that the second
argument is marked as MEM_WRITE. This makes it explicit to the verifier
that the helper writes into the provided buffer.
Patch 2 extends the existing d_path selftest to cover incorrect verifier
assumptions caused by an incorrect function prototype. The test program calls
bpf_d_path() and checks if the first character of the path is '/'.
It ensures the verifier does not assume the buffer remains unwritten.
Changelog
=========
v4:
- Use the fallocate hook instead of an LSM hook to simplify the selftest,
as suggested by Matt and Alexei.
- Add a utility function in test_d_path.c to load the BPF program,
improving code reuse.
v3:
- Switch the pathname prefix loop to use bpf_for() instead of
#pragma unroll, as suggested by Matt.
- Remove /tmp/bpf_d_path_test in the test cleanup path.
- Add the missing Reviewed-by tags.
v2:
- Merge the new test into the existing d_path selftest rather than
creating new files.
- Add PID filtering in the LSM program to avoid nondeterministic failures
due to unrelated processes triggering bprm_check_security.
- Synchronize child execution using a pipe to ensure deterministic
updates to the PID.
Thanks for your time and reviews.
Shuran Liu (2):
bpf: mark bpf_d_path() buffer as writeable
selftests/bpf: add regression test for bpf_d_path()
kernel/trace/bpf_trace.c | 2 +-
.../testing/selftests/bpf/prog_tests/d_path.c | 90 +++++++++++++++----
.../testing/selftests/bpf/progs/test_d_path.c | 23 +++++
3 files changed, 96 insertions(+), 19 deletions(-)
--
2.52.0
v24:
Took PaulW suggestion for fixing commit desc and checkpatch fixes
in patch #21.
"riscv: kernel command line option to opt out of user cfi"
Collected "Tested-by" tags from Andreas and Valentin. Thanks a lot
for testing it.
v23:
fixed some of the "CHECK:" reported on checkpatch --strict.
Accepted Joel's suggestion for kselftest's Makefile.
CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v24:
- Checkpatch fixes in patch # 21.
- Collected "Tested-by" tags.
v23:
- fixed some of the "CHECK:" reported on checkpatch --strict.
- Accepted Joel's suggestion for kselftest's Makefile.
- CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v24:
- Link to v23: https://lore.kernel.org/r/20251112-v5_user_cfi_series-v23-0-b55691eacf4f@ri…
Changes in v23:
- Link to v22: https://lore.kernel.org/r/20251023-v5_user_cfi_series-v22-0-1935270f7636@ri…
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 98 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 25 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 2 +
tools/testing/selftests/riscv/cfi/Makefile | 23 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2482 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
v25: Removal of `riscv_nousercfi` from `cpufeature.c` and instead placing
it as extern in `usercfi.h` was leading to build error whene cfi config
is not selected. Placed `riscv_nousercfi` outside cfi config ifdef block
in `usercfi.h`
v24:
Took PaulW suggestion for fixing commit desc and checkpatch fixes
in patch #21.
"riscv: kernel command line option to opt out of user cfi"
Collected "Tested-by" tags from Andreas and Valentin. Thanks a lot
for testing it.
v23:
fixed some of the "CHECK:" reported on checkpatch --strict.
Accepted Joel's suggestion for kselftest's Makefile.
CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v25:
- fixed build error when cfi config is not selected due to missing symbol
error on `riscv_nousercfi`.
v24:
- Checkpatch fixes in patch # 21.
- Collected "Tested-by" tags.
v23:
- fixed some of the "CHECK:" reported on checkpatch --strict.
- Accepted Joel's suggestion for kselftest's Makefile.
- CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v25:
- Link to v24: https://lore.kernel.org/r/20251204-v5_user_cfi_series-v24-0-ada7a3ba14dc@ri…
Changes in v24:
- Link to v23: https://lore.kernel.org/r/20251112-v5_user_cfi_series-v23-0-b55691eacf4f@ri…
Changes in v23:
- Link to v22: https://lore.kernel.org/r/20251023-v5_user_cfi_series-v22-0-1935270f7636@ri…
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 97 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 25 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 2 +
tools/testing/selftests/riscv/cfi/Makefile | 23 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2481 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
From: Patrick Roy <roypat(a)amazon.co.uk>
[ based on kvm/next ]
Unmapping virtual machine guest memory from the host kernel's direct map is a
successful mitigation against Spectre-style transient execution issues: If the
kernel page tables do not contain entries pointing to guest memory, then any
attempted speculative read through the direct map will necessarily be blocked
by the MMU before any observable microarchitectural side-effects happen. This
means that Spectre-gadgets and similar cannot be used to target virtual machine
memory. Roughly 60% of speculative execution issues fall into this category [1,
Table 1].
This patch series extends guest_memfd with the ability to remove its memory
from the host kernel's direct map, to be able to attain the above protection
for KVM guests running inside guest_memfd.
Additionally, a Firecracker branch with support for these VMs can be found on
GitHub [2].
For more details, please refer to the v5 cover letter [v5]. No
substantial changes in design have taken place since.
=== Changes Since v6 ===
- Drop patch for passing struct address_space to ->free_folio(), due to
possible races with freeing of the address_space. (Hugh)
- Stop using PG_uptodate / gmem preparedness tracking to keep track of
direct map state. Instead, use the lowest bit of folio->private. (Mike, David)
- Do direct map removal when establishing mapping of gmem folio instead
of at allocation time, due to impossibility of handling direct map
removal errors in kvm_gmem_populate(). (Patrick)
- Do TLB flushes after direct map removal, and provide a module
parameter to opt out from them, and a new patch to export
flush_tlb_kernel_range() to KVM. (Will)
[1]: https://download.vusec.net/papers/quarantine_raid23.pdf
[2]: https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hidi…
[RFCv1]: https://lore.kernel.org/kvm/20240709132041.3625501-1-roypat@amazon.co.uk/
[RFCv2]: https://lore.kernel.org/kvm/20240910163038.1298452-1-roypat@amazon.co.uk/
[RFCv3]: https://lore.kernel.org/kvm/20241030134912.515725-1-roypat@amazon.co.uk/
[v4]: https://lore.kernel.org/kvm/20250221160728.1584559-1-roypat@amazon.co.uk/
[v5]: https://lore.kernel.org/kvm/20250828093902.2719-1-roypat@amazon.co.uk/
[v6]: https://lore.kernel.org/kvm/20250912091708.17502-1-roypat@amazon.co.uk/
Patrick Roy (12):
arch: export set_direct_map_valid_noflush to KVM module
x86/tlb: export flush_tlb_kernel_range to KVM module
mm: introduce AS_NO_DIRECT_MAP
KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate
KVM: guest_memfd: Add flag to remove from direct map
KVM: guest_memfd: add module param for disabling TLB flushing
KVM: selftests: load elf via bounce buffer
KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd
!= -1
KVM: selftests: Add guest_memfd based vm_mem_backing_src_types
KVM: selftests: cover GUEST_MEMFD_FLAG_NO_DIRECT_MAP in existing
selftests
KVM: selftests: stuff vm_mem_backing_src_type into vm_shape
KVM: selftests: Test guest execution from direct map removed gmem
Documentation/virt/kvm/api.rst | 5 ++
arch/arm64/include/asm/kvm_host.h | 12 ++++
arch/arm64/mm/pageattr.c | 1 +
arch/loongarch/mm/pageattr.c | 1 +
arch/riscv/mm/pageattr.c | 1 +
arch/s390/mm/pageattr.c | 1 +
arch/x86/include/asm/tlbflush.h | 3 +-
arch/x86/mm/pat/set_memory.c | 1 +
arch/x86/mm/tlb.c | 1 +
include/linux/kvm_host.h | 9 +++
include/linux/pagemap.h | 16 +++++
include/linux/secretmem.h | 18 -----
include/uapi/linux/kvm.h | 2 +
lib/buildid.c | 4 +-
mm/gup.c | 19 ++----
mm/mlock.c | 2 +-
mm/secretmem.c | 8 +--
.../testing/selftests/kvm/guest_memfd_test.c | 2 +
.../testing/selftests/kvm/include/kvm_util.h | 37 ++++++++---
.../testing/selftests/kvm/include/test_util.h | 8 +++
tools/testing/selftests/kvm/lib/elf.c | 8 +--
tools/testing/selftests/kvm/lib/io.c | 23 +++++++
tools/testing/selftests/kvm/lib/kvm_util.c | 61 +++++++++--------
tools/testing/selftests/kvm/lib/test_util.c | 8 +++
tools/testing/selftests/kvm/lib/x86/sev.c | 1 +
.../selftests/kvm/pre_fault_memory_test.c | 1 +
.../selftests/kvm/set_memory_region_test.c | 50 ++++++++++++--
.../kvm/x86/private_mem_conversions_test.c | 7 +-
virt/kvm/guest_memfd.c | 66 +++++++++++++++++--
virt/kvm/kvm_main.c | 8 +++
30 files changed, 290 insertions(+), 94 deletions(-)
base-commit: a6ad54137af92535cfe32e19e5f3bc1bb7dbd383
--
2.51.0
Hi Reinette,
On 12/5/2025 8:55 AM, Reinette Chatre wrote:
>> Signed-off-by: Xiaochen Shen <shenxiaochen(a)open-hieco.net>
>> ---
> I think it may help to add a maintainer note here to highlight even though this
> is a fix it is not a candidate for backport since, based on your other series,
> support for Hygon is in process of being added to resctrl.
>
Great suggestion.
I will add a maintainer note as you suggested in v2 patch series.
>> tools/testing/selftests/resctrl/cat_test.c | 4 ++--
>> 1 file changed, 2 insertions(+), 2 deletions(-)
> Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
>
> Reinette
Thank you very much for code review!
Best regards,
Xiaochen Shen
This patch series introduces BPF iterators for wakeup_source, enabling
BPF programs to efficiently traverse a device's wakeup sources.
Currently, inspecting wakeup sources typically involves reading interfaces
like /sys/class/wakeup/* or debugfs. The repeated syscalls to query the
sysfs nodes is inefficient, as there can be hundreds of wakeup_sources, and
each wakeup source have multiple stats, with one sysfs node per stat.
debugfs is unstable and insecure.
This series implements two types of iterators:
1. Standard BPF Iterator: Allows creating a BPF link to iterate over
wakeup sources
2. Open-coded Iterator: Enables the use of wakeup_source iterators directly
within BPF programs
Both iterators utilize pre-existing APIs wakeup_sources_walk_* to traverse
over the SRCU that backs the list of wakeup_sources.
Samuel Wu (4):
bpf: Add wakeup_source iterator
bpf: Open coded BPF for wakeup_sources
selftests/bpf: Add tests for wakeup_sources
selftests/bpf: Open coded BPF wakeup_sources test
kernel/bpf/Makefile | 1 +
kernel/bpf/helpers.c | 3 +
kernel/bpf/wakeup_source_iter.c | 137 ++++++++
.../testing/selftests/bpf/bpf_experimental.h | 5 +
tools/testing/selftests/bpf/config | 1 +
.../bpf/prog_tests/wakeup_source_iter.c | 323 ++++++++++++++++++
.../selftests/bpf/progs/wakeup_source_iter.c | 117 +++++++
7 files changed, 587 insertions(+)
create mode 100644 kernel/bpf/wakeup_source_iter.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/wakeup_source_iter.c
create mode 100644 tools/testing/selftests/bpf/progs/wakeup_source_iter.c
--
2.52.0.177.g9f829587af-goog
Hi,
This series fixes a verifier issue with bpf_d_path() and adds a
regression test to cover its use from an LSM program.
Patch 1 updates the bpf_d_path() helper prototype so that the second
argument is marked as MEM_WRITE. This makes it explicit to the verifier
that the helper writes into the provided buffer.
Patch 2 extends the existing d_path selftest to also cover the LSM
bprm_check_security hook. The LSM program calls bpf_d_path() on the
binary being executed and performs a simple prefix comparison on the
resulting pathname. To avoid nondeterminism, the program filters based
on an expected PID that is populated from userspace before the test
binary is executed, and the parent and child processes are synchronized
through a pipe so that the PID is set before exec. The test now uses
bpf_for() to express the small fixed-iteration loop in a
verifier-friendly way, and it removes the temporary /tmp/bpf_d_path_test
binary in the cleanup path.
Changelog
=========
v3:
- Switch the pathname prefix loop to use bpf_for() instead of
#pragma unroll, as suggested by Matt.
- Remove /tmp/bpf_d_path_test in the test cleanup path.
- Add the missing Reviewed-by tags.
v2:
- Merge the new test into the existing d_path selftest rather than
creating new files.
- Add PID filtering in the LSM program to avoid nondeterministic failures
due to unrelated processes triggering bprm_check_security.
- Synchronize child execution using a pipe to ensure deterministic
updates to the PID.
Thanks for your time and reviews.
Shuran Liu (2):
bpf: mark bpf_d_path() buffer as writeable
selftests/bpf: fix and consolidate d_path LSM regression test
kernel/trace/bpf_trace.c | 2 +-
.../testing/selftests/bpf/prog_tests/d_path.c | 65 +++++++++++++++++++
.../testing/selftests/bpf/progs/test_d_path.c | 33 ++++++++++
3 files changed, 99 insertions(+), 1 deletion(-)
--
2.52.0
From: "Mike Rapoport (Microsoft)" <rppt(a)kernel.org>
Hi,
These patches allow guest_memfd to notify userspace about minor page
faults using userfaultfd and let userspace to resolve these page faults
using UFFDIO_CONTINUE.
To allow UFFDIO_CONTINUE outside of the core mm I added a
get_folio_noalloc() callback to vm_ops that allows an address space
backing a VMA to return a folio that exists in it's page cache (patch 2)
In order for guest_memfd to notify userspace about page faults, there is a
new VM_FAULT_UFFD_MINOR that a ->fault() handler can return to inform the
page fault handler that it needs to call handle_userfault() to complete the
fault (patch 3).
Patch 4 plumbs these new goodies into guest_memfd.
This series is the minimal change I've been able to come up with to allow
integration of guest_memfd with uffd and while refactoring uffd and making
mfill_atomic() flow more linear would have been a nice improvement, it's
way out of the scope of enabling uffd with guest_memfd.
v3 changes:
* rename ->get_folio() to ->get_folio_noalloc()
* fix build errors reported by kbuild
* pull handling of UFFD_MINOR out of hotpath in __do_fault()
* update guest_memfs changes so its ->fault() and ->get_folio_noalloc()
follow the same semantics as shmem and hugetlb.
* s/MISSING/MINOR/g in changelogs
* added review tags
v2: https://lore.kernel.org/all/20251125183840.2368510-1-rppt@kernel.org
* rename ->get_shared_folio() to ->get_folio()
* hardwire VM_FAULF_UFFD_MINOR to 0 when CONFIG_USERFAULTFD=n
v1: https://patch.msgid.link/20251123102707.559422-1-rppt@kernel.org
* Introduce VM_FAULF_UFFD_MINOR to avoid exporting handle_userfault()
* Simplify vma_can_mfill_atomic()
* Rename get_pagecache_folio() to get_shared_folio() and use inode
instead of vma as its argument
rfc: https://patch.msgid.link/20251117114631.2029447-1-rppt@kernel.org
Mike Rapoport (Microsoft) (4):
userfaultfd: move vma_can_userfault out of line
userfaultfd, shmem: use a VMA callback to handle UFFDIO_CONTINUE
mm: introduce VM_FAULT_UFFD_MINOR fault reason
guest_memfd: add support for userfaultfd minor mode
Nikita Kalyazin (1):
KVM: selftests: test userfaultfd minor for guest_memfd
include/linux/mm.h | 9 ++
include/linux/mm_types.h | 10 +-
include/linux/userfaultfd_k.h | 36 +------
mm/memory.c | 5 +-
mm/shmem.c | 20 +++-
mm/userfaultfd.c | 80 ++++++++++++---
.../testing/selftests/kvm/guest_memfd_test.c | 97 +++++++++++++++++++
virt/kvm/guest_memfd.c | 33 ++++++-
8 files changed, 236 insertions(+), 54 deletions(-)
base-commit: ac3fd01e4c1efce8f2c054cdeb2ddd2fc0fb150d
--
2.51.0
v23:
fixed some of the "CHECK:" reported on checkpatch --strict.
Accepted Joel's suggestion for kselftest's Makefile.
CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22: fixing build error due to -march=zicfiss being picked in gcc-13 and above
but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
v21: fixed build errors.
Basics and overview
===================
Software with larger attack surfaces (e.g. network facing apps like databases,
browsers or apps relying on browser runtimes) suffer from memory corruption
issues which can be utilized by attackers to bend control flow of the program
to eventually gain control (by making their payload executable). Attackers are
able to perform such attacks by leveraging call-sites which rely on indirect
calls or return sites which rely on obtaining return address from stack memory.
To mitigate such attacks, risc-v extension zicfilp enforces that all indirect
calls must land on a landing pad instruction `lpad` else cpu will raise software
check exception (a new cpu exception cause code on riscv).
Similarly for return flow, risc-v extension zicfiss extends architecture with
- `sspush` instruction to push return address on a shadow stack
- `sspopchk` instruction to pop return address from shadow stack
and compare with input operand (i.e. return address on stack)
- `sspopchk` to raise software check exception if comparision above
was a mismatch
- Protection mechanism using which shadow stack is not writeable via
regular store instructions
More information an details can be found at extensions github repo [1].
Equivalent to landing pad (zicfilp) on x86 is `ENDBRANCH` instruction in Intel
CET [3] and branch target identification (BTI) [4] on arm.
Similarly x86's Intel CET has shadow stack [5] and arm64 has guarded control
stack (GCS) [6] which are very similar to risc-v's zicfiss shadow stack.
x86 and arm64 support for user mode shadow stack is already in mainline.
Kernel awareness for user control flow integrity
================================================
This series picks up Samuel Holland's envcfg changes [2] as well. So if those are
being applied independently, they should be removed from this series.
Enabling:
In order to maintain compatibility and not break anything in user mode, kernel
doesn't enable control flow integrity cpu extensions on binary by default.
Instead exposes a prctl interface to enable, disable and lock the shadow stack
or landing pad feature for a task. This allows userspace (loader) to enumerate
if all objects in its address space are compiled with shadow stack and landing
pad support and accordingly enable the feature. Additionally if a subsequent
`dlopen` happens on a library, user mode can take a decision again to disable
the feature (if incoming library is not compiled with support) OR terminate the
task (if user mode policy is strict to have all objects in address space to be
compiled with control flow integirty cpu feature). prctl to enable shadow stack
results in allocating shadow stack from virtual memory and activating for user
address space. x86 and arm64 are also following same direction due to similar
reason(s).
clone/fork:
On clone and fork, cfi state for task is inherited by child. Shadow stack is
part of virtual memory and is a writeable memory from kernel perspective
(writeable via a restricted set of instructions aka shadow stack instructions)
Thus kernel changes ensure that this memory is converted into read-only when
fork/clone happens and COWed when fault is taken due to sspush, sspopchk or
ssamoswap. In case `CLONE_VM` is specified and shadow stack is to be enabled,
kernel will automatically allocate a shadow stack for that clone call.
map_shadow_stack:
x86 introduced `map_shadow_stack` system call to allow user space to explicitly
map shadow stack memory in its address space. It is useful to allocate shadow
for different contexts managed by a single thread (green threads or contexts)
risc-v implements this system call as well.
signal management:
If shadow stack is enabled for a task, kernel performs an asynchronous control
flow diversion to deliver the signal and eventually expects userspace to issue
sigreturn so that original execution can be resumed. Even though resume context
is prepared by kernel, it is in user space memory and is subject to memory
corruption and corruption bugs can be utilized by attacker in this race window
to perform arbitrary sigreturn and eventually bypass cfi mechanism.
Another issue is how to ensure that cfi related state on sigcontext area is not
trampled by legacy apps or apps compiled with old kernel headers.
In order to mitigate control-flow hijacting, kernel prepares a token and place
it on shadow stack before signal delivery and places address of token in
sigcontext structure. During sigreturn, kernel obtains address of token from
sigcontext struture, reads token from shadow stack and validates it and only
then allow sigreturn to succeed. Compatiblity issue is solved by adopting
dynamic sigcontext management introduced for vector extension. This series
re-factor the code little bit to allow future sigcontext management easy (as
proposed by Andy Chiu from SiFive)
config and compilation:
Introduce a new risc-v config option `CONFIG_RISCV_USER_CFI`. Selecting this
config option picks the kernel support for user control flow integrity. This
optin is presented only if toolchain has shadow stack and landing pad support.
And is on purpose guarded by toolchain support. Reason being that eventually
vDSO also needs to be compiled in with shadow stack and landing pad support.
vDSO compile patches are not included as of now because landing pad labeling
scheme is yet to settle for usermode runtime.
To get more information on kernel interactions with respect to
zicfilp and zicfiss, patch series adds documentation for
`zicfilp` and `zicfiss` in following:
Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst
How to test this series
=======================
Toolchain
---------
$ git clone git@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix=<path-to-where-to-build> --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)
Qemu
----
Get the lastest qemu
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)
Opensbi
-------
$ git clone git@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE=<your riscv toolchain> -j$(nproc) PLATFORM=generic
Linux
-----
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=<path-to-cfi-riscv-gnu-toolchain>/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)
Running
-------
Modify your qemu command to have:
-bios <path-to-cfi-opensbi>/build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true
References
==========
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/all/20240814081126.956287-1-samuel.holland@sifive.c…
[3] - https://lwn.net/Articles/889475/
[4] - https://developer.arm.com/documentation/109576/0100/Branch-Target-Identific…
[5] - https://www.intel.com/content/dam/develop/external/us/en/documents/catc17-i…
[6] - https://lwn.net/Articles/940403/
To: Thomas Gleixner <tglx(a)linutronix.de>
To: Ingo Molnar <mingo(a)redhat.com>
To: Borislav Petkov <bp(a)alien8.de>
To: Dave Hansen <dave.hansen(a)linux.intel.com>
To: x86(a)kernel.org
To: H. Peter Anvin <hpa(a)zytor.com>
To: Andrew Morton <akpm(a)linux-foundation.org>
To: Liam R. Howlett <Liam.Howlett(a)oracle.com>
To: Vlastimil Babka <vbabka(a)suse.cz>
To: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
To: Paul Walmsley <paul.walmsley(a)sifive.com>
To: Palmer Dabbelt <palmer(a)dabbelt.com>
To: Albert Ou <aou(a)eecs.berkeley.edu>
To: Conor Dooley <conor(a)kernel.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Arnd Bergmann <arnd(a)arndb.de>
To: Christian Brauner <brauner(a)kernel.org>
To: Peter Zijlstra <peterz(a)infradead.org>
To: Oleg Nesterov <oleg(a)redhat.com>
To: Eric Biederman <ebiederm(a)xmission.com>
To: Kees Cook <kees(a)kernel.org>
To: Jonathan Corbet <corbet(a)lwn.net>
To: Shuah Khan <shuah(a)kernel.org>
To: Jann Horn <jannh(a)google.com>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Miguel Ojeda <ojeda(a)kernel.org>
To: Alex Gaynor <alex.gaynor(a)gmail.com>
To: Boqun Feng <boqun.feng(a)gmail.com>
To: Gary Guo <gary(a)garyguo.net>
To: Björn Roy Baron <bjorn3_gh(a)protonmail.com>
To: Benno Lossin <benno.lossin(a)proton.me>
To: Andreas Hindborg <a.hindborg(a)kernel.org>
To: Alice Ryhl <aliceryhl(a)google.com>
To: Trevor Gross <tmgross(a)umich.edu>
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-mm(a)kvack.org
Cc: linux-riscv(a)lists.infradead.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-arch(a)vger.kernel.org
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: alistair.francis(a)wdc.com
Cc: richard.henderson(a)linaro.org
Cc: jim.shu(a)sifive.com
Cc: andybnac(a)gmail.com
Cc: kito.cheng(a)sifive.com
Cc: charlie(a)rivosinc.com
Cc: atishp(a)rivosinc.com
Cc: evan(a)rivosinc.com
Cc: cleger(a)rivosinc.com
Cc: alexghiti(a)rivosinc.com
Cc: samitolvanen(a)google.com
Cc: broonie(a)kernel.org
Cc: rick.p.edgecombe(a)intel.com
Cc: rust-for-linux(a)vger.kernel.org
changelog
---------
v23:
- fixed some of the "CHECK:" reported on checkpatch --strict.
- Accepted Joel's suggestion for kselftest's Makefile.
- CONFIG_RISCV_USER_CFI is enabled when zicfiss, zicfilp and fcf-protection
are all present in toolchain
v22:
- CONFIG_RISCV_USER_CFI was by default "n". With dual vdso support it is
default "y" (if toolchain supports it). Fixing build error due to
"-march=zicfiss" being picked in gcc-13 partially. gcc-13 only recognizes the
flag but not actually doing any codegen or recognizing instruction for zicfiss.
Change in v22 makes dependence on `-fcf-protection=full` compiler flag to
ensure that toolchain has support and then only CONFIG_RISCV_USER_CFI will be
visible in menuconfig.
- picked up tags and some cosmetic changes in commit message for dual vdso
patch.
v21:
- Fixing build errors due to changes in arch/riscv/include/asm/vdso.h
Using #ifdef instead of IS_ENABLED in arch/riscv/include/asm/vdso.h
vdso-cfi-offsets.h should be included only when CONFIG_RISCV_USER_CFI
is selected.
v20:
- rebased on v6.18-rc1.
- Added two vDSO support. If `CONFIG_RISCV_USER_CFI` is selected
two vDSOs are compiled (one for hardware prior to RVA23 and one
for RVA23 onwards). Kernel exposes RVA23 vDSO if hardware/cpu
implements zimop else exposes existing vDSO to userspace.
- default selection for `CONFIG_RISCV_USER_CFI` is "Yes".
- replaced "__ASSEMBLY__" with "__ASSEMBLER__"
v19:
- riscv_nousercfi was `int`. changed it to unsigned long.
Thanks to Alex Ghiti for reporting it. It was a bug.
- ELP is cleared on trap entry only when CONFIG_64BIT.
- restore ssp back on return to usermode was being done
before `riscv_v_context_nesting_end` on trap exit path.
If kernel shadow stack were enabled this would result in
kernel operating on user shadow stack and panic (as I found
in my testing of kcfi patch series). So fixed that.
v18:
- rebased on 6.16-rc1
- uprobe handling clears ELP in sstatus image in pt_regs
- vdso was missing shadow stack elf note for object files.
added that. Additional asm file for vdso needed the elf marker
flag. toolchain should complain if `-fcf-protection=full` and
marker is missing for object generated from asm file. Asked
toolchain folks to fix this. Although no reason to gate the merge
on that.
- Split up compile options for march and fcf-protection in vdso
Makefile
- CONFIG_RISCV_USER_CFI option is moved under "Kernel features" menu
Added `arch/riscv/configs/hardening.config` fragment which selects
CONFIG_RISCV_USER_CFI
v17:
- fixed warnings due to empty macros in usercfi.h (reported by alexg)
- fixed prefixes in commit titles reported by alexg
- took below uprobe with fcfi v2 patch from Zong Li and squashed it with
"riscv/traps: Introduce software check exception and uprobe handling"
https://lore.kernel.org/all/20250604093403.10916-1-zong.li@sifive.com/
v16:
- If FWFT is not implemented or returns error for shadow stack activation, then
no_usercfi is set to disable shadow stack. Although this should be picked up
by extension validation and activation. Fixed this bug for zicfilp and zicfiss
both. Thanks to Charlie Jenkins for reporting this.
- If toolchain doesn't support cfi, cfi kselftest shouldn't build. Suggested by
Charlie Jenkins.
- Default for CONFIG_RISCV_USER_CFI is set to no. Charlie/Atish suggested to
keep it off till we have more hardware availibility with RVA23 profile and
zimop/zcmop implemented. Else this will start breaking people's workflow
- Includes the fix if "!RV64 and !SBI" then definitions for FWFT in
asm-offsets.c error.
v15:
- Toolchain has been updated to include `-fcf-protection` flag. This
exists for x86 as well. Updated kernel patches to compile vDSO and
selftest to compile with `fcf-protection=full` flag.
- selecting CONFIG_RISCV_USERCFI selects CONFIG_RISCV_SBI.
- Patch to enable shadow stack for kernel wasn't hidden behind
CONFIG_RISCV_USERCFI and CONFIG_RISCV_SBI. fixed that.
v14:
- rebased on top of palmer/sbi-v3. Thus dropped clement's FWFT patches
Updated RISCV_ISA_EXT_XXXX in hwcap and hwprobe constants.
- Took Radim's suggestions on bitfields.
- Placed cfi_state at the end of thread_info block so that current situation
is not disturbed with respect to member fields of thread_info in single
cacheline.
v13:
- cpu_supports_shadow_stack/cpu_supports_indirect_br_lp_instr uses
riscv_has_extension_unlikely()
- uses nops(count) to create nop slide
- RISCV_ACQUIRE_BARRIER is not needed in `amo_user_shstk`. Removed it
- changed ternaries to simply use implicit casting to convert to bool.
- kernel command line allows to disable zicfilp and zicfiss independently.
updated kernel-parameters.txt.
- ptrace user abi for cfi uses bitmasks instead of bitfields. Added ptrace
kselftest.
- cosmetic and grammatical changes to documentation.
v12:
- It seems like I had accidently squashed arch agnostic indirect branch
tracking prctl and riscv implementation of those prctls. Split them again.
- set_shstk_status/set_indir_lp_status perform CSR writes only when CPU
support is available. As suggested by Zong Li.
- Some minor clean up in kselftests as suggested by Zong Li.
v11:
- patch "arch/riscv: compile vdso with landing pad" was unconditionally
selecting `_zicfilp` for vDSO compile. fixed that. Changed `lpad 1` to
to `lpad 0`.
v10:
- dropped "mm: helper `is_shadow_stack_vma` to check shadow stack vma". This patch
is not that interesting to this patch series for risc-v. There are instances in
arch directories where VM_SHADOW_STACK flag is anyways used. Dropping this patch
to expedite merging in riscv tree.
- Took suggestions from `Clement` on "riscv: zicfiss / zicfilp enumeration" to
validate presence of cfi based on config.
- Added a patch for vDSO to have `lpad 0`. I had omitted this earlier to make sure
we add single vdso object with cfi enabled. But a vdso object with scheme of
zero labeled landing pad is least common denominator and should work with all
objects of zero labeled as well as function-signature labeled objects.
v9:
- rebased on master (39a803b754d5 fix braino in "9p: fix ->rename_sem exclusion")
- dropped "mm: Introduce ARCH_HAS_USER_SHADOW_STACK" (master has it from arm64/gcs)
- dropped "prctl: arch-agnostic prctl for shadow stack" (master has it from arm64/gcs)
v8:
- rebased on palmer/for-next
- dropped samuel holland's `envcfg` context switch patches.
they are in parlmer/for-next
v7:
- Removed "riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv"
Instead using `deactivate_mm` flow to clean up.
see here for more context
https://lore.kernel.org/all/20230908203655.543765-1-rick.p.edgecombe@intel.…
- Changed the header include in `kselftest`. Hopefully this fixes compile
issue faced by Zong Li at SiFive.
- Cleaned up an orphaned change to `mm/mmap.c` in below patch
"riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE"
- Lock interfaces for shadow stack and indirect branch tracking expect arg == 0
Any future evolution of this interface should accordingly define how arg should
be setup.
- `mm/map.c` has an instance of using `VM_SHADOW_STACK`. Fixed it to use helper
`is_shadow_stack_vma`.
- Link to v6: https://lore.kernel.org/r/20241008-v5_user_cfi_series-v6-0-60d9fe073f37@riv…
v6:
- Picked up Samuel Holland's changes as is with `envcfg` placed in
`thread` instead of `thread_info`
- fixed unaligned newline escapes in kselftest
- cleaned up messages in kselftest and included test output in commit message
- fixed a bug in clone path reported by Zong Li
- fixed a build issue if CONFIG_RISCV_ISA_V is not selected
(this was introduced due to re-factoring signal context
management code)
v5:
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
(style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
FWFT
(https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware…)
- Link to v5: https://lore.kernel.org/r/20241001-v5_user_cfi_series-v1-0-3ba65b6e550f@riv…
(Note: I had an issue in my workflow due to which version number wasn't
picked up correctly while sending out patches)
v4:
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on per-
thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.
- Link to v4: https://lore.kernel.org/all/20240912231650.3740732-1-debug@rivosinc.com/
v3:
- envcfg
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on per task basis, even though CPU didn't implement it. Fixed in
this series.
- dt-bindings
As suggested, split into separate commit. fixed the messaging that spec is
in public review
- arch_is_shadow_stack change
arch_is_shadow_stack changed to vma_is_shadow_stack
- hwprobe
zicfiss / zicfilp if present will get enumerated in hwprobe
- selftests
As suggested, added object and binary filenames to .gitignore
Selftest binary anyways need to be compiled with cfi enabled compiler which
will make sure that landing pad and shadow stack are enabled. Thus removed
separate enable/disable tests. Cleaned up tests a bit.
- Link to v3: https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/
v2:
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.
- Enabling of control flow integrity for user programs is left to user runtime
- This patch series introduces arch agnostic `prctls` to enable shadow stack
and indirect branch tracking. And implements them on riscv.
---
Changes in v23:
- Link to v22: https://lore.kernel.org/r/20251023-v5_user_cfi_series-v22-0-1935270f7636@ri…
Changes in v22:
- Link to v21: https://lore.kernel.org/r/20251015-v5_user_cfi_series-v21-0-6a07856e90e7@ri…
Changes in v21:
- Link to v20: https://lore.kernel.org/r/20251013-v5_user_cfi_series-v20-0-b9de4be9912e@ri…
Changes in v20:
- Link to v19: https://lore.kernel.org/r/20250731-v5_user_cfi_series-v19-0-09b468d7beab@ri…
Changes in v19:
- Link to v18: https://lore.kernel.org/r/20250711-v5_user_cfi_series-v18-0-a8ee62f9f38e@ri…
Changes in v18:
- Link to v17: https://lore.kernel.org/r/20250604-v5_user_cfi_series-v17-0-4565c2cf869f@ri…
Changes in v17:
- Link to v16: https://lore.kernel.org/r/20250522-v5_user_cfi_series-v16-0-64f61a35eee7@ri…
Changes in v16:
- Link to v15: https://lore.kernel.org/r/20250502-v5_user_cfi_series-v15-0-914966471885@ri…
Changes in v15:
- changelog posted just below cover letter
- Link to v14: https://lore.kernel.org/r/20250429-v5_user_cfi_series-v14-0-5239410d012a@ri…
Changes in v14:
- changelog posted just below cover letter
- Link to v13: https://lore.kernel.org/r/20250424-v5_user_cfi_series-v13-0-971437de586a@ri…
Changes in v13:
- changelog posted just below cover letter
- Link to v12: https://lore.kernel.org/r/20250314-v5_user_cfi_series-v12-0-e51202b53138@ri…
Changes in v12:
- changelog posted just below cover letter
- Link to v11: https://lore.kernel.org/r/20250310-v5_user_cfi_series-v11-0-86b36cbfb910@ri…
Changes in v11:
- changelog posted just below cover letter
- Link to v10: https://lore.kernel.org/r/20250210-v5_user_cfi_series-v10-0-163dcfa31c60@ri…
---
Andy Chiu (1):
riscv: signal: abstract header saving for setup_sigcontext
Deepak Gupta (26):
mm: VM_SHADOW_STACK definition for riscv
dt-bindings: riscv: zicfilp and zicfiss in dt-bindings (extensions.yaml)
riscv: zicfiss / zicfilp enumeration
riscv: zicfiss / zicfilp extension csr and bit definitions
riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
riscv/mm: manufacture shadow stack pte
riscv/mm: teach pte_mkwrite to manufacture shadow stack PTEs
riscv/mm: write protect and shadow stack
riscv/mm: Implement map_shadow_stack() syscall
riscv/shstk: If needed allocate a new shadow stack on clone
riscv: Implements arch agnostic shadow stack prctls
prctl: arch-agnostic prctl for indirect branch tracking
riscv: Implements arch agnostic indirect branch tracking prctls
riscv/traps: Introduce software check exception and uprobe handling
riscv/signal: save and restore of shadow stack for signal
riscv/kernel: update __show_regs to print shadow stack register
riscv/ptrace: riscv cfi status and state via ptrace and in core files
riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
riscv: kernel command line option to opt out of user cfi
riscv: enable kernel access to shadow stack memory via FWFT sbi call
arch/riscv: dual vdso creation logic and select vdso based on hw
riscv: create a config for shadow stack and landing pad instr support
riscv: Documentation for landing pad / indirect branch tracking
riscv: Documentation for shadow stack on riscv
kselftest/riscv: kselftest for user mode cfi
Jim Shu (1):
arch/riscv: compile vdso with landing pad and shadow stack note
Documentation/admin-guide/kernel-parameters.txt | 8 +
Documentation/arch/riscv/index.rst | 2 +
Documentation/arch/riscv/zicfilp.rst | 115 +++++
Documentation/arch/riscv/zicfiss.rst | 179 +++++++
.../devicetree/bindings/riscv/extensions.yaml | 14 +
arch/riscv/Kconfig | 22 +
arch/riscv/Makefile | 8 +-
arch/riscv/configs/hardening.config | 4 +
arch/riscv/include/asm/asm-prototypes.h | 1 +
arch/riscv/include/asm/assembler.h | 44 ++
arch/riscv/include/asm/cpufeature.h | 12 +
arch/riscv/include/asm/csr.h | 16 +
arch/riscv/include/asm/entry-common.h | 2 +
arch/riscv/include/asm/hwcap.h | 2 +
arch/riscv/include/asm/mman.h | 26 +
arch/riscv/include/asm/mmu_context.h | 7 +
arch/riscv/include/asm/pgtable.h | 30 +-
arch/riscv/include/asm/processor.h | 1 +
arch/riscv/include/asm/thread_info.h | 3 +
arch/riscv/include/asm/usercfi.h | 95 ++++
arch/riscv/include/asm/vdso.h | 13 +-
arch/riscv/include/asm/vector.h | 3 +
arch/riscv/include/uapi/asm/hwprobe.h | 2 +
arch/riscv/include/uapi/asm/ptrace.h | 34 ++
arch/riscv/include/uapi/asm/sigcontext.h | 1 +
arch/riscv/kernel/Makefile | 2 +
arch/riscv/kernel/asm-offsets.c | 10 +
arch/riscv/kernel/cpufeature.c | 27 +
arch/riscv/kernel/entry.S | 38 ++
arch/riscv/kernel/head.S | 27 +
arch/riscv/kernel/process.c | 27 +-
arch/riscv/kernel/ptrace.c | 95 ++++
arch/riscv/kernel/signal.c | 148 +++++-
arch/riscv/kernel/sys_hwprobe.c | 2 +
arch/riscv/kernel/sys_riscv.c | 10 +
arch/riscv/kernel/traps.c | 54 ++
arch/riscv/kernel/usercfi.c | 545 +++++++++++++++++++++
arch/riscv/kernel/vdso.c | 7 +
arch/riscv/kernel/vdso/Makefile | 40 +-
arch/riscv/kernel/vdso/flush_icache.S | 4 +
arch/riscv/kernel/vdso/gen_vdso_offsets.sh | 4 +-
arch/riscv/kernel/vdso/getcpu.S | 4 +
arch/riscv/kernel/vdso/note.S | 3 +
arch/riscv/kernel/vdso/rt_sigreturn.S | 4 +
arch/riscv/kernel/vdso/sys_hwprobe.S | 4 +
arch/riscv/kernel/vdso/vgetrandom-chacha.S | 5 +-
arch/riscv/kernel/vdso_cfi/Makefile | 25 +
arch/riscv/kernel/vdso_cfi/vdso-cfi.S | 11 +
arch/riscv/mm/init.c | 2 +-
arch/riscv/mm/pgtable.c | 16 +
include/linux/cpu.h | 4 +
include/linux/mm.h | 7 +
include/uapi/linux/elf.h | 2 +
include/uapi/linux/prctl.h | 27 +
kernel/sys.c | 30 ++
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 2 +
tools/testing/selftests/riscv/cfi/Makefile | 23 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 82 ++++
tools/testing/selftests/riscv/cfi/cfitests.c | 173 +++++++
tools/testing/selftests/riscv/cfi/shadowstack.c | 385 +++++++++++++++
tools/testing/selftests/riscv/cfi/shadowstack.h | 27 +
62 files changed, 2481 insertions(+), 41 deletions(-)
---
base-commit: 3a8660878839faadb4f1a6dd72c3179c1df56787
change-id: 20240930-v5_user_cfi_series-3dc332f8f5b2
--
- debug
test_memcg_sock() currently requires that memory.stat's "sock " counter
is exactly zero immediately after the TCP server exits. On a busy system
this assumption is too strict:
- Socket memory may be freed with a small delay (e.g. RCU callbacks).
- memcg statistics are updated asynchronously via the rstat flushing
worker, so the "sock " value in memory.stat can stay non-zero for a
short period of time even after all socket memory has been uncharged.
As a result, test_memcg_sock() can intermittently fail even though socket
memory accounting is working correctly.
Make the test more robust by polling memory.stat for the "sock "
counter and allowing it some time to drop to zero instead of checking
it only once. The timeout is set to 3 seconds to cover the periodic
rstat flush interval (FLUSH_TIME = 2*HZ by default) plus some
scheduling slack. If the counter does not become zero within the
timeout, the test still fails as before.
On my test system, running test_memcontrol 50 times produced:
- Before this patch: 6/50 runs passed.
- After this patch: 50/50 runs passed.
Suggested-by: Lance Yang <lance.yang(a)linux.dev>
Reviewed-by: Lance Yang <lance.yang(a)linux.dev>
Signed-off-by: Guopeng Zhang <zhangguopeng(a)kylinos.cn>
---
v3:
- Move MEMCG_SOCKSTAT_WAIT_* defines after the #include block as
suggested.
v2:
- Mention the periodic rstat flush interval (FLUSH_TIME = 2*HZ) in
the comment and clarify the rationale for the 3s timeout.
- Replace the hard-coded retry count and wait interval with macros
to avoid magic numbers and make the 3s timeout calculation explicit.
---
.../selftests/cgroup/test_memcontrol.c | 30 ++++++++++++++++++-
1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 4e1647568c5b..8ff7286fc80b 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -21,6 +21,9 @@
#include "kselftest.h"
#include "cgroup_util.h"
+#define MEMCG_SOCKSTAT_WAIT_RETRIES 30 /* 3s total */
+#define MEMCG_SOCKSTAT_WAIT_INTERVAL_US (100 * 1000) /* 100 ms */
+
static bool has_localevents;
static bool has_recursiveprot;
@@ -1384,6 +1387,8 @@ static int test_memcg_sock(const char *root)
int bind_retries = 5, ret = KSFT_FAIL, pid, err;
unsigned short port;
char *memcg;
+ long sock_post = -1;
+ int i;
memcg = cg_name(root, "memcg_test");
if (!memcg)
@@ -1432,7 +1437,30 @@ static int test_memcg_sock(const char *root)
if (cg_read_long(memcg, "memory.current") < 0)
goto cleanup;
- if (cg_read_key_long(memcg, "memory.stat", "sock "))
+ /*
+ * memory.stat is updated asynchronously via the memcg rstat
+ * flushing worker, which runs periodically (every 2 seconds,
+ * see FLUSH_TIME). On a busy system, the "sock " counter may
+ * stay non-zero for a short period of time after the TCP
+ * connection is closed and all socket memory has been
+ * uncharged.
+ *
+ * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
+ * scheduling slack) and require that the "sock " counter
+ * eventually drops to zero.
+ */
+ for (i = 0; i < MEMCG_SOCKSTAT_WAIT_RETRIES; i++) {
+ sock_post = cg_read_key_long(memcg, "memory.stat", "sock ");
+ if (sock_post < 0)
+ goto cleanup;
+
+ if (!sock_post)
+ break;
+
+ usleep(MEMCG_SOCKSTAT_WAIT_INTERVAL_US);
+ }
+
+ if (sock_post)
goto cleanup;
ret = KSFT_PASS;
--
2.25.1
We would like to add support for checkpoint/restoring file descriptors
open on these "unmounted" mounts to CRIU (Checkpoint/Restore in
Userspace) [1].
Currently, we have no way to get mount info for these "unmounted" mounts
since they do appear in /proc/<pid>/mountinfo and statmount does not
work on them, since they do not belong to any mount namespace.
This patch helps us by providing a way to get mountinfo for these
"unmounted" mounts by using a fd on the mount.
Changes from v6 [2] to v7:
* Add kselftests for STATMOUNT_BY_FD flag.
* Instead of renaming mnt_id_req.mnt_ns_fd to mnt_id_req.fd introduce a
union so struct mnt_id_req looks like this:
struct mnt_id_req {
__u32 size;
union {
__u32 mnt_ns_fd;
__u32 mnt_fd;
};
__u64 mnt_id;
__u64 param;
__u64 mnt_ns_id;
};
* In case of STATMOUNT_BY_FD grab mnt_ns inside of do_statmount(),
since we get mnt_ns from mnt, which should happen under namespace lock.
* Remove the modifications made to grab_requested_mnt_ns, those were
never needed.
Changes from v5 [3] to v6:
* Instead of returning "[unmounted]" as the mount point for "unmounted"
mounts, we unset the STATMOUNT_MNT_POINT flag in statmount.mask.
* Instead of returning 0 as the mnt_ns_id for "unmounted" mounts, we
unset the STATMOUNT_MNT_NS_ID flag in statmount.mask.
* Added comment in `do_statmount` clarifying that the caller sets s->mnt
in case of STATMOUNT_BY_FD.
* In `do_statmount` move the mnt_ns_id and mnt_ns_empty() check just
before lookup_mnt_in_ns().
* We took another look at the capability checks for getting information
for "unmounted" mounts using an fd and decided to remove them for the
following reasons:
- All fs related information is available via fstatfs() without any
capability check.
- Mount information is also available via /proc/pid/mountinfo (without
any capability check).
- Given that we have access to a fd on the mount which tells us that
we had access to the mount at some point (or someone that had access
gave us the fd). So, we should be able to access mount info.
Changes from v4 [4] to v5:
Check only for s->root.mnt to be NULL instead of checking for both
s->root.mnt and s->root.dentry (I did not find a case where only one of
them would be NULL).
* Only allow system root (CAP_SYS_ADMIN in init_user_ns) to call
statmount() on fd's on "unmounted" mounts. We (mostly Pavel) spent some
time thinking about how our previous approach (of checking the opener's
file credentials) caused problems.
Please take a look at the linked pictures they describe everything more
clearly.
Case 1: A fd is on a normal mount (Link to Picture: [5])
Consider, a situation where we have two processes P1 and P2 and a file
F1. F1 is opened on mount ns M1 by P1. P1 is nested inside user
namespace U1 and U2. P2 is also in U1. P2 is also in a pid namespace and
mount namespace separate from M1.
P1 sends F1 to P2 (using a unix socket). But, P2 is unable to call
statmount() on F1 because since it is a separate pid and mount
namespace. This is good and expected.
Case 2: A fd is on a "unmounted" mount (Link to Picture: [6])
Consider a similar situation as Case 1. But now F1 is on a mounted that
has been "unmounted". Now, since we used openers credentials to check
for permissions P2 ends up having the ability call statmount() and get
mount info for this "unmounted" mount.
Hence, It is better to restrict the ability to call statmount() on fds
on "unmounted" mounts to system root only (There could also be other
cases than the one described above).
Changes from v3 [7] to v4:
* Change the string returned when there is no mountpoint to be
"[unmounted]" instead of "[detached]".
* Remove the new DEFINE_FREE put_file and use the one already present in
include/linux/file.h (fput) [8].
* Inside listmount consistently pass 0 in flags to copy_mnt_id_req and
prepare_klistmount()->grab_requested_mnt_ns() and remove flags from the
prepare_klistmount prototype.
* If STATMOUNT_BY_FD is set, check for mnt_ns_id == 0 && mnt_id == 0.
Changes from v2 [9] to v3:
* Rename STATMOUNT_FD flag to STATMOUNT_BY_FD.
* Fixed UAF bug caused by the reference to fd_mount being bound by scope
of CLASS(fd_raw, f)(kreq.fd) by using fget_raw instead.
* Reused @spare parameter in mnt_id_req instead of adding new fields to
the struct.
Changes from v1 [10] to v2:
v1 of this patchset, took a different approach and introduced a new
umount_mnt_ns, to which "unmounted" mounts would be moved to (instead of
their namespace being NULL) thus allowing them to be still available via
statmount.
Introducing umount_mnt_ns complicated namespace locking and modified
performance sensitive code [11] and it was agreed upon that fd-based
statmount would be better.
This code is also available on github [12].
[1]: https://github.com/checkpoint-restore/criu/pull/2754
[2]: https://lore.kernel.org/all/20251118084836.2114503-1-b.sachdev1904@gmail.co…
[3]: https://lore.kernel.org/criu/20251109053921.1320977-2-b.sachdev1904@gmail.c…
[4]: https://lore.kernel.org/all/20251029052037.506273-2-b.sachdev1904@gmail.com/
[5]: https://github.com/bsach64/linux/blob/statmount-fd-v5/fd_on_normal_mount.png
[6]: https://github.com/bsach64/linux/blob/statmount-fd-v5/file_on_unmounted_mou…
[7]: https://lore.kernel.org/all/20251024181443.786363-1-b.sachdev1904@gmail.com/
[8]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/inc…
[9]: https://lore.kernel.org/linux-fsdevel/20251011124753.1820802-1-b.sachdev190…
[10]: https://lore.kernel.org/linux-fsdevel/20251002125422.203598-1-b.sachdev1904…
[11]: https://lore.kernel.org/linux-fsdevel/7e4d9eb5-6dde-4c59-8ee3-358233f082d0@…
[12]: https://github.com/bsach64/linux/tree/statmount-fd-v7
Bhavik Sachdev (3):
statmount: permission check should return EPERM
statmount: accept fd as a parameter
selftests: statmount: tests for STATMOUNT_BY_FD
fs/namespace.c | 102 ++++---
include/uapi/linux/mount.h | 10 +-
.../filesystems/statmount/statmount.h | 15 +-
.../filesystems/statmount/statmount_test.c | 261 +++++++++++++++++-
.../filesystems/statmount/statmount_test_ns.c | 101 ++++++-
5 files changed, 430 insertions(+), 59 deletions(-)
--
2.52.0
This series adds the base support to preserve a VFIO device file across
a Live Update. "Base support" means that this allows userspace to
safetly preserve a VFIO device file with LIVEUPDATE_SESSION_PRESERVE_FD
and retrieve a preserved VFIO device file with
LIVEUPDATE_SESSION_RETRIEVE_FD, but the device itself is not preserved
in a fully running state across Live Update.
This series unblocks 2 parallel but related streams of work:
- iommufd preservation across Live Update. This work spans iommufd,
the IOMMU subsystem, and IOMMU drivers [1]
- Preservation of VFIO device state across Live Update (config space,
BAR addresses, power state, SR-IOV state, etc.). This work spans both
VFIO and the core PCI subsystem.
While we need all of the above to fully preserve a VFIO device across a
Live Update without disrupting the workload on the device, this series
aims to be functional and safe enough to merge as the first incremental
step toward that goal.
Areas for Discussion
--------------------
BDF Stability across Live Update
The PCI support for tracking preserved devices across a Live Update to
prevent auto-probing relies on PCI segment numbers and BDFs remaining
stable. For now I have disallowed VFs, as the BDFs assigned to VFs can
vary depending on how the kernel chooses to allocate bus numbers. For
non-VFs I am wondering if there is any more needed to ensure BDF
stability across Live Update.
While we would like to support many different systems and
configurations in due time (including preserving VFs), I'd like to
keep this first serses constrained to simple use-cases.
FLB Locking
I don't see a way to properly synchronize pci_flb_finish() with
pci_liveupdate_incoming_is_preserved() since the incoming FLB mutex is
dropped by liveupdate_flb_get_incoming() when it returns the pointer
to the object, and taking pci_flb_incoming_lock in pci_flb_finish()
could result in a deadlock due to reversing the lock ordering.
FLB Retrieving
The first patch of this series includes a fix to prevent an FLB from
being retrieved again it is finished. I am wondering if this is the
right approach or if subsystems are expected to stop calling
liveupdate_flb_get_incoming() after an FLB is finished.
Testing
-------
The patches at the end of this series provide comprehensive selftests
for the new code added by this series. The selftests have been validated
in both a VM environment using a virtio-net PCIe device, and in a
baremetal environment on an Intel EMR server with an Intel DSA device.
Here is an example of how to run the new selftests:
vfio_pci_liveupdate_uapi_test:
$ tools/testing/selftests/vfio/scripts/setup.sh 0000:00:04.0
$ tools/testing/selftests/vfio/vfio_pci_liveupdate_uapi_test 0000:00:04.0
$ tools/testing/selftests/vfio/scripts/cleanup.sh
vfio_pci_liveupdate_kexec_test:
$ tools/testing/selftests/vfio/scripts/setup.sh 0000:00:04.0
$ tools/testing/selftests/vfio/vfio_pci_liveupdate_kexec_test --stage 1 0000:00:04.0
$ kexec [...] # NOTE: distro-dependent
$ tools/testing/selftests/vfio/scripts/setup.sh 0000:00:04.0
$ tools/testing/selftests/vfio/vfio_pci_liveupdate_kexec_test --stage 2 0000:00:04.0
$ tools/testing/selftests/vfio/scripts/cleanup.sh
Dependencies
------------
This series was constructed on top of several in-flight series and on
top of mm-nonmm-unstable [2].
+-- This series
|
+-- [PATCH v2 00/18] vfio: selftests: Support for multi-device tests
| https://lore.kernel.org/kvm/20251112192232.442761-1-dmatlack@google.com/
|
+-- [PATCH v3 0/4] vfio: selftests: update DMA mapping tests to use queried IOVA ranges
| https://lore.kernel.org/kvm/20251111-iova-ranges-v3-0-7960244642c5@fb.com/
|
+-- [PATCH v8 0/2] Live Update: File-Lifecycle-Bound (FLB) State
| https://lore.kernel.org/linux-mm/20251125225006.3722394-1-pasha.tatashin@so…
|
+-- [PATCH v8 00/18] Live Update Orchestrator
| https://lore.kernel.org/linux-mm/20251125165850.3389713-1-pasha.tatashin@so…
|
To simplify checking out the code, this series can be found on GitHub:
https://github.com/dmatlack/linux/tree/liveupdate/vfio/cdev/v1
Changelog
---------
v1:
- Rebase series on top of LUOv8 and VFIO selftests improvements
- Drop commits to preserve config space fields across Live Update.
These changes require changes to the PCI layer. For exmaple,
preserving rbars could lead to an inconsistent device state until
device BARs addresses are preserved across Live Update.
- Drop commits to preserve Bus Master Enable on the device. There's no
reason to preserve this until iommufd preservation is fully working.
Furthermore, preserving Bus Master Enable could lead to memory
corruption when the device if the device is bound to the default
identity-map domain after Live Update.
- Drop commits to preserve saved PCI state. This work is not needed
until we are ready to preserve the device's config space, and
requires more thought to make the PCI state data layout ABI-friendly.
- Add support to skip auto-probing devices that are preserved by VFIO
to avoid them getting bound to a different driver by the next kernel.
- Restrict device preservation further (no VFs, no intel-graphics).
- Various refactoring and small edits to improve readability and
eliminate code duplication.
rfc: https://lore.kernel.org/kvm/20251018000713.677779-1-vipinsh@google.com/
Cc: Saeed Mahameed <saeedm(a)nvidia.com>
Cc: Adithya Jayachandran <ajayachandra(a)nvidia.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: Parav Pandit <parav(a)nvidia.com>
Cc: Leon Romanovsky <leonro(a)nvidia.com>
Cc: William Tu <witu(a)nvidia.com>
Cc: Jacob Pan <jacob.pan(a)linux.microsoft.com>
Cc: Lukas Wunner <lukas(a)wunner.de>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: Mike Rapoport <rppt(a)kernel.org>
Cc: Pratyush Yadav <pratyush(a)kernel.org>
Cc: Samiullah Khawaja <skhawaja(a)google.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Josh Hilke <jrhilke(a)google.com>
Cc: David Rientjes <rientjes(a)google.com>
[1] https://lore.kernel.org/linux-iommu/20250928190624.3735830-1-skhawaja@googl…
[2] https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git/log/?h=mm-nonmm…
David Matlack (12):
liveupdate: luo_flb: Prevent retrieve() after finish()
PCI: Add API to track PCI devices preserved across Live Update
PCI: Require driver_override for incoming Live Update preserved
devices
vfio/pci: Notify PCI subsystem about devices preserved across Live
Update
vfio: Enforce preserved devices are retrieved via
LIVEUPDATE_SESSION_RETRIEVE_FD
vfio/pci: Store Live Update state in struct vfio_pci_core_device
vfio: selftests: Add Makefile support for TEST_GEN_PROGS_EXTENDED
vfio: selftests: Add vfio_pci_liveupdate_uapi_test
vfio: selftests: Expose iommu_modes to tests
vfio: selftests: Expose low-level helper routines for setting up
struct vfio_pci_device
vfio: selftests: Verify that opening VFIO device fails during Live
Update
vfio: selftests: Add continuous DMA to vfio_pci_liveupdate_kexec_test
Vipin Sharma (9):
vfio/pci: Register a file handler with Live Update Orchestrator
vfio/pci: Preserve vfio-pci device files across Live Update
vfio/pci: Retrieve preserved device files after Live Update
vfio/pci: Skip reset of preserved device after Live Update
selftests/liveupdate: Move luo_test_utils.* into a reusable library
selftests/liveupdate: Add helpers to preserve/retrieve FDs
vfio: selftests: Build liveupdate library in VFIO selftests
vfio: selftests: Initialize vfio_pci_device using a VFIO cdev FD
vfio: selftests: Add vfio_pci_liveupdate_kexec_test
MAINTAINERS | 1 +
drivers/pci/Makefile | 1 +
drivers/pci/liveupdate.c | 248 ++++++++++++++++
drivers/pci/pci-driver.c | 12 +-
drivers/vfio/device_cdev.c | 25 +-
drivers/vfio/group.c | 9 +
drivers/vfio/pci/Makefile | 1 +
drivers/vfio/pci/vfio_pci.c | 11 +-
drivers/vfio/pci/vfio_pci_core.c | 23 +-
drivers/vfio/pci/vfio_pci_liveupdate.c | 278 ++++++++++++++++++
drivers/vfio/pci/vfio_pci_priv.h | 16 +
drivers/vfio/vfio.h | 13 -
drivers/vfio/vfio_main.c | 22 +-
include/linux/kho/abi/pci.h | 53 ++++
include/linux/kho/abi/vfio_pci.h | 45 +++
include/linux/liveupdate.h | 3 +
include/linux/pci.h | 38 +++
include/linux/vfio.h | 51 ++++
include/linux/vfio_pci_core.h | 7 +
kernel/liveupdate/luo_flb.c | 4 +
tools/testing/selftests/liveupdate/.gitignore | 1 +
tools/testing/selftests/liveupdate/Makefile | 14 +-
.../include/libliveupdate.h} | 11 +-
.../selftests/liveupdate/lib/libliveupdate.mk | 20 ++
.../{luo_test_utils.c => lib/liveupdate.c} | 43 ++-
.../selftests/liveupdate/luo_kexec_simple.c | 2 +-
.../selftests/liveupdate/luo_multi_session.c | 2 +-
tools/testing/selftests/vfio/Makefile | 23 +-
.../vfio/lib/include/libvfio/iommu.h | 2 +
.../lib/include/libvfio/vfio_pci_device.h | 8 +
tools/testing/selftests/vfio/lib/iommu.c | 4 +-
.../selftests/vfio/lib/vfio_pci_device.c | 60 +++-
.../vfio/vfio_pci_liveupdate_kexec_test.c | 255 ++++++++++++++++
.../vfio/vfio_pci_liveupdate_uapi_test.c | 93 ++++++
34 files changed, 1313 insertions(+), 86 deletions(-)
create mode 100644 drivers/pci/liveupdate.c
create mode 100644 drivers/vfio/pci/vfio_pci_liveupdate.c
create mode 100644 include/linux/kho/abi/pci.h
create mode 100644 include/linux/kho/abi/vfio_pci.h
rename tools/testing/selftests/liveupdate/{luo_test_utils.h => lib/include/libliveupdate.h} (80%)
create mode 100644 tools/testing/selftests/liveupdate/lib/libliveupdate.mk
rename tools/testing/selftests/liveupdate/{luo_test_utils.c => lib/liveupdate.c} (89%)
create mode 100644 tools/testing/selftests/vfio/vfio_pci_liveupdate_kexec_test.c
create mode 100644 tools/testing/selftests/vfio/vfio_pci_liveupdate_uapi_test.c
--
2.52.0.487.g5c8c507ade-goog
Hi Linus,
Please pull the following kunit next update for Linux 6.19-rc1.
Makes filter parameters configurable via Kconfig.
Adds description of kunit.enable parameter documentation.
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 3a8660878839faadb4f1a6dd72c3179c1df56787:
Linux 6.18-rc1 (2025-10-12 13:42:36 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux_kselftest-kunit-6.19-rc1
for you to fetch changes up to 7bc16e72ddb993d706f698c2f6cee694e485f557:
kunit: Make filter parameters configurable via Kconfig (2025-11-14 11:02:34 -0700)
----------------------------------------------------------------
linux_kselftest-kunit-6.19-rc1
Makes filter parameters configurable via Kconfig.
Adds description of kunit.enable parameter documentation.
----------------------------------------------------------------
Thomas Weißschuh (1):
kunit: Make filter parameters configurable via Kconfig
Yuya Ishikawa (1):
Documentation: kunit: add description of kunit.enable parameter
Documentation/dev-tools/kunit/run_manual.rst | 6 ++++++
lib/kunit/Kconfig | 24 ++++++++++++++++++++++++
lib/kunit/executor.c | 8 +++++---
3 files changed, 35 insertions(+), 3 deletions(-)
----------------------------------------------------------------
Hi Linus,
Please pull this kselftest next update for Linux 6.19-rc1
Adds basic test for trace_marker_raw file to tracing selftest.
Fixes invalid array access in printf dma_map_benchmark selftest.
Adds tprobe enable/disable testcase to tracing selftest.
Updates fprobe selftest for ftrace based fprobe.
diff is attached.
thanks,
-- Shuah
----------------------------------------------------------------
The following changes since commit 3a8660878839faadb4f1a6dd72c3179c1df56787:
Linux 6.18-rc1 (2025-10-12 13:42:36 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest tags/linux_kselftest-next-6.19-rc1
for you to fetch changes up to a2f7990d330937a204b86b9cafbfef82f87a8693:
selftests: tracing: Update fprobe selftest for ftrace based fprobe (2025-11-19 15:55:14 -0700)
----------------------------------------------------------------
linux_kselftest-next-6.19-rc1
Adds basic test for trace_marker_raw file to tracing selftest.
Fixes invalid array access in printf dma_map_benchmark selftest.
Adds tprobe enable/disable testcase to tracing selftest.
Updates fprobe selftest for ftrace based fprobe.
----------------------------------------------------------------
Brendan Jackman (1):
selftests/run_kselftest.sh: exit with error if tests fail
Masami Hiramatsu (Google) (2):
selftests: tracing: Add tprobe enable/disable testcase
selftests: tracing: Update fprobe selftest for ftrace based fprobe
Steven Rostedt (1):
selftests/tracing: Add basic test for trace_marker_raw file
Zhang Chujun (1):
selftests/dma: fix invalid array access in printf
tools/testing/selftests/dma/dma_map_benchmark.c | 2 +-
.../ftrace/test.d/00basic/trace_marker_raw.tc | 107 +++++++++++++++++++++
.../ftrace/test.d/dynevent/add_remove_fprobe.tc | 18 +---
.../test.d/dynevent/enable_disable_tprobe.tc | 40 ++++++++
tools/testing/selftests/kselftest/runner.sh | 14 ++-
tools/testing/selftests/run_kselftest.sh | 14 +++
6 files changed, 176 insertions(+), 19 deletions(-)
create mode 100644 tools/testing/selftests/ftrace/test.d/00basic/trace_marker_raw.tc
create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/enable_disable_tprobe.tc
----------------------------------------------------------------
Hi,
when trying to build tools/testing/selftests, I get a lot of warnings such as
mount-notify_test.c: In function ‘fanotify_fsmount’:
mount-notify_test.c:360:14: warning: implicit declaration of function ‘fsopen’; did you mean ‘fdopen’?
and subsequent build errors.
testing/selftests/filesystems/mount-notify/mount-notify_test.c:360: undefined reference to `fsopen'
/usr/bin/ld: tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c:363: undefined reference to `fsconfig'
/usr/bin/ld:tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c:366: undefined reference to `fsmount'
/usr/bin/ld: tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c:371: undefined reference to `move_mount'
This does not just affect a single file, but several of them.
What am I missing ? Is there some magic needed to build the selftests ?
Thanks,
Guenter
nolibc currently uses 32-bit types for various APIs. These are
problematic as their reduced value range can lead to truncated values.
Intended for 6.19.
Signed-off-by: Thomas Weißschuh <linux(a)weissschuh.net>
---
Changes in v2:
- Drop already applied ino_t and off_t patches.
- Also handle 'struct timeval'.
- Make the progression of the series a bit clearer.
- Add compatibility assertions.
- Link to v1: https://lore.kernel.org/r/20251029-nolibc-uapi-types-v1-0-e79de3b215d8@weis…
---
Thomas Weißschuh (13):
tools/nolibc/poll: use kernel types for system call invocations
tools/nolibc/poll: drop __NR_poll fallback
tools/nolibc/select: drop non-pselect based implementations
tools/nolibc/time: drop invocation of gettimeofday system call
tools/nolibc: prefer explicit 64-bit time-related system calls
tools/nolibc/gettimeofday: avoid libgcc 64-bit divisions
tools/nolibc/select: avoid libgcc 64-bit multiplications
tools/nolibc: use custom structs timespec and timeval
tools/nolibc: always use 64-bit time types
selftests/nolibc: test compatibility of nolibc and kernel time types
tools/nolibc: remove time conversions
tools/nolibc: add __nolibc_static_assert()
selftests/nolibc: add static assertions around time types handling
tools/include/nolibc/arch-s390.h | 3 +
tools/include/nolibc/compiler.h | 2 +
tools/include/nolibc/poll.h | 14 ++--
tools/include/nolibc/std.h | 2 +-
tools/include/nolibc/sys/select.h | 25 ++-----
tools/include/nolibc/sys/time.h | 6 +-
tools/include/nolibc/sys/timerfd.h | 32 +++------
tools/include/nolibc/time.h | 102 +++++++++------------------
tools/include/nolibc/types.h | 17 ++++-
tools/testing/selftests/nolibc/nolibc-test.c | 27 +++++++
10 files changed, 107 insertions(+), 123 deletions(-)
---
base-commit: 586e8d5137dfcddfccca44c3b992b92d2be79347
change-id: 20251001-nolibc-uapi-types-1c072d10fcc7
Best regards,
--
Thomas Weißschuh <linux(a)weissschuh.net>
The binding file 'keystone.txt' has been converted to a DT schema.
The current binding is located at:
Documentation/devicetree/bindings/arm/ti/ti,keystone.yaml
This change was made in https://lore.kernel.org/all/20250806212824.1635084-1-robh@kernel.org/
and merged in commit '20b3c9a403ee23e57a7e6bf5370ca438c3cd2e99'
Signed-off-by: Soham Metha <sohammetha01(a)gmail.com>
---
Documentation/arch/arm/keystone/overview.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/Documentation/arch/arm/keystone/overview.rst b/Documentation/arch/arm/keystone/overview.rst
index cd90298c493c..bf791b2fc43f 100644
--- a/Documentation/arch/arm/keystone/overview.rst
+++ b/Documentation/arch/arm/keystone/overview.rst
@@ -65,7 +65,7 @@ specified through DTS. Following are the DTS used:
The device tree documentation for the keystone machines are located at
- Documentation/devicetree/bindings/arm/keystone/keystone.txt
+ Documentation/devicetree/bindings/arm/ti/ti,keystone.yaml
Document Author
---------------
--
2.34.1
Fix compilation error in UPROBE_setup caused by pointer type mismatch
in ternary expression. The probed_uretprobe and probed_uprobe function
pointers have different type attributes (__attribute__((nocf_check))),
which causes the conditional operator to fail with:
seccomp_bpf.c:5175:74: error: pointer type mismatch in conditional
expression [-Wincompatible-pointer-types]
Cast both function pointers to 'const void *' to match the expected
parameter type of get_uprobe_offset(), resolving the type mismatch
while preserving the function selection logic.
Signed-off-by: Nirbhay Sharma <nirbhay.lkd(a)gmail.com>
---
tools/testing/selftests/seccomp/seccomp_bpf.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 874f17763536..e13ffe18ef95 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -5172,7 +5172,8 @@ FIXTURE_SETUP(UPROBE)
ASSERT_GE(bit, 0);
}
- offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe);
+ offset = get_uprobe_offset(variant->uretprobe ?
+ (const void *)probed_uretprobe : (const void *)probed_uprobe);
ASSERT_GE(offset, 0);
if (variant->uretprobe)
--
2.48.1
This unintended LRU eviction issue was observed while developing the
selftest for
"[PATCH bpf-next v10 0/8] bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags for percpu maps" [1].
When updating an existing element in lru_hash or lru_percpu_hash maps,
the current implementation calls prealloc_lru_pop() to get a new node
before checking if the key already exists. If the map is full, this
triggers LRU eviction and removes an existing element, even though the
update operation only needs to modify the value in-place.
In the selftest, this was to be worked around by reserving an extra entry to
avoid triggering eviction in __htab_lru_percpu_map_update_elem().
However, the underlying issue remains problematic because:
1. Users may unexpectedly lose entries when updating existing keys in a
full map.
2. The eviction overhead is unnecessary for existing key updates.
This patchset fixes the issue by first checking if the key exists before
allocating a new node. If the key is found, update the value in-place,
refresh the LRU reference, and return immediately without triggering any
eviction. Only proceed with node allocation if the key does not exist.
Links:
[1] https://lore.kernel.org/bpf/20251117162033.6296-1-leon.hwang@linux.dev/
Leon Hwang (3):
bpf: Avoid unintended eviction when updating lru_hash maps
bpf: Avoid unintended eviction when updating lru_percpu_hash maps
selftests/bpf: Add tests to verify no unintended eviction when
updating lru hash maps
kernel/bpf/hashtab.c | 43 +++++++++++
.../selftests/bpf/prog_tests/htab_update.c | 73 +++++++++++++++++++
2 files changed, 116 insertions(+)
--
2.52.0
Hi all,
This patch series introduces improvements to the cgroup selftests by adding helper functions to better handle
asynchronous updates in cgroup statistics. These changes are especially useful for managing cgroup stats like
memory.stat and cgroup.stat, which can be affected by delays (e.g., RCPU callbacks and asynchronous rstat flushing).
v4:
- Patch 1/3: Adds the `cg_read_key_long_poll()` helper to poll cgroup keys with retries and configurable intervals.
- Patch 2/3: Updates `test_memcg_sock()` to use `cg_read_key_long_poll()` for handling delayed "sock" counter updates in memory.stat.
- Patch 3/3: Replaces `sleep` and retry logic in `test_kmem_dead_cgroups()` with `cg_read_key_long_poll()` for waiting on `nr_dying_descendants`.
v3:
- Move `MEMCG_SOCKSTAT_WAIT_*` defines after the `#include` block as suggested.
v2:
- Clarify the rationale for the 3s timeout and mention the periodic rstat flush interval (FLUSH_TIME = 2*HZ) in the comment.
- Replace hardcoded retry count and wait interval with macros to avoid magic numbers and make the timeout calculation explicit.
Thanks to Michal Koutný for the suggestion to introduce the polling helper, and to Lance Yang for the review.
Guopeng Zhang (3):
selftests: cgroup: Add cg_read_key_long_poll() to poll a cgroup key
with retries
selftests: cgroup: make test_memcg_sock robust against delayed sock
stats
selftests: cgroup: Replace sleep with cg_read_key_long_poll() for
waiting on nr_dying_descendants
.../selftests/cgroup/lib/cgroup_util.c | 21 +++++++++++++
.../cgroup/lib/include/cgroup_util.h | 5 +++
tools/testing/selftests/cgroup/test_kmem.c | 31 ++++++++-----------
.../selftests/cgroup/test_memcontrol.c | 20 +++++++++++-
4 files changed, 58 insertions(+), 19 deletions(-)
--
2.25.1
Documentation/networking/switchdev.rst is quite strict on how VLAN
uppers on bridged ports should work:
- with VLAN filtering turned off, the bridge will process all ingress traffic
for the port, except for the traffic tagged with a VLAN ID destined for a
VLAN upper. (...)
- with VLAN filtering turned on, these VLAN devices can be created as long as
the bridge does not have an existing VLAN entry with the same VID on any
bridge port. (...)
This means that VLAN tagged traffic matching a VLAN upper is never
forwarded from that port (unless the VLAN upper itself is bridged).
It does *not* mean that VLAN tagged traffic matching a VLAN upper is not
forwarded to that port anymore, as VLAN uppers only consume ingressing
traffic.
Currently, there is no way to tell dsa drivers that a VLAN on a
bridged port is for a VLAN upper and should not be processed by the
bridge.
Both adding a VLAN to a bridge port of bridge and adding a VLAN upper to
a bridged port of a VLAN-aware bridge will call
dsa_switch_ops::port_vlan_add(), with no way for the driver to know
which is which. In case of VLAN-unaware bridges, there is likely no
dsa_switch_ops::port_vlan_add() call at all for the VLAN upper.
But even if DSA told drivers which type of VLAN this is, most devices
likely would not support configuring forwarding per VLAN per port.
So in order to prevent the configuration of setups with unintended
forwarding between ports:
* deny configuring more than one VLAN upper on bridged ports per VLAN on
VLAN filtering bridges
* deny configuring any VLAN uppers on bridged ports on VLAN non
filtering bridges
* And consequently, disallow disabling filtering as long as there are
any VLAN uppers configured on bridged ports
An alternative solution suggested by switchdev.rst would be to treat
these ports as standalone, and do the filtering/forwarding in software.
But likely DSA supported switches are used on low power devices, where
the performance impact from this would be large.
To verify that this is needed, add appropriate selftests to
no_forwarding to verify either VLAN uppers are denied, or VLAN traffic
is not unexpectedly (still) forwarded.
These test succeed with a veth-backed software bridge, but fail on a b53
device without the DSA changes applied.
While going through the code, I also found one corner case where it was
possible to add bridge VLANs shared with VLAN uppers, while adding
VLAN uppers shared with bridge VLANs was properly denied. This is the
first patch as this seems to be like the least controversial.
Still sent as a RFC/RFT for now due to the potential impact, though a
preliminary test didn't should any failures with
bridge_vlan_{un,}aware.sh and local_termination.sh selftests on
BCM63268.
Also since net-next is closed (though I'm not sure yet if this is net or
net-next material, since this just properly prevents broken setups).
Changes v1 -> v2:
* added selftests for both VLAN-aware and VLAN-unaware bridges
* actually disallow VLAN uppers on VLAN-unware bridges, not disallow
more than one
* fixed the description of VLAN upper notification behaviour of DSA with
filtering disabled
Jonas Gorski (5):
net: dsa: deny bridge VLAN with existing 8021q upper on any port
net: dsa: deny multiple 8021q uppers on bridged ports for the same
VLAN
selftests: no_forwarding: test VLAN uppers on VLAN aware bridged ports
net: dsa: deny 8021q uppers on vlan unaware bridged ports
selftests: no_forwarding: test VLAN uppers on VLAN-unaware bridged
ports
net/dsa/port.c | 23 +---
net/dsa/user.c | 51 ++++++---
.../selftests/net/forwarding/no_forwarding.sh | 107 ++++++++++++++----
3 files changed, 127 insertions(+), 54 deletions(-)
base-commit: 0177f0f07886e54e12c6f18fa58f63e63ddd3c58
--
2.43.0
In commit 0297cdc12a87 ("KVM: selftests: Add option to rseq test to
override /dev/cpu_dma_latency"), a 'break' is missed before the option
'l' in the argument parsing loop, which leads to an unexpected core
dump in atoi_paranoid(). It tries to get the latency from non-existent
argument.
host$ ./rseq_test -u
Random seed: 0x6b8b4567
Segmentation fault (core dumped)
Add a 'break' before the option 'l' in the argument parsing loop to avoid
the unexpected core dump.
Fixes: 0297cdc12a87 ("KVM: selftests: Add option to rseq test to override /dev/cpu_dma_latency")
Cc: stable(a)vger.kernel.org # v6.15+
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
---
tools/testing/selftests/kvm/rseq_test.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/kvm/rseq_test.c b/tools/testing/selftests/kvm/rseq_test.c
index 1375fca80bcdb..f80ad6b47d16b 100644
--- a/tools/testing/selftests/kvm/rseq_test.c
+++ b/tools/testing/selftests/kvm/rseq_test.c
@@ -215,6 +215,7 @@ int main(int argc, char *argv[])
switch (opt) {
case 'u':
skip_sanity_check = true;
+ break;
case 'l':
latency = atoi_paranoid(optarg);
break;
--
2.51.1
Dear all,
This patchset is just a respin of my latest PR to net-next, including all
modifications requested by Jakub and Sabrina.
However, this time I am also adding patches targeting selftest/net/ovpn, as
they come in handy for testing the new features (originally I wanted
them to be a separate PR, but it doesn't indeed make a lot of sense).
This said, since these kselftest patches are quite invasive, I didn't
feel confident with sending them in a PR right away, but I rather wanted
some feedback from Sabrina and Shuah first, if possible.
So here we go.
Once I get some approval on this batch, I'll send then send them all
to net-next again as PRv2.
Thanks a lot!
Regards,
Antonio Quartulli (1):
selftests: ovpn: allow compiling ovpn-cli.c with mbedtls3
Qingfang Deng (1):
ovpn: pktid: use bitops.h API
Ralf Lici (10):
selftests: ovpn: add notification parsing and matching
ovpn: notify userspace on client float event
ovpn: add support for asymmetric peer IDs
selftests: ovpn: check asymmetric peer-id
selftests: ovpn: add test for the FW mark feature
ovpn: consolidate crypto allocations in one chunk
ovpn: use bound device in UDP when available
selftests: ovpn: add test for bound device
ovpn: use bound address in UDP when available
selftests: ovpn: add test for bound address
Sabrina Dubroca (1):
ovpn: use correct array size to parse nested attributes in
ovpn_nl_key_swap_doit
Documentation/netlink/specs/ovpn.yaml | 23 +-
drivers/net/ovpn/crypto_aead.c | 162 +++++++---
drivers/net/ovpn/io.c | 8 +-
drivers/net/ovpn/netlink-gen.c | 13 +-
drivers/net/ovpn/netlink-gen.h | 6 +-
drivers/net/ovpn/netlink.c | 98 +++++-
drivers/net/ovpn/netlink.h | 2 +
drivers/net/ovpn/peer.c | 6 +
drivers/net/ovpn/peer.h | 4 +-
drivers/net/ovpn/pktid.c | 11 +-
drivers/net/ovpn/pktid.h | 2 +-
drivers/net/ovpn/skb.h | 13 +-
drivers/net/ovpn/udp.c | 10 +-
include/uapi/linux/ovpn.h | 2 +
tools/testing/selftests/net/ovpn/Makefile | 17 +-
.../selftests/net/ovpn/check_requirements.py | 37 +++
tools/testing/selftests/net/ovpn/common.sh | 60 +++-
tools/testing/selftests/net/ovpn/data64.key | 6 +-
.../selftests/net/ovpn/json/peer0-float.json | 9 +
.../selftests/net/ovpn/json/peer0.json | 6 +
.../selftests/net/ovpn/json/peer1-float.json | 1 +
.../selftests/net/ovpn/json/peer1.json | 1 +
.../selftests/net/ovpn/json/peer2-float.json | 1 +
.../selftests/net/ovpn/json/peer2.json | 1 +
.../selftests/net/ovpn/json/peer3-float.json | 1 +
.../selftests/net/ovpn/json/peer3.json | 1 +
.../selftests/net/ovpn/json/peer4-float.json | 1 +
.../selftests/net/ovpn/json/peer4.json | 1 +
.../selftests/net/ovpn/json/peer5-float.json | 1 +
.../selftests/net/ovpn/json/peer5.json | 1 +
.../selftests/net/ovpn/json/peer6-float.json | 1 +
.../selftests/net/ovpn/json/peer6.json | 1 +
tools/testing/selftests/net/ovpn/ovpn-cli.c | 281 +++++++++++-------
.../selftests/net/ovpn/requirements.txt | 1 +
.../testing/selftests/net/ovpn/tcp_peers.txt | 11 +-
.../selftests/net/ovpn/test-bind-addr.sh | 10 +
tools/testing/selftests/net/ovpn/test-bind.sh | 117 ++++++++
.../selftests/net/ovpn/test-close-socket.sh | 2 +-
tools/testing/selftests/net/ovpn/test-mark.sh | 81 +++++
tools/testing/selftests/net/ovpn/test.sh | 57 +++-
.../testing/selftests/net/ovpn/udp_peers.txt | 12 +-
41 files changed, 855 insertions(+), 224 deletions(-)
create mode 100755 tools/testing/selftests/net/ovpn/check_requirements.py
create mode 100644 tools/testing/selftests/net/ovpn/json/peer0-float.json
create mode 100644 tools/testing/selftests/net/ovpn/json/peer0.json
create mode 120000 tools/testing/selftests/net/ovpn/json/peer1-float.json
create mode 100644 tools/testing/selftests/net/ovpn/json/peer1.json
create mode 120000 tools/testing/selftests/net/ovpn/json/peer2-float.json
create mode 100644 tools/testing/selftests/net/ovpn/json/peer2.json
create mode 120000 tools/testing/selftests/net/ovpn/json/peer3-float.json
create mode 100644 tools/testing/selftests/net/ovpn/json/peer3.json
create mode 120000 tools/testing/selftests/net/ovpn/json/peer4-float.json
create mode 100644 tools/testing/selftests/net/ovpn/json/peer4.json
create mode 120000 tools/testing/selftests/net/ovpn/json/peer5-float.json
create mode 100644 tools/testing/selftests/net/ovpn/json/peer5.json
create mode 120000 tools/testing/selftests/net/ovpn/json/peer6-float.json
create mode 100644 tools/testing/selftests/net/ovpn/json/peer6.json
create mode 120000 tools/testing/selftests/net/ovpn/requirements.txt
create mode 100755 tools/testing/selftests/net/ovpn/test-bind-addr.sh
create mode 100755 tools/testing/selftests/net/ovpn/test-bind.sh
create mode 100755 tools/testing/selftests/net/ovpn/test-mark.sh
--
2.51.2
Dear friends,
this patch series adds support for nested seccomp listeners. It allows container
runtimes and other sandboxing software to install seccomp listeners on top of
existing ones, which is useful for nested LXC containers and other similar use-cases.
I decided to go with conservative approach and limit the maximum number of nested listeners
to 8 per seccomp filter chain (MAX_LISTENERS_PER_PATH). This is done to avoid dynamic memory
allocations in the very hot __seccomp_filter() function, where we use a preallocated static
array on the stack to track matched listeners. 8 nested listeners should be enough for
almost any practical scenarios.
Expecting potential discussions around this patch series, I'm going to present a talk
at LPC 2025 about the design and implementation details of this feature [1].
Git tree (based on for-next/seccomp):
v2: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v2
current: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners
Changelog for version 2:
- add some explanatory comments
- add RWB tags from Tycho Andersen (thanks, Tycho! ;) )
- CC-ed Aleksa as he might be interested in this stuff too
Links to previous versions:
v1: https://lore.kernel.org/all/20251201122406.105045-1-aleksandr.mikhalitsyn@c…
tree: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v1
Link: https://lpc.events/event/19/contributions/2241/ [1]
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: bpf(a)vger.kernel.org
Cc: Kees Cook <kees(a)kernel.org>
Cc: Andy Lutomirski <luto(a)amacapital.net>
Cc: Will Drewry <wad(a)chromium.org>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Aleksa Sarai <cyphar(a)cyphar.com>
Cc: Tycho Andersen <tycho(a)tycho.pizza>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Stéphane Graber <stgraber(a)stgraber.org>
Alexander Mikhalitsyn (6):
seccomp: remove unused argument from seccomp_do_user_notification
seccomp: prepare seccomp_run_filters() to support more than one
listener
seccomp: limit number of listeners in seccomp tree
seccomp: handle multiple listeners case
seccomp: relax has_duplicate_listeners check
tools/testing/selftests/seccomp: test nested listeners
.../userspace-api/seccomp_filter.rst | 6 +
include/linux/seccomp.h | 3 +-
include/uapi/linux/seccomp.h | 13 +-
kernel/seccomp.c | 116 +++++++++++--
tools/include/uapi/linux/seccomp.h | 13 +-
tools/testing/selftests/seccomp/seccomp_bpf.c | 162 ++++++++++++++++++
6 files changed, 286 insertions(+), 27 deletions(-)
--
2.43.0
This patchset introduces target resume capability to netconsole allowing
it to recover targets when underlying low-level interface comes back
online.
The patchset starts by refactoring netconsole state representation in
order to allow representing deactivated targets (targets that are
disabled due to interfaces unregister).
It then modifies netconsole to handle NETDEV_REGISTER events for such
targets, setups netpoll and forces the device UP. Targets are matched with
incoming interfaces depending on how they were bound in netconsole
(by mac or interface name). For these reasons, we also attempt resuming
on NETDEV_CHANGENAME.
The patchset includes a selftest that validates netconsole target state
transitions and that target is functional after resumed.
Signed-off-by: Andre Carvalho <asantostc(a)gmail.com>
---
Changes in v8:
- Handle NETDEV_REGISTER/CHANGENAME instead of NETDEV_UP (and force the device
UP), to increase the chances of succesfully resuming a target. This
requires using a workqueue instead of inline in the event notifier as
we can't UP the device otherwise.
- Link to v7: https://lore.kernel.org/r/20251126-netcons-retrigger-v7-0-1d86dba83b1c@gmai…
Changes in v7:
- selftest: use ${EXIT_STATUS} instead of ${ksft_pass} to avoid
shellcheck warning
- Link to v6: https://lore.kernel.org/r/20251121-netcons-retrigger-v6-0-9c03f5a2bd6f@gmai…
Changes in v6:
- Rebase on top of net-next to resolve conflicts, no functional changes.
- Link to v5: https://lore.kernel.org/r/20251119-netcons-retrigger-v5-0-2c7dda6055d6@gmai…
Changes in v5:
- patch 3: Set (de)enslaved target as DISABLED instead of DEACTIVATED to prevent
resuming it.
- selftest: Fix test cleanup by moving trap line to outside of loop and remove
unneeded 'local' keyword
- Rename maybe_resume_target to resume_target, add netconsole_ prefix to
process_resumable_targets.
- Hold device reference before calling __netpoll_setup.
- Link to v4: https://lore.kernel.org/r/20251116-netcons-retrigger-v4-0-5290b5f140c2@gmai…
Changes in v4:
- Simplify selftest cleanup, removing trap setup in loop.
- Drop netpoll helper (__setup_netpoll_hold) and manage reference inside
netconsole.
- Move resume_list processing logic to separate function.
- Link to v3: https://lore.kernel.org/r/20251109-netcons-retrigger-v3-0-1654c280bbe6@gmai…
Changes in v3:
- Resume by mac or interface name depending on how target was created.
- Attempt to resume target without holding target list lock, by moving
the target to a temporary list. This is required as netpoll may
attempt to allocate memory.
- Link to v2: https://lore.kernel.org/r/20250921-netcons-retrigger-v2-0-a0e84006237f@gmai…
Changes in v2:
- Attempt to resume target in the same thread, instead of using
workqueue .
- Add wrapper around __netpoll_setup (patch 4).
- Renamed resume_target to maybe_resume_target and moved conditionals to
inside its implementation, keeping code more clear.
- Verify that device addr matches target mac address when target was
setup using mac.
- Update selftest to cover targets bound by mac and interface name.
- Fix typo in selftest comment and sort tests alphabetically in
Makefile.
- Link to v1:
https://lore.kernel.org/r/20250909-netcons-retrigger-v1-0-3aea904926cf@gmai…
---
Andre Carvalho (3):
netconsole: convert 'enabled' flag to enum for clearer state management
netconsole: resume previously deactivated target
selftests: netconsole: validate target resume
Breno Leitao (2):
netconsole: add target_state enum
netconsole: add STATE_DEACTIVATED to track targets disabled by low level
drivers/net/netconsole.c | 171 +++++++++++++++++----
tools/testing/selftests/drivers/net/Makefile | 1 +
.../selftests/drivers/net/lib/sh/lib_netcons.sh | 35 ++++-
.../selftests/drivers/net/netcons_resume.sh | 97 ++++++++++++
4 files changed, 270 insertions(+), 34 deletions(-)
---
base-commit: ed01d2069e8b40eb283050b7119c25a67542a585
change-id: 20250816-netcons-retrigger-a4f547bfc867
Best regards,
--
Andre Carvalho <asantostc(a)gmail.com>
When replying to a ICMPv6 echo request that comes from localhost address
the right output ifindex is 1 (lo) and not rt6i_idev dev index. Use the
skb device ifindex instead. This fixes pinging to a local address from
localhost source address.
$ ping6 -I ::1 2001:1:1::2 -c 3
PING 2001:1:1::2 (2001:1:1::2) from ::1 : 56 data bytes
64 bytes from 2001:1:1::2: icmp_seq=1 ttl=64 time=0.037 ms
64 bytes from 2001:1:1::2: icmp_seq=2 ttl=64 time=0.069 ms
64 bytes from 2001:1:1::2: icmp_seq=3 ttl=64 time=0.122 ms
2001:1:1::2 ping statistics
3 packets transmitted, 3 received, 0% packet loss, time 2032ms
rtt min/avg/max/mdev = 0.037/0.076/0.122/0.035 ms
Fixes: 1b70d792cf67 ("ipv6: Use rt6i_idev index for echo replies to a local address")
Signed-off-by: Fernando Fernandez Mancera <fmancera(a)suse.de>
---
net/ipv6/icmp.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 5d2f90babaa5..5de254043133 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -965,7 +965,9 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
fl6.saddr = *saddr;
- fl6.flowi6_oif = icmp6_iif(skb);
+ fl6.flowi6_oif = ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LOOPBACK ?
+ skb->dev->ifindex :
+ icmp6_iif(skb);
fl6.fl6_icmp_type = type;
fl6.flowi6_mark = mark;
fl6.flowi6_uid = sock_net_uid(net, NULL);
--
2.51.1
Hi,
This series fixes issues in the devlink_rate_tc_bw.py selftest and
introduces a new Iperf3Runner that helps with measurement handling.
Thanks,
Carolina
V3:
- Replace generic Exception with specific exception types in load.py.
V2:
- Insert the test in the correct sorted position.
Carolina Jubran (6):
selftests: drv-net: Add devlink_rate_tc_bw.py to TEST_PROGS
selftests: drv-net: introduce Iperf3Runner for measurement use cases
selftests: drv-net: Use Iperf3Runner in devlink_rate_tc_bw.py
selftests: drv-net: Set shell=True for sysfs writes in
devlink_rate_tc_bw.py
selftests: drv-net: Fix and clarify TC bandwidth split in
devlink_rate_tc_bw.py
selftests: drv-net: Fix tolerance calculation in devlink_rate_tc_bw.py
.../testing/selftests/drivers/net/hw/Makefile | 1 +
.../drivers/net/hw/devlink_rate_tc_bw.py | 174 ++++++++----------
.../drivers/net/hw/lib/py/__init__.py | 5 +-
.../selftests/drivers/net/lib/py/__init__.py | 5 +-
.../selftests/drivers/net/lib/py/load.py | 84 ++++++++-
5 files changed, 157 insertions(+), 112 deletions(-)
--
2.38.1
We'll need to do a lot more feature handling to test HW-GRO and LRO.
Clean up the feature handling for SW GRO a bit to let the next commit
focus on the new test cases, only.
Make sure HW GRO-like features are not enabled for the SW tests.
Be more careful about changing features as "nothing changed"
situations may result in non-zero error code from ethtool.
Don't disable TSO on the local interface (receiver) when running over
netdevsim, we just want GSO to break up the segments on the sender.
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
CC: shuah(a)kernel.org
CC: linux-kselftest(a)vger.kernel.org
---
tools/testing/selftests/drivers/net/gro.py | 38 ++++++++++++++++++++--
1 file changed, 35 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/gro.py b/tools/testing/selftests/drivers/net/gro.py
index ba83713bf7b5..6d633bdc7e67 100755
--- a/tools/testing/selftests/drivers/net/gro.py
+++ b/tools/testing/selftests/drivers/net/gro.py
@@ -20,7 +20,7 @@ coalescing behavior.
import os
from lib.py import ksft_run, ksft_exit, ksft_pr
from lib.py import NetDrvEpEnv, KsftXfailEx
-from lib.py import cmd, defer, bkg, ip
+from lib.py import cmd, defer, bkg, ethtool, ip
from lib.py import ksft_variants
@@ -70,6 +70,27 @@ from lib.py import ksft_variants
defer(ip, f"link set dev {dev['ifname']} mtu {dev['mtu']}", host=host)
+def _set_ethtool_feat(dev, current, feats, host=None):
+ s2n = {True: "on", False: "off"}
+
+ new = ["-K", dev]
+ old = ["-K", dev]
+ no_change = True
+ for name, state in feats.items():
+ new += [name, s2n[state]]
+ old += [name, s2n[not state]]
+
+ if current[name]["active"] != state:
+ no_change = False
+ if current[name]["fixed"]:
+ raise KsftXfailEx(f"Device does not support {name}")
+ if no_change:
+ return
+
+ ethtool(" ".join(new), host=host)
+ defer(ethtool, " ".join(old), host=host)
+
+
def _setup(cfg, test_name):
""" Setup hardware loopback mode for GRO testing. """
@@ -77,6 +98,11 @@ from lib.py import ksft_variants
cfg.bin_local = cfg.test_dir / "gro"
cfg.bin_remote = cfg.remote.deploy(cfg.bin_local)
+ if not hasattr(cfg, "feat"):
+ cfg.feat = ethtool(f"-k {cfg.ifname}", json=True)[0]
+ cfg.remote_feat = ethtool(f"-k {cfg.remote_ifname}",
+ host=cfg.remote, json=True)[0]
+
# "large" test needs at least 4k MTU
if test_name == "large":
_set_mtu_restore(cfg.dev, 4096, None)
@@ -88,15 +114,21 @@ from lib.py import ksft_variants
_write_defer_restore(cfg, flush_path, "200000", defer_undo=True)
_write_defer_restore(cfg, irq_path, "10", defer_undo=True)
+ _set_ethtool_feat(cfg.ifname, cfg.feat,
+ {"generic-receive-offload": True,
+ "rx-gro-hw": False,
+ "large-receive-offload": False})
+
try:
# Disable TSO for local tests
cfg.require_nsim() # will raise KsftXfailEx if not running on nsim
- cmd(f"ethtool -K {cfg.ifname} gro on tso off")
- cmd(f"ethtool -K {cfg.remote_ifname} gro on tso off", host=cfg.remote)
+ _set_ethtool_feat(cfg.remote_ifname, cfg.remote_feat, {"tso": False},
+ host=cfg.remote)
except KsftXfailEx:
pass
+
def _gro_variants():
"""Generator that yields all combinations of protocol and test types."""
--
2.51.1
Following up on the old discussion [1]. Let the BaseExceptions out of
defer()'ed cleanup. And handle it in the main loop. This allows us to
exit the tests if user hit Ctrl-C during defer().
Link: https://lore.kernel.org/20251119063228.3adfd743@kernel.org # [1]
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
CC: shuah(a)kernel.org
CC: willemb(a)google.com
CC: petrm(a)nvidia.com
CC: linux-kselftest(a)vger.kernel.org
---
tools/testing/selftests/net/lib/py/ksft.py | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py
index ebd82940ee50..531e7fa1b3ea 100644
--- a/tools/testing/selftests/net/lib/py/ksft.py
+++ b/tools/testing/selftests/net/lib/py/ksft.py
@@ -163,7 +163,7 @@ KSFT_DISRUPTIVE = True
entry = global_defer_queue.pop()
try:
entry.exec_only()
- except BaseException:
+ except Exception:
ksft_pr(f"Exception while handling defer / cleanup (callback {i} of {qlen_start})!")
tb = traceback.format_exc()
for line in tb.strip().split('\n'):
@@ -333,7 +333,21 @@ KsftCaseFunction = namedtuple("KsftCaseFunction",
KSFT_RESULT = False
cnt_key = 'fail'
- ksft_flush_defer()
+ try:
+ ksft_flush_defer()
+ except BaseException as e:
+ tb = traceback.format_exc()
+ for line in tb.strip().split('\n'):
+ ksft_pr("Exception|", line)
+ if isinstance(e, KeyboardInterrupt):
+ ksft_pr()
+ ksft_pr("WARN: defer() interrupted, cleanup may be incomplete.")
+ ksft_pr(" Attempting to finish cleanup before exiting.")
+ ksft_pr(" Interrupt again to exit immediately.")
+ ksft_pr()
+ stop = True
+ # Flush was interrupted, try to finish the job best we can
+ ksft_flush_defer()
if not cnt_key:
cnt_key = 'pass' if KSFT_RESULT else 'fail'
--
2.51.1
This removes some noise that can be distracting while looking at
selftests by redirecting socat stderr to /dev/null.
Before this commit, netcons_basic would output:
Running with target mode: basic (ipv6)
2025/11/29 12:08:03 socat[259] W exiting on signal 15
2025/11/29 12:08:03 socat[271] W exiting on signal 15
basic : ipv6 : Test passed
Running with target mode: basic (ipv4)
2025/11/29 12:08:05 socat[329] W exiting on signal 15
2025/11/29 12:08:05 socat[322] W exiting on signal 15
basic : ipv4 : Test passed
Running with target mode: extended (ipv6)
2025/11/29 12:08:08 socat[386] W exiting on signal 15
2025/11/29 12:08:08 socat[386] W exiting on signal 15
2025/11/29 12:08:08 socat[380] W exiting on signal 15
extended : ipv6 : Test passed
Running with target mode: extended (ipv4)
2025/11/29 12:08:10 socat[440] W exiting on signal 15
2025/11/29 12:08:10 socat[435] W exiting on signal 15
2025/11/29 12:08:10 socat[435] W exiting on signal 15
extended : ipv4 : Test passed
After these changes, output looks like:
Running with target mode: basic (ipv6)
basic : ipv6 : Test passed
Running with target mode: basic (ipv4)
basic : ipv4 : Test passed
Running with target mode: extended (ipv6)
extended : ipv6 : Test passed
Running with target mode: extended (ipv4)
extended : ipv4 : Test passed
Signed-off-by: Andre Carvalho <asantostc(a)gmail.com>
---
tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
index 87f89fd92f8c..ae8abff4be40 100644
--- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
+++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
@@ -249,7 +249,7 @@ function listen_port_and_save_to() {
# Just wait for 2 seconds
timeout 2 ip netns exec "${NAMESPACE}" \
- socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}"
+ socat "${SOCAT_MODE}":"${PORT}",fork "${OUTPUT}" 2> /dev/null
}
# Only validate that the message arrived properly
---
base-commit: ff736a286116d462a4067ba258fa351bc0b4ed80
change-id: 20251129-netcons-socat-noise-00e7ccf29560
Best regards,
--
Andre Carvalho <asantostc(a)gmail.com>
New NIPA installation had been reporting a few flaky tests.
arp_ndisc_evict_nocarrier is most flaky of them all.
I suspect that the flakiness is due to udev swapping the MAC
addresses on the interfaces. Extend the message in
arp_ndisc_evict_nocarrier to hint at this potential issue.
Having the neigh get fail right after ping is rather unusual,
unless udev changes the MAC addr causing a flush in the meantime.
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
CC: shuah(a)kernel.org
CC: linux-kselftest(a)vger.kernel.org
---
tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh
index 92eb880c52f2..00758f00efbf 100755
--- a/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh
+++ b/tools/testing/selftests/net/arp_ndisc_evict_nocarrier.sh
@@ -75,7 +75,7 @@ setup_v4() {
ip neigh get $V4_ADDR1 dev veth0 >/dev/null 2>&1
if [ $? -ne 0 ]; then
cleanup_v4
- echo "failed"
+ echo "failed; is the system using MACAddressPolicy=persistent ?"
exit 1
fi
--
2.51.1
If PAGE_SIZE is larger than 4k and if you have a system with a
large number of CPUs, this test can require a very large amount
of memory leading to oom-killer firing. Given the type of allocation,
the kernel won't have anything to kill, causing the system to
stall. Add a parameter to the test_vmalloc driver to represent the
number of times a percpu object will be allocated. Calculate this
in test_vmalloc.sh to be 90% of available memory or the current
default of 35000, whichever is smaller.
Signed-off-by: Audra Mitchell <audra(a)redhat.com>
---
lib/test_vmalloc.c | 11 +++++---
tools/testing/selftests/mm/test_vmalloc.sh | 31 +++++++++++++++++++---
2 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index 2815658ccc37..67e53cd6b619 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -57,6 +57,9 @@ __param(int, run_test_mask, 7,
/* Add a new test case description here. */
);
+__param(int, nr_pcpu_objects, 35000,
+ "Number of pcpu objects to allocate for pcpu_alloc_test");
+
/*
* This is for synchronization of setup phase.
*/
@@ -292,24 +295,24 @@ pcpu_alloc_test(void)
size_t size, align;
int i;
- pcpu = vmalloc(sizeof(void __percpu *) * 35000);
+ pcpu = vmalloc(sizeof(void __percpu *) * nr_pcpu_objects);
if (!pcpu)
return -1;
- for (i = 0; i < 35000; i++) {
+ for (i = 0; i < nr_pcpu_objects; i++) {
size = get_random_u32_inclusive(1, PAGE_SIZE / 4);
/*
* Maximum PAGE_SIZE
*/
- align = 1 << get_random_u32_inclusive(1, 11);
+ align = 1 << get_random_u32_inclusive(1, PAGE_SHIFT - 1);
pcpu[i] = __alloc_percpu(size, align);
if (!pcpu[i])
rv = -1;
}
- for (i = 0; i < 35000; i++)
+ for (i = 0; i < nr_pcpu_objects; i++)
free_percpu(pcpu[i]);
vfree(pcpu);
diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
index d39096723fca..b23d705bf570 100755
--- a/tools/testing/selftests/mm/test_vmalloc.sh
+++ b/tools/testing/selftests/mm/test_vmalloc.sh
@@ -13,6 +13,9 @@ TEST_NAME="vmalloc"
DRIVER="test_${TEST_NAME}"
NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
+# Default number of times we allocate percpu objects:
+NR_PCPU_OBJECTS=35000
+
# 1 if fails
exitcode=1
@@ -27,6 +30,8 @@ PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
+PCPU_OBJ_PARAM="nr_pcpu_objects=$NR_PCPU_OBJECTS"
+
check_test_requirements()
{
uid=$(id -u)
@@ -47,12 +52,30 @@ check_test_requirements()
fi
}
+check_memory_requirement()
+{
+ # The pcpu_alloc_test allocates nr_pcpu_objects per cpu. If the
+ # PAGE_SIZE is on the larger side it is easier to set a value
+ # that can cause oom events during testing. Since we are
+ # testing the functionality of vmalloc and not the oom-killer,
+ # calculate what is 90% of available memory and divide it by
+ # the number of online CPUs.
+ pages=$(($(getconf _AVPHYS_PAGES) * 90 / 100 / $NUM_CPUS))
+
+ if (($pages < $NR_PCPU_OBJECTS)); then
+ echo "Updated nr_pcpu_objects to 90% of available memory."
+ echo "nr_pcpu_objects is now set to: $pages."
+ PCPU_OBJ_PARAM="nr_pcpu_objects=$pages"
+ fi
+}
+
run_performance_check()
{
echo "Run performance tests to evaluate how fast vmalloc allocation is."
echo "It runs all test cases on one single CPU with sequential order."
- modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+ check_memory_requirement
+ modprobe $DRIVER $PERF_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1
echo "Done."
echo "Check the kernel message buffer to see the summary."
}
@@ -63,7 +86,8 @@ run_stability_check()
echo "available test cases are run by NUM_CPUS workers simultaneously."
echo "It will take time, so be patient."
- modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+ check_memory_requirement
+ modprobe $DRIVER $STRESS_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1
echo "Done."
echo "Check the kernel ring buffer to see the summary."
}
@@ -74,7 +98,8 @@ run_smoke_check()
echo "Please check $0 output how it can be used"
echo "for deep performance analysis as well as stress testing."
- modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+ check_memory_requirement
+ modprobe $DRIVER $SMOKE_PARAM $PCPU_OBJ_PARAM > /dev/null 2>&1
echo "Done."
echo "Check the kernel ring buffer to see the summary."
}
--
2.51.0
Dear friends,
this patch series adds support for nested seccomp listeners. It allows container
runtimes and other sandboxing software to install seccomp listeners on top of
existing ones, which is useful for nested LXC containers and other similar use-cases.
I decided to go with conservative approach and limit the maximum number of nested listeners
to 8 per seccomp filter chain (MAX_LISTENERS_PER_PATH). This is done to avoid dynamic memory
allocations in the very hot __seccomp_filter() function, where we use a preallocated static
array on the stack to track matched listeners. 8 nested listeners should be enough for
almost any practical scenarios.
Expecting potential discussions around this patch series, I'm going to present a talk
at LPC 2025 about the design and implementation details of this feature [1].
Git tree (based on for-next/seccomp):
v1: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners.v1
current: https://github.com/mihalicyn/linux/commits/seccomp.mult.listeners
Link: https://lpc.events/event/19/contributions/2241/ [1]
Cc: linux-doc(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: bpf(a)vger.kernel.org
Cc: Kees Cook <kees(a)kernel.org>
Cc: Andy Lutomirski <luto(a)amacapital.net>
Cc: Will Drewry <wad(a)chromium.org>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Tycho Andersen <tycho(a)tycho.pizza>
Cc: Andrei Vagin <avagin(a)gmail.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Stéphane Graber <stgraber(a)stgraber.org>
Alexander Mikhalitsyn (6):
seccomp: remove unused argument from seccomp_do_user_notification
seccomp: prepare seccomp_run_filters() to support more than one
listener
seccomp: limit number of listeners in seccomp tree
seccomp: handle multiple listeners case
seccomp: relax has_duplicate_listeners check
tools/testing/selftests/seccomp: test nested listeners
.../userspace-api/seccomp_filter.rst | 6 +
include/linux/seccomp.h | 3 +-
include/uapi/linux/seccomp.h | 13 +-
kernel/seccomp.c | 99 +++++++++--
tools/include/uapi/linux/seccomp.h | 13 +-
tools/testing/selftests/seccomp/seccomp_bpf.c | 162 ++++++++++++++++++
6 files changed, 269 insertions(+), 27 deletions(-)
--
2.43.0
Currently, x86, Riscv, Loongarch use the Generic Entry which makes
maintainers' work easier and codes more elegant. arm64 has already
successfully switched to the Generic IRQ Entry in commit
b3cf07851b6c ("arm64: entry: Switch to generic IRQ entry"), it is
time to completely convert arm64 to Generic Entry.
The goal is to bring arm64 in line with other architectures that already
use the generic entry infrastructure, reducing duplicated code and
making it easier to share future changes in entry/exit paths, such as
"Syscall User Dispatch".
This patch set is rebased on v6.18-rc6.
The performance benchmarks from perf bench basic syscall on
real hardware are below:
| Metric | W/O Generic Framework | With Generic Framework | Change |
| ---------- | --------------------- | ---------------------- | ------ |
| Total time | 2.813 [sec] | 2.930 [sec] | ↑4% |
| usecs/op | 0.281349 | 0.293006 | ↑4% |
| ops/sec | 3,554,299 | 3,412,894 | ↓4% |
Compared to earlier with arch specific handling, the performance decreased
by approximately 4%.
It was tested ok with following test cases on QEMU virt platform:
- Perf tests.
- Different `dynamic preempt` mode switch.
- Pseudo NMI tests.
- Stress-ng CPU stress test.
- Hackbench stress test.
- MTE test case in Documentation/arch/arm64/memory-tagging-extension.rst
and all test cases in tools/testing/selftests/arm64/mte/*.
- "sud" selftest testcase.
- get_set_sud, get_syscall_info, set_syscall_info, peeksiginfo
in tools/testing/selftests/ptrace.
- breakpoint_test_arm64 in selftests/breakpoints.
- syscall-abi and ptrace in tools/testing/selftests/arm64/abi
- fp-ptrace, sve-ptrace, za-ptrace in selftests/arm64/fp.
- vdso_test_getrandom in tools/testing/selftests/vDSO
- Strace tests.
The test QEMU configuration is as follows:
qemu-system-aarch64 \
-M virt,gic-version=3,virtualization=on,mte=on \
cpu max,pauth-impdef=on \
kernel Image \
smp 8,sockets=1,cores=4,threads=2 \
m 512m \
nographic \
no-reboot \
device virtio-rng-pci \
append "root=/dev/vda rw console=ttyAMA0 kgdboc=ttyAMA0,115200 \
earlycon preempt=voluntary irqchip.gicv3_pseudo_nmi=1" \
drive if=none,file=images/rootfs.ext4,format=raw,id=hd0 \
device virtio-blk-device,drive=hd0 \
Changes in v8:
- Rename "report_syscall_enter()" to "report_syscall_entry()".
- Add ptrace_save_reg() to avoid duplication.
- Remove unused _TIF_WORK_MASK in a standalone patch.
- Align syscall_trace_enter() return value with the generic version.
- Use "scno" instead of regs->syscallno in el0_svc_common().
- Move rseq_syscall() ahead in a standalone patch to clarify it clearly.
- Rename "syscall_trace_exit()" to "syscall_exit_work()".
- Keep the goto in el0_svc_common().
- No argument was passed to __secure_computing() and check -1 not -1L.
- Remove "Add has_syscall_work() helper" patch.
- Move "Add syscall_exit_to_user_mode_prepare() helper" patch later.
- Add miss header for asm/entry-common.h.
- Update the implementation of arch_syscall_is_vdso_sigreturn().
- Add "ARCH_SYSCALL_WORK_EXIT" to be defined as "SECCOMP | SYSCALL_EMU"
to keep the behaviour unchanged.
- Add more testcases test.
- Add Reviewed-by.
- Update the commit message.
- Link to v7: https://lore.kernel.org/all/20251117133048.53182-1-ruanjinjie@huawei.com/
Chanegs in v7:
- Support "Syscall User Dispatch" by implementing
arch_syscall_is_vdso_sigreturn() as kemal suggested.
- Add aarch64 support for "sud" selftest testcase, which tested ok with
the patch series.
- Fix the kernel test robot warning for arch_ptrace_report_syscall_entry()
and arch_ptrace_report_syscall_exit() in asm/entry-common.h.
- Add perf syscall performance test.
- Link to v6: https://lore.kernel.org/all/20250916082611.2972008-1-ruanjinjie@huawei.com/
Changes in v6:
- Rebased on v6.17-rc5-next as arm64 generic irq entry has merged.
- Update the commit message.
- Link to v5: https://lore.kernel.org/all/20241206101744.4161990-1-ruanjinjie@huawei.com/
Changes in v5:
- Not change arm32 and keep inerrupts_enabled() macro for gicv3 driver.
- Move irqentry_state definition into arch/arm64/kernel/entry-common.c.
- Avoid removing the __enter_from_*() and __exit_to_*() wrappers.
- Update "irqentry_state_t ret/irq_state" to "state"
to keep it consistently.
- Use generic irq entry header for PREEMPT_DYNAMIC after split
the generic entry.
- Also refactor the ARM64 syscall code.
- Introduce arch_ptrace_report_syscall_entry/exit(), instead of
arch_pre/post_report_syscall_entry/exit() to simplify code.
- Make the syscall patches clear separation.
- Update the commit message.
- Link to v4: https://lore.kernel.org/all/20241025100700.3714552-1-ruanjinjie@huawei.com/
Changes in v4:
- Rework/cleanup split into a few patches as Mark suggested.
- Replace interrupts_enabled() macro with regs_irqs_disabled(), instead
of left it here.
- Remove rcu and lockdep state in pt_regs by using temporary
irqentry_state_t as Mark suggested.
- Remove some unnecessary intermediate functions to make it clear.
- Rework preempt irq and PREEMPT_DYNAMIC code
to make the switch more clear.
- arch_prepare_*_entry/exit() -> arch_pre_*_entry/exit().
- Expand the arch functions comment.
- Make arch functions closer to its caller.
- Declare saved_reg in for block.
- Remove arch_exit_to_kernel_mode_prepare(), arch_enter_from_kernel_mode().
- Adjust "Add few arch functions to use generic entry" patch to be
the penultimate.
- Update the commit message.
- Add suggested-by.
- Link to v3: https://lore.kernel.org/all/20240629085601.470241-1-ruanjinjie@huawei.com/
Changes in v3:
- Test the MTE test cases.
- Handle forget_syscall() in arch_post_report_syscall_entry()
- Make the arch funcs not use __weak as Thomas suggested, so move
the arch funcs to entry-common.h, and make arch_forget_syscall() folded
in arch_post_report_syscall_entry() as suggested.
- Move report_single_step() to thread_info.h for arm64
- Change __always_inline() to inline, add inline for the other arch funcs.
- Remove unused signal.h for entry-common.h.
- Add Suggested-by.
- Update the commit message.
Changes in v2:
- Add tested-by.
- Fix a bug that not call arch_post_report_syscall_entry() in
syscall_trace_enter() if ptrace_report_syscall_entry() return not zero.
- Refactor report_syscall().
- Add comment for arch_prepare_report_syscall_exit().
- Adjust entry-common.h header file inclusion to alphabetical order.
- Update the commit message.
Jinjie Ruan (11):
arm64: Remove unused _TIF_WORK_MASK
arm64/ptrace: Split report_syscall()
arm64/ptrace: Refactor syscall_trace_enter/exit()
arm64: ptrace: Move rseq_syscall() before audit_syscall_exit()
arm64: syscall: Rework el0_svc_common()
arm64/ptrace: Return early for ptrace_report_syscall_entry() error
arm64/ptrace: Expand secure_computing() in place
arm64/ptrace: Use syscall_get_arguments() heleper
entry: Split syscall_exit_to_user_mode_work() for arch reuse
entry: Add arch_ptrace_report_syscall_entry/exit()
arm64: entry: Convert to generic entry
kemal (1):
selftests: sud_test: Support aarch64
arch/arm64/Kconfig | 2 +-
arch/arm64/include/asm/entry-common.h | 76 ++++++++++++++++
arch/arm64/include/asm/syscall.h | 21 ++++-
arch/arm64/include/asm/thread_info.h | 22 +----
arch/arm64/kernel/debug-monitors.c | 7 ++
arch/arm64/kernel/ptrace.c | 90 -------------------
arch/arm64/kernel/signal.c | 2 +-
arch/arm64/kernel/syscall.c | 25 ++----
include/linux/entry-common.h | 35 +++++---
kernel/entry/syscall-common.c | 43 ++++++++-
.../syscall_user_dispatch/sud_test.c | 4 +
11 files changed, 179 insertions(+), 148 deletions(-)
--
2.34.1
From: "Mike Rapoport (Microsoft)" <rppt(a)kernel.org>
Hi,
These patches allow guest_memfd to notify userspace about minor page
faults using userfaultfd and let userspace to resolve these page faults
using UFFDIO_CONTINUE.
To allow UFFDIO_CONTINUE outside of the core mm I added a get_shmem_folio()
callback to vm_ops that allows an address space backing a VMA to return a
folio that exists in it's page cache (patch 2)
In order for guest_memfd to notify userspace about page faults, there is a
new VM_FAULT_UFFD_MINOR that a ->fault() handler can return to inform the
page fault handler that it needs to call handle_userfault() to complete the
fault (patch 3).
Patch 4 plumbs these new goodies into guest_memfd.
This series is the minimal change I've been able to come up with to allow
integration of guest_memfd with uffd and while refactoring uffd and making
mfill_atomic() flow more linear would have been a nice improvement, it's
way out of the scope of enabling uffd with guest_memfd.
v2 changes:
* rename ->get_shared_folio() to ->get_folio()
* hardwire VM_FAULF_UFFD_MINOR to 0 when CONFIG_USERFAULTFD=n
v1: https://patch.msgid.link/20251123102707.559422-1-rppt@kernel.org
* Introduce VM_FAULF_UFFD_MINOR to avoid exporting handle_userfault()
* Simplify vma_can_mfill_atomic()
* Rename get_pagecache_folio() to get_shared_folio() and use inode
instead of vma as its argument
rfc: https://patch.msgid.link/20251117114631.2029447-1-rppt@kernel.org
Mike Rapoport (Microsoft) (4):
userfaultfd: move vma_can_userfault out of line
userfaultfd, shmem: use a VMA callback to handle UFFDIO_CONTINUE
mm: introduce VM_FAULT_UFFD_MINOR fault reason
guest_memfd: add support for userfaultfd minor mode
Nikita Kalyazin (1):
KVM: selftests: test userfaultfd minor for guest_memfd
include/linux/mm.h | 9 ++
include/linux/mm_types.h | 10 +-
include/linux/userfaultfd_k.h | 36 +-----
mm/memory.c | 2 +
mm/shmem.c | 20 +++-
mm/userfaultfd.c | 80 +++++++++++---
.../testing/selftests/kvm/guest_memfd_test.c | 103 ++++++++++++++++++
virt/kvm/guest_memfd.c | 28 +++++
8 files changed, 236 insertions(+), 52 deletions(-)
base-commit: 6a23ae0a96a600d1d12557add110e0bb6e32730c
--
2.50.1
Hello,
this is a (late) v2 to my first attempt to convert the test_tc_edt
script to test_progs. This new version is way simpler, thanks to
Martin's suggestion about properly using the existing network_helpers
rather than reinventing the wheel. It also fixes a small bug in the
measured effective rate.
The converted test roughly follows the original script logic, with two
veths in two namespaces, a TCP connection between a client and a server,
and the client pushing a specific amount of data. Time is recorded
before and after the transmission to compute the effective rate.
There are two knobs driving the robustness of the test in CI:
- the amount of pushed data (the higher, the more precise is the
effective rate)
- the tolerated error margin
The original test was configured with a 20s duration and a 1% error
margin. The new test is configured with 1MB of data being pushed and a
2% error margin, to:
- make the duration tolerable in CI
- while keeping enough margin for rate measure fluctuations depending on
the CI machines load
This has been run multiple times locally to ensure that those values are
sane, and once in CI before sending the series, but I suggest to let it
live a few days in CI to see how it really behaves.
Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore(a)bootlin.com>
---
Changes in v2:
- drop custom client/server management
- update bpf program now that server pushes data
- fix effective rate computation
- Link to v1: https://lore.kernel.org/r/20251031-tc_edt-v1-0-5d34a5823144@bootlin.com
---
Alexis Lothoré (eBPF Foundation) (4):
selftests/bpf: rename test_tc_edt.bpf.c section to expose program type
selftests/bpf: integrate test_tc_edt into test_progs
selftests/bpf: remove test_tc_edt.sh
selftests/bpf: do not hardcode target rate in test_tc_edt BPF program
tools/testing/selftests/bpf/Makefile | 2 -
.../testing/selftests/bpf/prog_tests/test_tc_edt.c | 145 +++++++++++++++++++++
tools/testing/selftests/bpf/progs/test_tc_edt.c | 11 +-
tools/testing/selftests/bpf/test_tc_edt.sh | 100 --------------
4 files changed, 151 insertions(+), 107 deletions(-)
---
base-commit: 233a075a1b27070af76d64541cf001340ecff917
change-id: 20251030-tc_edt-3ea8e8d3d14e
Best regards,
--
Alexis Lothoré, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com
Jakub reported increased flakiness in bond_macvlan_ipvlan.sh on regular
kernel, while the tests consistently pass on a debug kernel. This suggests
a timing-sensitive issue.
To mitigate this, introduce a short sleep before each xvlan_over_bond
connectivity check. The delay helps ensure neighbor and route cache
have fully converged before verifying connectivity.
The sleep interval is kept minimal since check_connection() is invoked
nearly 100 times during the test.
Fixes: 246af950b940 ("selftests: bonding: add macvlan over bond testing")
Reported-by: Jakub Kicinski <kuba(a)kernel.org>
Closes: https://lore.kernel.org/netdev/20251114082014.750edfad@kernel.org
Signed-off-by: Hangbin Liu <liuhangbin(a)gmail.com>
---
.../testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh
index c4711272fe45..559f300f965a 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh
@@ -30,6 +30,7 @@ check_connection()
local message=${3}
RET=0
+ sleep 0.25
ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null
check_err $? "ping failed"
log_test "${bond_mode}/${xvlan_type}_${xvlan_mode}: ${message}"
--
2.50.1
This series adds support for tests that use multiple devices, and adds
one new test, vfio_pci_device_init_perf_test, which measures parallel
device initialization time to demonstrate the improvement from commit
e908f58b6beb ("vfio/pci: Separate SR-IOV VF dev_set").
This series also breaks apart the monolithic vfio_util.h and
vfio_pci_device.c into separate files, to account for all the new code.
This required quite a bit of code motion so the diffstat looks large.
The final layout is more granular and provides a better separation of
the IOMMU code from the device code.
Final layout:
C files:
- tools/testing/selftests/vfio/lib/libvfio.c
- tools/testing/selftests/vfio/lib/iommu.c
- tools/testing/selftests/vfio/lib/iova_allocator.c
- tools/testing/selftests/vfio/lib/vfio_pci_device.c
- tools/testing/selftests/vfio/lib/vfio_pci_driver.c
H files:
- tools/testing/selftests/vfio/lib/include/libvfio.h
- tools/testing/selftests/vfio/lib/include/libvfio/assert.h
- tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
- tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h
- tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h
- tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h
Notably, vfio_util.h is now gone and replaced with libvfio.h.
This series is based on vfio/next plus Alex Mastro's series to add the
IOVA allocator [1]. It should apply cleanly to vfio/next once Alex's
series makes its way to vfio/next via Linus' tree.
This series can be found on GitHub:
https://github.com/dmatlack/linux/tree/vfio/selftests/init_perf_test/v4
[1] https://lore.kernel.org/kvm/20251111-iova-ranges-v3-0-7960244642c5@fb.com/
Cc: Alex Mastro <amastro(a)fb.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: Josh Hilke <jrhilke(a)google.com>
Cc: Raghavendra Rao Ananta <rananta(a)google.com>
Cc: Vipin Sharma <vipinsh(a)google.com>
v4:
- Collect Reviewed-bys from Raghavendra
- Check iommu is not null in vfio_pci_device_init() (Raghavendra)
- Do not include any fixes for the kernel-test-robot errors on riscv.
These issues also exist when building the VFIO selftests without this
series using kernel-test-robot's cross compiler. The issue appears to
be that the cross compiler used by ktr does not have libc, which the
selftests depend on.
v3: https://lore.kernel.org/kvm/20251121181429.1421717-1-dmatlack@google.com/
- Replace literal with NSEC_PER_SEC (Alex Mastro)
- Fix Makefile accumulate vs. assignment (Alex Mastro)
v2: https://lore.kernel.org/kvm/20251112192232.442761-1-dmatlack@google.com/
- Require tests to call iommu_init() and manage struct iommu objects
rather than implicitly doing it in vfio_pci_device_init().
- Drop all the device wrappers for IOMMU methods and require tests to
interact with the iommu_*() helper functions directly.
- Add a commit to eliminate INVALID_IOVA. This is a simple cleanup I've
been meaning to make.
- Upgrade some driver logging to error (Raghavendra)
- Remove plurality from helper function that fetches BDF from
environment variable (Raghavendra)
- Fix cleanup.sh to only delete the device directory when cleaning up
all devices (Raghavendra)
v1: https://lore.kernel.org/kvm/20251008232531.1152035-1-dmatlack@google.com/
David Matlack (18):
vfio: selftests: Move run.sh into scripts directory
vfio: selftests: Split run.sh into separate scripts
vfio: selftests: Allow passing multiple BDFs on the command line
vfio: selftests: Rename struct vfio_iommu_mode to iommu_mode
vfio: selftests: Introduce struct iommu
vfio: selftests: Support multiple devices in the same
container/iommufd
vfio: selftests: Eliminate overly chatty logging
vfio: selftests: Prefix logs with device BDF where relevant
vfio: selftests: Upgrade driver logging to dev_err()
vfio: selftests: Rename struct vfio_dma_region to dma_region
vfio: selftests: Move IOMMU library code into iommu.c
vfio: selftests: Move IOVA allocator into iova_allocator.c
vfio: selftests: Stop passing device for IOMMU operations
vfio: selftests: Rename vfio_util.h to libvfio.h
vfio: selftests: Move vfio_selftests_*() helpers into libvfio.c
vfio: selftests: Split libvfio.h into separate header files
vfio: selftests: Eliminate INVALID_IOVA
vfio: selftests: Add vfio_pci_device_init_perf_test
tools/testing/selftests/vfio/Makefile | 10 +-
.../selftests/vfio/lib/drivers/dsa/dsa.c | 36 +-
.../selftests/vfio/lib/drivers/ioat/ioat.c | 18 +-
.../selftests/vfio/lib/include/libvfio.h | 26 +
.../vfio/lib/include/libvfio/assert.h | 54 ++
.../vfio/lib/include/libvfio/iommu.h | 76 +++
.../vfio/lib/include/libvfio/iova_allocator.h | 23 +
.../lib/include/libvfio/vfio_pci_device.h | 125 ++++
.../lib/include/libvfio/vfio_pci_driver.h | 97 +++
.../selftests/vfio/lib/include/vfio_util.h | 331 -----------
tools/testing/selftests/vfio/lib/iommu.c | 465 +++++++++++++++
.../selftests/vfio/lib/iova_allocator.c | 94 +++
tools/testing/selftests/vfio/lib/libvfio.c | 78 +++
tools/testing/selftests/vfio/lib/libvfio.mk | 5 +-
.../selftests/vfio/lib/vfio_pci_device.c | 556 +-----------------
.../selftests/vfio/lib/vfio_pci_driver.c | 16 +-
tools/testing/selftests/vfio/run.sh | 109 ----
.../testing/selftests/vfio/scripts/cleanup.sh | 41 ++
tools/testing/selftests/vfio/scripts/lib.sh | 42 ++
tools/testing/selftests/vfio/scripts/run.sh | 16 +
tools/testing/selftests/vfio/scripts/setup.sh | 48 ++
.../selftests/vfio/vfio_dma_mapping_test.c | 46 +-
.../selftests/vfio/vfio_iommufd_setup_test.c | 2 +-
.../vfio/vfio_pci_device_init_perf_test.c | 168 ++++++
.../selftests/vfio/vfio_pci_device_test.c | 12 +-
.../selftests/vfio/vfio_pci_driver_test.c | 51 +-
26 files changed, 1482 insertions(+), 1063 deletions(-)
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/assert.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h
delete mode 100644 tools/testing/selftests/vfio/lib/include/vfio_util.h
create mode 100644 tools/testing/selftests/vfio/lib/iommu.c
create mode 100644 tools/testing/selftests/vfio/lib/iova_allocator.c
create mode 100644 tools/testing/selftests/vfio/lib/libvfio.c
delete mode 100755 tools/testing/selftests/vfio/run.sh
create mode 100755 tools/testing/selftests/vfio/scripts/cleanup.sh
create mode 100755 tools/testing/selftests/vfio/scripts/lib.sh
create mode 100755 tools/testing/selftests/vfio/scripts/run.sh
create mode 100755 tools/testing/selftests/vfio/scripts/setup.sh
create mode 100644 tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c
base-commit: fa804aa4ac1b091ef2ec2981f08a1c28aaeba8e7
prerequisite-patch-id: dcf23dcc1198960bda3102eefaa21df60b2e4c54
prerequisite-patch-id: e32e56d5bf7b6c7dd40d737aa3521560407e00f5
prerequisite-patch-id: 4f79a41bf10a4c025ba5f433551b46035aa15878
prerequisite-patch-id: f903a45f0c32319138cd93a007646ab89132b18c
--
2.52.0.487.g5c8c507ade-goog
The exception vector constants CP_VECTOR, HV_VECTOR, VC_VECTOR, and
SX_VECTOR are used in ex_str(), but the header that defines
them is not included. Other exception vectors are picked up through
indirect includes, but these four are not, which leads to unresolved
identifiers during selftest builds.
lib/x86/processor.c: In function ‘ex_str’:
lib/x86/processor.c:52:17: error: ‘CP_VECTOR’ undeclared
lib/x86/processor.c:53:17: error: ‘HV_VECTOR’ undeclared
lib/x86/processor.c:54:17: error: ‘VC_VECTOR’ undeclared
lib/x86/processor.c:55:17: error: ‘SX_VECTOR’ undeclared
These vector definitions live in:
tools/arch/x86/include/uapi/asm/kvm.h
Add the missing include the userspace API exception vector constants.
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux(a)gmail.com>
---
tools/testing/selftests/kvm/lib/x86/processor.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index b418502c5ecc..fb589f07f2a4 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -4,6 +4,7 @@
*/
#include "linux/bitmap.h"
+#include "uapi/asm/kvm.h"
#include "test_util.h"
#include "kvm_util.h"
#include "pmu.h"
--
2.51.1
This patchset introduces target resume capability to netconsole allowing
it to recover targets when underlying low-level interface comes back
online.
The patchset starts by refactoring netconsole state representation in
order to allow representing deactivated targets (targets that are
disabled due to interfaces going down).
It then modifies netconsole to handle NETDEV_UP events for such targets
and setups netpoll. Targets are matched with incoming interfaces
depending on how they were initially bound in netconsole (by mac or
interface name).
The patchset includes a selftest that validates netconsole target state
transitions and that target is functional after resumed.
Signed-off-by: Andre Carvalho <asantostc(a)gmail.com>
---
Changes in v7:
- selftest: use ${EXIT_STATUS} instead of ${ksft_pass} to avoid
shellcheck warning
- Link to v6: https://lore.kernel.org/r/20251121-netcons-retrigger-v6-0-9c03f5a2bd6f@gmai…
Changes in v6:
- Rebase on top of net-next to resolve conflicts, no functional changes.
- Link to v5: https://lore.kernel.org/r/20251119-netcons-retrigger-v5-0-2c7dda6055d6@gmai…
Changes in v5:
- patch 3: Set (de)enslaved target as DISABLED instead of DEACTIVATED to prevent
resuming it.
- selftest: Fix test cleanup by moving trap line to outside of loop and remove
unneeded 'local' keyword
- Rename maybe_resume_target to resume_target, add netconsole_ prefix to
process_resumable_targets.
- Hold device reference before calling __netpoll_setup.
- Link to v4: https://lore.kernel.org/r/20251116-netcons-retrigger-v4-0-5290b5f140c2@gmai…
Changes in v4:
- Simplify selftest cleanup, removing trap setup in loop.
- Drop netpoll helper (__setup_netpoll_hold) and manage reference inside
netconsole.
- Move resume_list processing logic to separate function.
- Link to v3: https://lore.kernel.org/r/20251109-netcons-retrigger-v3-0-1654c280bbe6@gmai…
Changes in v3:
- Resume by mac or interface name depending on how target was created.
- Attempt to resume target without holding target list lock, by moving
the target to a temporary list. This is required as netpoll may
attempt to allocate memory.
- Link to v2: https://lore.kernel.org/r/20250921-netcons-retrigger-v2-0-a0e84006237f@gmai…
Changes in v2:
- Attempt to resume target in the same thread, instead of using
workqueue .
- Add wrapper around __netpoll_setup (patch 4).
- Renamed resume_target to maybe_resume_target and moved conditionals to
inside its implementation, keeping code more clear.
- Verify that device addr matches target mac address when target was
setup using mac.
- Update selftest to cover targets bound by mac and interface name.
- Fix typo in selftest comment and sort tests alphabetically in
Makefile.
- Link to v1:
https://lore.kernel.org/r/20250909-netcons-retrigger-v1-0-3aea904926cf@gmai…
---
Andre Carvalho (3):
netconsole: convert 'enabled' flag to enum for clearer state management
netconsole: resume previously deactivated target
selftests: netconsole: validate target resume
Breno Leitao (2):
netconsole: add target_state enum
netconsole: add STATE_DEACTIVATED to track targets disabled by low level
drivers/net/netconsole.c | 155 +++++++++++++++++----
tools/testing/selftests/drivers/net/Makefile | 1 +
.../selftests/drivers/net/lib/sh/lib_netcons.sh | 35 ++++-
.../selftests/drivers/net/netcons_resume.sh | 97 +++++++++++++
4 files changed, 254 insertions(+), 34 deletions(-)
---
base-commit: ab084f0b8d6d2ee4b1c6a28f39a2a7430bdfa7f0
change-id: 20250816-netcons-retrigger-a4f547bfc867
Best regards,
--
Andre Carvalho <asantostc(a)gmail.com>
This series fixes two issues in the bonding 802.3ad implementation
related to port state management and churn detection:
1. When disabling a port, we need to set AD_RX_PORT_DISABLED to ensure
proper state machine transitions, preventing ports from getting stuck
in AD_RX_CURRENT state.
2. The ad_churn_machine implementation is restructured to follow IEEE
802.1AX-2014 specifications correctly. The current implementation has
several issues: it doesn't transition to "none" state immediately when
synchronization is achieved, and can get stuck in churned state in
multi-aggregator scenarios.
3. Selftests are enhanced to validate both mux state machine and churn
state logic under aggregator selection and failover scenarios.
These changes ensure proper LACP state machine behavior and fix issues
where ports could remain in incorrect states during aggregator failover.
Hangbin Liu (3):
bonding: set AD_RX_PORT_DISABLED when disabling a port
bonding: restructure ad_churn_machine
selftests: bonding: add mux and churn state testing
drivers/net/bonding/bond_3ad.c | 105 ++++++++++++++----
.../selftests/drivers/net/bonding/Makefile | 2 +-
...nd_lacp_prio.sh => bond_lacp_ad_select.sh} | 73 ++++++++++++
3 files changed, 159 insertions(+), 21 deletions(-)
rename tools/testing/selftests/drivers/net/bonding/{bond_lacp_prio.sh => bond_lacp_ad_select.sh} (64%)
--
2.50.1
Note: it's net/ only bits and doesn't include changes, which shoulf be
merged separately and are posted separately. The full branch for
convenience is at [1], and the patch is here:
https://lore.kernel.org/io-uring/7486ab32e99be1f614b3ef8d0e9bc77015b173f7.1…
Many modern NICs support configurable receive buffer lengths, and zcrx and
memory providers can use buffers larger than 4K/PAGE_SIZE on x86 to improve
performance. When paired with hw-gro larger rx buffer sizes can drastically
reduce the number of buffers traversing the stack and save a lot of processing
time. It also allows to give to users larger contiguous chunks of data. The
idea was first floated around by Saeed during netdev conf 2024 and was
asked about by a few folks.
Single stream benchmarks showed up to ~30% CPU util improvement.
E.g. comparison for 4K vs 32K buffers using a 200Gbit NIC:
packets=23987040 (MB=2745098), rps=199559 (MB/s=22837)
CPU %usr %nice %sys %iowait %irq %soft %idle
0 1.53 0.00 27.78 2.72 1.31 66.45 0.22
packets=24078368 (MB=2755550), rps=200319 (MB/s=22924)
CPU %usr %nice %sys %iowait %irq %soft %idle
0 0.69 0.00 8.26 31.65 1.83 57.00 0.57
This series adds net infrastructure for memory providers configuring
the size and implements it for bnxt. It's an opt-in feature for drivers,
they should advertise support for the parameter in the qops and must check
if the hardware supports the given size. It's limited to memory providers
as it drastically simplifies implementation. It doesn't affect the fast
path zcrx uAPI, and the sizes is defined in zcrx terms, which allows it
to be flexible and adjusted in the future, see Patch 7 for details.
A liburing example can be found at [2]
full branch:
[1] https://github.com/isilence/linux.git zcrx/large-buffers-v6
Liburing example:
[2] https://github.com/isilence/liburing.git zcrx/rx-buf-len
---
The following changes since commit ac3fd01e4c1efce8f2c054cdeb2ddd2fc0fb150d:
Linux 6.18-rc7 (2025-11-23 14:53:16 -0800)
are available in the Git repository at:
https://github.com/isilence/linux.git tags/net-queue-rx-buf-len-v6
for you to fetch changes up to ef9cc9f58d60656a8afef1fc84f066aeb7b27378:
selftests: iou-zcrx: test large chunk sizes (2025-11-27 13:19:32 +0000)
v6: - Update docs and add a selftest
v5: https://lore.kernel.org/netdev/cover.1760440268.git.asml.silence@gmail.com/
- Remove all unnecessary bits like configuration via netlink, and
multi-stage queue configuration.
v4: https://lore.kernel.org/all/cover.1760364551.git.asml.silence@gmail.com/
- Update fbnic qops
- Propagate max buf len for hns3
- Use configured buf size in __bnxt_alloc_rx_netmem
- Minor stylistic changes
v3: https://lore.kernel.org/all/cover.1755499375.git.asml.silence@gmail.com/
- Rebased, excluded zcrx specific patches
- Set agg_size_fac to 1 on warning
v2: https://lore.kernel.org/all/cover.1754657711.git.asml.silence@gmail.com/
- Add MAX_PAGE_ORDER check on pp init
- Applied comments rewording
- Adjust pp.max_len based on order
- Patch up mlx5 queue callbacks after rebase
- Minor ->queue_mgmt_ops refactoring
- Rebased to account for both fill level and agg_size_fac
- Pass providers buf length in struct pp_memory_provider_params and
apply it in __netdev_queue_confi().
- Use ->supported_ring_params to validate drivers support of set
qcfg parameters.
Jakub Kicinski (1):
eth: bnxt: adjust the fill level of agg queues with larger buffers
Pavel Begunkov (7):
net: page_pool: sanitise allocation order
net: memzero mp params when closing a queue
net: let pp memory provider to specify rx buf len
eth: bnxt: store rx buffer size per queue
eth: bnxt: allow providers to set rx buf size
io_uring/zcrx: document area chunking parameter
selftests: iou-zcrx: test large chunk sizes
Documentation/networking/iou-zcrx.rst | 20 +++
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 118 ++++++++++++++----
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +
drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 6 +-
drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.h | 2 +-
include/net/netdev_queues.h | 9 ++
include/net/page_pool/types.h | 1 +
net/core/netdev_rx_queue.c | 14 ++-
net/core/page_pool.c | 3 +
.../selftests/drivers/net/hw/iou-zcrx.c | 72 +++++++++--
.../selftests/drivers/net/hw/iou-zcrx.py | 37 ++++++
11 files changed, 235 insertions(+), 49 deletions(-)
--
2.52.0
Hi,
This series fixes issues in the devlink_rate_tc_bw.py selftest and
introduces a new Iperf3Runner that helps with measurement handling.
Thanks,
Carolina
V2:
- Insert the test in the correct sorted position.
Carolina Jubran (6):
selftests: drv-net: Add devlink_rate_tc_bw.py to TEST_PROGS
selftests: drv-net: introduce Iperf3Runner for measurement use cases
selftests: drv-net: Use Iperf3Runner in devlink_rate_tc_bw.py
selftests: drv-net: Set shell=True for sysfs writes in
devlink_rate_tc_bw.py
selftests: drv-net: Fix and clarify TC bandwidth split in
devlink_rate_tc_bw.py
selftests: drv-net: Fix tolerance calculation in devlink_rate_tc_bw.py
.../testing/selftests/drivers/net/hw/Makefile | 1 +
.../drivers/net/hw/devlink_rate_tc_bw.py | 174 ++++++++----------
.../drivers/net/hw/lib/py/__init__.py | 5 +-
.../selftests/drivers/net/lib/py/__init__.py | 5 +-
.../selftests/drivers/net/lib/py/load.py | 84 ++++++++-
5 files changed, 157 insertions(+), 112 deletions(-)
--
2.38.1
The "struct alg" object contains a union of 3 xfrm structures:
union {
struct xfrm_algo;
struct xfrm_algo_aead;
struct xfrm_algo_auth;
}
All of them end with a flexible array member used to store key material,
but the flexible array appears at *different offsets* in each struct.
bcz of this, union itself is of variable-sized & Placing it above
char buf[...] triggers:
ipsec.c:835:5: warning: field 'u' with variable sized type 'union
(unnamed union at ipsec.c:831:3)' not at the end of a struct or class
is a GNU extension [-Wgnu-variable-sized-type-not-at-end]
835 | } u;
| ^
one fix is to use "TRAILING_OVERLAP()" which works with one flexible
array member only.
But In "struct alg" flexible array member exists in all union members,
but not at the same offset, so TRAILING_OVERLAP cannot be applied.
so the fix is to explicitly overlay the key buffer at the correct offset
for the largest union member (xfrm_algo_auth). This ensures that the
flexible-array region and the fixed buffer line up.
No functional change.
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux(a)gmail.com>
---
tools/testing/selftests/net/ipsec.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
index 0ccf484b1d9d..f4afef51b930 100644
--- a/tools/testing/selftests/net/ipsec.c
+++ b/tools/testing/selftests/net/ipsec.c
@@ -43,6 +43,10 @@
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER)
+#endif
+
#define IPV4_STR_SZ 16 /* xxx.xxx.xxx.xxx is longest + \0 */
#define MAX_PAYLOAD 2048
#define XFRM_ALGO_KEY_BUF_SIZE 512
@@ -827,13 +831,16 @@ static int xfrm_fill_key(char *name, char *buf,
static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz,
struct xfrm_desc *desc)
{
- struct {
+ union {
union {
struct xfrm_algo alg;
struct xfrm_algo_aead aead;
struct xfrm_algo_auth auth;
} u;
- char buf[XFRM_ALGO_KEY_BUF_SIZE];
+ struct {
+ unsigned char __offset_to_FAM[offsetof(struct xfrm_algo_auth, alg_key)];
+ char buf[XFRM_ALGO_KEY_BUF_SIZE];
+ };
} alg = {};
size_t alen, elen, clen, aelen;
unsigned short type;
--
2.52.0
I've removed the RFC tag from this version of the series, but the items
that I'm looking for feedback on remains the same:
- The userspace ABI, in particular:
- The vector length used for the SVE registers, access to the SVE
registers and access to ZA and (if available) ZT0 depending on
the current state of PSTATE.{SM,ZA}.
- The use of a single finalisation for both SVE and SME.
- The addition of control for enabling fine grained traps in a similar
manner to FGU but without the UNDEF, I'm not clear if this is desired
at all and at present this requires symmetric read and write traps like
FGU. That seemed like it might be desired from an implementation
point of view but we already have one case where we enable an
asymmetric trap (for ARM64_WORKAROUND_AMPERE_AC03_CPU_38) and it
seems generally useful to enable asymmetrically.
This series implements support for SME use in non-protected KVM guests.
Much of this is very similar to SVE, the main additional challenge that
SME presents is that it introduces a new vector length similar to the
SVE vector length and two new controls which change the registers seen
by guests:
- PSTATE.ZA enables the ZA matrix register and, if SME2 is supported,
the ZT0 LUT register.
- PSTATE.SM enables streaming mode, a new floating point mode which
uses the SVE register set with the separately configured SME vector
length. In streaming mode implementation of the FFR register is
optional.
It is also permitted to build systems which support SME without SVE, in
this case when not in streaming mode no SVE registers or instructions
are available. Further, there is no requirement that there be any
overlap in the set of vector lengths supported by SVE and SME in a
system, this is expected to be a common situation in practical systems.
Since there is a new vector length to configure we introduce a new
feature parallel to the existing SVE one with a new pseudo register for
the streaming mode vector length. Due to the overlap with SVE caused by
streaming mode rather than finalising SME as a separate feature we use
the existing SVE finalisation to also finalise SME, a new define
KVM_ARM_VCPU_VEC is provided to help make user code clearer. Finalising
SVE and SME separately would introduce complication with register access
since finalising SVE makes the SVE registers writeable by userspace and
doing multiple finalisations results in an error being reported.
Dealing with a state where the SVE registers are writeable due to one of
SVE or SME being finalised but may have their VL changed by the other
being finalised seems like needless complexity with minimal practical
utility, it seems clearer to just express directly that only one
finalisation can be done in the ABI.
Access to the floating point registers follows the architecture:
- When both SVE and SME are present:
- If PSTATE.SM == 0 the vector length used for the Z and P registers
is the SVE vector length.
- If PSTATE.SM == 1 the vector length used for the Z and P registers
is the SME vector length.
- If only SME is present:
- If PSTATE.SM == 0 the Z and P registers are inaccessible and the
floating point state accessed via the encodings for the V registers.
- If PSTATE.SM == 1 the vector length used for the Z and P registers
- The SME specific ZA and ZT0 registers are only accessible if SVCR.ZA is 1.
The VMM must understand this, in particular when loading state SVCR
should be configured before other state. It should be noted that while
the architecture refers to PSTATE.SM and PSTATE.ZA these PSTATE bits are
not preserved in SPSR_ELx, they are only accessible via SVCR.
There are a large number of subfeatures for SME, most of which only
offer additional instructions but some of which (SME2 and FA64) add
architectural state. These are configured via the ID registers as per
usual.
Protected KVM supported, with the implementation maintaining the
existing restriction that the hypervisor will refuse to run if streaming
mode or ZA is enabled. This both simplfies the code and avoids the need
to allocate storage for host ZA and ZT0 state, there seems to be little
practical use case for supporting this and the memory usage would be
non-trivial.
The new KVM_ARM_VCPU_VEC feature and ZA and ZT0 registers have not been
added to the get-reg-list selftest, the idea of supporting additional
features there without restructuring the program to generate all
possible feature combinations has been rejected. I will post a separate
series which does that restructuring.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
Changes in v8:
- Small fixes in ABI documentation.
- Link to v7: https://lore.kernel.org/r/20250822-kvm-arm64-sme-v7-0-7a65d82b8b10@kernel.o…
Changes in v7:
- Rebase onto v6.17-rc1.
- Handle SMIDR_EL1 as a VM wide ID register and use this in feat_sme_smps().
- Expose affinity fields in SMIDR_EL1.
- Remove SMPRI_EL1 from vcpu_sysreg, the value is always 0 currently.
- Prevent userspace writes to SMPRIMAP_EL2.
- Link to v6: https://lore.kernel.org/r/20250625-kvm-arm64-sme-v6-0-114cff4ffe04@kernel.o…
Changes in v6:
- Rebase onto v6.16-rc3.
- Link to v5: https://lore.kernel.org/r/20250417-kvm-arm64-sme-v5-0-f469a2d5f574@kernel.o…
Changes in v5:
- Rebase onto v6.15-rc2.
- Add pKVM guest support.
- Always restore SVCR.
- Link to v4: https://lore.kernel.org/r/20250214-kvm-arm64-sme-v4-0-d64a681adcc2@kernel.o…
Changes in v4:
- Rebase onto v6.14-rc2 and Mark Rutland's fixes.
- Expose SME to nested guests.
- Additional cleanups and test fixes following on from the rebase.
- Flush register state on VMM PSTATE.{SM,ZA}.
- Link to v3: https://lore.kernel.org/r/20241220-kvm-arm64-sme-v3-0-05b018c1ffeb@kernel.o…
Changes in v3:
- Rebase onto v6.12-rc2.
- Link to v2: https://lore.kernel.org/r/20231222-kvm-arm64-sme-v2-0-da226cb180bb@kernel.o…
Changes in v2:
- Rebase onto v6.7-rc3.
- Configure subfeatures based on host system only.
- Complete nVHE support.
- There was some snafu with sending v1 out, it didn't make it to the
lists but in case it hit people's inboxes I'm sending as v2.
---
Mark Brown (29):
arm64/sysreg: Update SMIDR_EL1 to DDI0601 2025-06
arm64/fpsimd: Update FA64 and ZT0 enables when loading SME state
arm64/fpsimd: Decide to save ZT0 and streaming mode FFR at bind time
arm64/fpsimd: Check enable bit for FA64 when saving EFI state
arm64/fpsimd: Determine maximum virtualisable SME vector length
KVM: arm64: Introduce non-UNDEF FGT control
KVM: arm64: Pay attention to FFR parameter in SVE save and load
KVM: arm64: Pull ctxt_has_ helpers to start of sysreg-sr.h
KVM: arm64: Move SVE state access macros after feature test macros
KVM: arm64: Rename SVE finalization constants to be more general
KVM: arm64: Document the KVM ABI for SME
KVM: arm64: Define internal features for SME
KVM: arm64: Rename sve_state_reg_region
KVM: arm64: Store vector lengths in an array
KVM: arm64: Implement SME vector length configuration
KVM: arm64: Support SME control registers
KVM: arm64: Support TPIDR2_EL0
KVM: arm64: Support SME identification registers for guests
KVM: arm64: Support SME priority registers
KVM: arm64: Provide assembly for SME register access
KVM: arm64: Support userspace access to streaming mode Z and P registers
KVM: arm64: Flush register state on writes to SVCR.SM and SVCR.ZA
KVM: arm64: Expose SME specific state to userspace
KVM: arm64: Context switch SME state for guests
KVM: arm64: Handle SME exceptions
KVM: arm64: Expose SME to nested guests
KVM: arm64: Provide interface for configuring and enabling SME for guests
KVM: arm64: selftests: Add SME system registers to get-reg-list
KVM: arm64: selftests: Add SME to set_id_regs test
Documentation/virt/kvm/api.rst | 115 ++++++++---
arch/arm64/include/asm/fpsimd.h | 26 +++
arch/arm64/include/asm/kvm_emulate.h | 6 +
arch/arm64/include/asm/kvm_host.h | 169 ++++++++++++---
arch/arm64/include/asm/kvm_hyp.h | 5 +-
arch/arm64/include/asm/kvm_pkvm.h | 2 +-
arch/arm64/include/asm/vncr_mapping.h | 2 +
arch/arm64/include/uapi/asm/kvm.h | 33 +++
arch/arm64/kernel/cpufeature.c | 2 -
arch/arm64/kernel/fpsimd.c | 89 ++++----
arch/arm64/kvm/arm.c | 10 +
arch/arm64/kvm/config.c | 8 +-
arch/arm64/kvm/fpsimd.c | 28 ++-
arch/arm64/kvm/guest.c | 252 ++++++++++++++++++++---
arch/arm64/kvm/handle_exit.c | 14 ++
arch/arm64/kvm/hyp/fpsimd.S | 28 ++-
arch/arm64/kvm/hyp/include/hyp/switch.h | 175 ++++++++++++++--
arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 110 ++++++----
arch/arm64/kvm/hyp/nvhe/hyp-main.c | 86 ++++++--
arch/arm64/kvm/hyp/nvhe/pkvm.c | 85 ++++++--
arch/arm64/kvm/hyp/nvhe/switch.c | 4 +-
arch/arm64/kvm/hyp/nvhe/sys_regs.c | 6 +
arch/arm64/kvm/hyp/vhe/switch.c | 17 +-
arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 7 +
arch/arm64/kvm/nested.c | 3 +-
arch/arm64/kvm/reset.c | 156 ++++++++++----
arch/arm64/kvm/sys_regs.c | 141 ++++++++++++-
arch/arm64/tools/sysreg | 8 +-
include/uapi/linux/kvm.h | 1 +
tools/testing/selftests/kvm/arm64/get-reg-list.c | 15 +-
tools/testing/selftests/kvm/arm64/set_id_regs.c | 27 ++-
31 files changed, 1327 insertions(+), 303 deletions(-)
---
base-commit: 062b3e4a1f880f104a8d4b90b767788786aa7b78
change-id: 20230301-kvm-arm64-sme-06a1246d3636
Best regards,
--
Mark Brown <broonie(a)kernel.org>
Extend fprobe to support list-style filters and explicit entry/exit suffixes.
Currentyl, fprobe only supports a single symbol (or wildcard) per event.
This patch allows users to specify a comma-separated list of symbols.
New Syntax:
- f:[GRP/][EVENT] func1,func2,func3:entry
- f:[GRP/][EVENT] func1,func2,func3:exit
Logic changes:
- Refactor parsing logic into 'parse_fprobe_spec'
- Support '!' prefix for exclusion
- Disable BTF lookup ('ctx->funcname = NULL') when a list or wildcard is used,
as a single function signature cannot apply to multiple functions.
- Reject legacy '%return' suffix when used with lists or wildcards
- Update tracefs/README
Testing:
Verified on x86_64 via QEMU. Checked registration of lists, exclusions, and
explicit suffixes. Verified rejection of invalid syntax including trailing
commas and mixed legacy/new syntax.
Seokwoo Chung (Ryan) (3):
docs: tracing/fprobe: Document list filters and :entry/:exit
tracing/fprobe: Support comma-separated symbols and :entry/:exit
selftests/ftrace: Add accept cases for fprobe list syntax
Changes in v4:
- Added validation to reject trailing commas (empty tokens) in symbol lists
- Added vaildation to reject mixed of list syntax with %return suffix
- Refactored parse_fprobe_spec to user __free(kfree) for automatic memory
cleanup
- Removed the now-unused parse_symbol_and_return function to avoid compiler
warnings.
- Tigtened %return detection to ensure it only matches as a strict suffix, not a
substring
- Link to v3: https://lore.kernel.org/lkml/20250904103219.f4937968362bfff1ecd3f004@kernel…
Documentation/trace/fprobetrace.rst | 17 +-
kernel/trace/trace.c | 3 +-
kernel/trace/trace_fprobe.c | 209 ++++++++++++++----
.../ftrace/test.d/dynevent/fprobe_list.tc | 92 ++++++++
4 files changed, 269 insertions(+), 52 deletions(-)
create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/fprobe_list.tc
--
2.43.0
The unix_connreset.c test included <stdlib.h>, but no symbol from that
header is used. This causes a fatal build error under certain
linux-next configurations where stdlib.h is not available.
Remove the unused include to fix the build.
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/r/202511221800.hcgCKvVa-lkp@intel.com/
Signed-off-by: Sunday Adelodun <adelodunolaoluwa(a)yahoo.com>
---
tools/testing/selftests/net/af_unix/unix_connreset.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/tools/testing/selftests/net/af_unix/unix_connreset.c b/tools/testing/selftests/net/af_unix/unix_connreset.c
index bffef2b54bfd..9844e829aed5 100644
--- a/tools/testing/selftests/net/af_unix/unix_connreset.c
+++ b/tools/testing/selftests/net/af_unix/unix_connreset.c
@@ -14,7 +14,6 @@
*/
#define _GNU_SOURCE
-#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
--
2.43.0
From: Hui Zhu <zhuhui(a)kylinos.cn>
This series proposes adding eBPF support to the Linux memory
controller, enabling dynamic and extensible memory management
policies at runtime.
Background
The memory controller (memcg) currently provides fixed memory
accounting and reclamation policies through static kernel code.
This limits flexibility for specialized workloads and use cases
that require custom memory management strategies.
By enabling eBPF programs to hook into key memory control
operations, administrators can implement custom policies without
recompiling the kernel, while maintaining the safety guarantees
provided by the BPF verifier.
Use Cases
1. Custom memory reclamation strategies for specialized workloads
2. Dynamic memory pressure monitoring and telemetry
3. Memory accounting adjustments based on runtime conditions
4. Integration with container orchestration systems for
intelligent resource management
5. Research and experimentation with novel memory management
algorithms
Design Overview
This series introduces:
1. A new BPF struct ops type (`memcg_ops`) that allows eBPF
programs to implement custom behavior for memory charging
operations.
2. A hook point in the `try_charge_memcg()` fast path that
invokes registered eBPF programs to determine if custom
memory management should be applied.
3. The eBPF handler can inspect memory cgroup context and
optionally modify certain parameters (e.g., `nr_pages` for
reclamation size).
4. A reference counting mechanism using `percpu_ref` to safely
manage the lifecycle of registered eBPF struct ops instances.
5. Configuration via `CONFIG_MEMCG_BPF` to allow disabling this
feature at build time.
Implementation Details
- Uses BPF struct ops for a cleaner integration model
- Leverages static branch keys for minimal overhead when feature
is unused
- RCU synchronization ensures safe replacement of handlers
- Sample eBPF program demonstrates monitoring capabilities
- Comprehensive selftest suite validates core functionality
Performance Considerations
- Zero overhead when feature is disabled or no eBPF program is
loaded (static branch is disabled)
- Minimal overhead when enabled: one indirect function call per
charge attempt
- eBPF programs run under the restrictions of the BPF verifier
Patch Overview
PATCH 1/3: Core kernel implementation
- Adds eBPF struct ops support to memcg
- Introduces CONFIG_MEMCG_BPF option
- Implements safe registration/unregistration mechanism
PATCH 2/3: Selftest suite
- prog_tests/memcg_ops.c: Test entry points
- progs/memcg_ops.bpf.c: Test eBPF program
- Validates load, attach, and single-handler constraints
PATCH 3/3: Sample userspace program
- samples/bpf/memcg_printk.bpf.c: Monitoring eBPF program
- samples/bpf/memcg_printk.c: Userspace loader
- Demonstrates real-world usage and debugging capabilities
Open Questions & Discussion Points
1. Should the eBPF handler have access to additional memory
cgroup state? Current design exposes minimal context to
reduce attack surface.
2. Are there other memory control operations that would benefit
from eBPF extensibility (e.g., uncharge, reclaim)?
3. Should there be permission checks or restrictions on who can
load memcg eBPF programs? Currently inherits BPF's
CAP_PERFMON/CAP_SYS_ADMIN requirements.
4. How should we handle multiple eBPF programs trying to
register? Current implementation allows only one active
handler.
5. Is the current exposed context in `try_charge_memcg` struct
sufficient, or should additional fields be added?
Testing
The selftests provide comprehensive coverage of the core
functionality. The sample program can be used for manual
testing and as a reference for implementing additional
monitoring tools.
Hui Zhu (3):
memcg: add eBPF struct ops support for memory charging
selftests/bpf: add memcg eBPF struct ops test
samples/bpf: add example memcg eBPF program
MAINTAINERS | 5 +
init/Kconfig | 38 ++++
mm/Makefile | 1 +
mm/memcontrol.c | 26 ++-
mm/memcontrol_bpf.c | 200 ++++++++++++++++++
mm/memcontrol_bpf.h | 103 +++++++++
samples/bpf/Makefile | 2 +
samples/bpf/memcg_printk.bpf.c | 30 +++
samples/bpf/memcg_printk.c | 82 +++++++
.../selftests/bpf/prog_tests/memcg_ops.c | 117 ++++++++++
tools/testing/selftests/bpf/progs/memcg_ops.c | 20 ++
11 files changed, 617 insertions(+), 7 deletions(-)
create mode 100644 mm/memcontrol_bpf.c
create mode 100644 mm/memcontrol_bpf.h
create mode 100644 samples/bpf/memcg_printk.bpf.c
create mode 100644 samples/bpf/memcg_printk.c
create mode 100644 tools/testing/selftests/bpf/prog_tests/memcg_ops.c
create mode 100644 tools/testing/selftests/bpf/progs/memcg_ops.c
--
2.43.0
This patch set introduces the BPF_F_CPU and BPF_F_ALL_CPUS flags for
percpu maps, as the requirement of BPF_F_ALL_CPUS flag for percpu_array
maps was discussed in the thread of
"[PATCH bpf-next v3 0/4] bpf: Introduce global percpu data"[1].
The goal of BPF_F_ALL_CPUS flag is to reduce data caching overhead in light
skeletons by allowing a single value to be reused to update values across all
CPUs. This avoids the M:N problem where M cached values are used to update a
map on N CPUs kernel.
The BPF_F_CPU flag is accompanied by *flags*-embedded cpu info, which
specifies the target CPU for the operation:
* For lookup operations: the flag field alongside cpu info enable querying
a value on the specified CPU.
* For update operations: the flag field alongside cpu info enable
updating value for specified CPU.
Links:
[1] https://lore.kernel.org/bpf/20250526162146.24429-1-leon.hwang@linux.dev/
Changes:
v11 -> v12:
* Dropped the v11 changes.
* Stabilized the lru_percpu_hash map test by keeping an extra spare entry,
which can be used temporarily during updates to avoid unintended LRU
evictions.
v10 -> v11:
* Support the combination of BPF_EXIST and BPF_F_CPU/BPF_F_ALL_CPUS for
update operations.
* Fix unstable lru_percpu_hash map test using the combination of
BPF_EXIST and BPF_F_CPU/BPF_F_ALL_CPUS to avoid LRU eviction
(reported by Alexei).
v9 -> v10:
* Add tests to verify array and hash maps do not support BPF_F_CPU and
BPF_F_ALL_CPUS flags.
* Address comment from Andrii:
* Copy map value using copy_map_value_long for percpu_cgroup_storage
maps in a separate patch.
v8 -> v9:
* Change value type from u64 to u32 in selftests.
* Address comments from Andrii:
* Keep value_size unaligned and update everywhere for consistency when
cpu flags are specified.
* Update value by getting pointer for percpu hash and percpu
cgroup_storage maps.
v7 -> v8:
* Address comments from Andrii:
* Check BPF_F_LOCK when update percpu_array, percpu_hash and
lru_percpu_hash maps.
* Refactor flags check in __htab_map_lookup_and_delete_batch().
* Keep value_size unaligned and copy value using copy_map_value() in
__htab_map_lookup_and_delete_batch() when BPF_F_CPU is specified.
* Update warn message in libbpf's validate_map_op().
* Update comment of libbpf's bpf_map__lookup_elem().
v6 -> v7:
* Get correct value size for percpu_hash and lru_percpu_hash in
update_batch API.
* Set 'count' as 'max_entries' in test cases for lookup_batch API.
* Address comment from Alexei:
* Move cpu flags check into bpf_map_check_op_flags().
v5 -> v6:
* Move bpf_map_check_op_flags() from 'bpf.h' to 'syscall.c'.
* Address comments from Alexei:
* Drop the refactoring code of data copying logic for percpu maps.
* Drop bpf_map_check_op_flags() wrappers.
v4 -> v5:
* Address comments from Andrii:
* Refactor data copying logic for all percpu maps.
* Drop this_cpu_ptr() micro-optimization.
* Drop cpu check in libbpf's validate_map_op().
* Enhance bpf_map_check_op_flags() using *allowed flags* instead of
'extra_flags_mask'.
v3 -> v4:
* Address comments from Andrii:
* Remove unnecessary map_type check in bpf_map_value_size().
* Reduce code churn.
* Remove unnecessary do_delete check in
__htab_map_lookup_and_delete_batch().
* Introduce bpf_percpu_copy_to_user() and bpf_percpu_copy_from_user().
* Rename check_map_flags() to bpf_map_check_op_flags() with
extra_flags_mask.
* Add human-readable pr_warn() explanations in validate_map_op().
* Use flags in bpf_map__delete_elem() and
bpf_map__lookup_and_delete_elem().
* Drop "for alignment reasons".
v3 link: https://lore.kernel.org/bpf/20250821160817.70285-1-leon.hwang@linux.dev/
v2 -> v3:
* Address comments from Alexei:
* Use BPF_F_ALL_CPUS instead of BPF_ALL_CPUS magic.
* Introduce these two cpu flags for all percpu maps.
* Address comments from Jiri:
* Reduce some unnecessary u32 cast.
* Refactor more generic map flags check function.
* A code style issue.
v2 link: https://lore.kernel.org/bpf/20250805163017.17015-1-leon.hwang@linux.dev/
v1 -> v2:
* Address comments from Andrii:
* Embed cpu info as high 32 bits of *flags* totally.
* Use ERANGE instead of E2BIG.
* Few format issues.
Leon Hwang (7):
bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags
bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_array
maps
bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_hash
and lru_percpu_hash maps
bpf: Copy map value using copy_map_value_long for
percpu_cgroup_storage maps
bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for
percpu_cgroup_storage maps
libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps
selftests/bpf: Add cases to test BPF_F_CPU and BPF_F_ALL_CPUS flags
include/linux/bpf-cgroup.h | 4 +-
include/linux/bpf.h | 35 +-
include/uapi/linux/bpf.h | 2 +
kernel/bpf/arraymap.c | 29 +-
kernel/bpf/hashtab.c | 94 +++--
kernel/bpf/local_storage.c | 27 +-
kernel/bpf/syscall.c | 37 +-
tools/include/uapi/linux/bpf.h | 2 +
tools/lib/bpf/bpf.h | 8 +
tools/lib/bpf/libbpf.c | 26 +-
tools/lib/bpf/libbpf.h | 21 +-
.../selftests/bpf/prog_tests/percpu_alloc.c | 328 ++++++++++++++++++
.../selftests/bpf/progs/percpu_alloc_array.c | 32 ++
13 files changed, 560 insertions(+), 85 deletions(-)
--
2.51.2
This patchset introduces target resume capability to netconsole allowing
it to recover targets when underlying low-level interface comes back
online.
The patchset starts by refactoring netconsole state representation in
order to allow representing deactivated targets (targets that are
disabled due to interfaces going down).
It then modifies netconsole to handle NETDEV_UP events for such targets
and setups netpoll. Targets are matched with incoming interfaces
depending on how they were initially bound in netconsole (by mac or
interface name).
The patchset includes a selftest that validates netconsole target state
transitions and that target is functional after resumed.
Signed-off-by: Andre Carvalho <asantostc(a)gmail.com>
---
Changes in v6:
- Rebase on top of net-next to resolve conflicts, no functional changes.
- Link to v5: https://lore.kernel.org/r/20251119-netcons-retrigger-v5-0-2c7dda6055d6@gmai…
Changes in v5:
- patch 3: Set (de)enslaved target as DISABLED instead of DEACTIVATED to prevent
resuming it.
- selftest: Fix test cleanup by moving trap line to outside of loop and remove
unneeded 'local' keyword
- Rename maybe_resume_target to resume_target, add netconsole_ prefix to
process_resumable_targets.
- Hold device reference before calling __netpoll_setup.
- Link to v4: https://lore.kernel.org/r/20251116-netcons-retrigger-v4-0-5290b5f140c2@gmai…
Changes in v4:
- Simplify selftest cleanup, removing trap setup in loop.
- Drop netpoll helper (__setup_netpoll_hold) and manage reference inside
netconsole.
- Move resume_list processing logic to separate function.
- Link to v3: https://lore.kernel.org/r/20251109-netcons-retrigger-v3-0-1654c280bbe6@gmai…
Changes in v3:
- Resume by mac or interface name depending on how target was created.
- Attempt to resume target without holding target list lock, by moving
the target to a temporary list. This is required as netpoll may
attempt to allocate memory.
- Link to v2: https://lore.kernel.org/r/20250921-netcons-retrigger-v2-0-a0e84006237f@gmai…
Changes in v2:
- Attempt to resume target in the same thread, instead of using
workqueue .
- Add wrapper around __netpoll_setup (patch 4).
- Renamed resume_target to maybe_resume_target and moved conditionals to
inside its implementation, keeping code more clear.
- Verify that device addr matches target mac address when target was
setup using mac.
- Update selftest to cover targets bound by mac and interface name.
- Fix typo in selftest comment and sort tests alphabetically in
Makefile.
- Link to v1:
https://lore.kernel.org/r/20250909-netcons-retrigger-v1-0-3aea904926cf@gmai…
---
Andre Carvalho (3):
netconsole: convert 'enabled' flag to enum for clearer state management
netconsole: resume previously deactivated target
selftests: netconsole: validate target resume
Breno Leitao (2):
netconsole: add target_state enum
netconsole: add STATE_DEACTIVATED to track targets disabled by low level
drivers/net/netconsole.c | 155 +++++++++++++++++----
tools/testing/selftests/drivers/net/Makefile | 1 +
.../selftests/drivers/net/lib/sh/lib_netcons.sh | 35 ++++-
.../selftests/drivers/net/netcons_resume.sh | 97 +++++++++++++
4 files changed, 254 insertions(+), 34 deletions(-)
---
base-commit: e2c20036a8879476c88002730d8a27f4e3c32d4b
change-id: 20250816-netcons-retrigger-a4f547bfc867
Best regards,
--
Andre Carvalho <asantostc(a)gmail.com>
This series is the start of adding full DMABUF support to
iommufd. Currently it is limited to only work with VFIO's DMABUF exporter.
It sits on top of Leon's series to add a DMABUF exporter to VFIO:
https://lore.kernel.org/all/20251120-dmabuf-vfio-v9-0-d7f71607f371@nvidia.c…
The existing IOMMU_IOAS_MAP_FILE is enhanced to detect DMABUF fd's, but
otherwise works the same as it does today for a memfd. The user can select
a slice of the FD to map into the ioas and if the underliyng alignment
requirements are met it will be placed in the iommu_domain.
Though limited, it is enough to allow a VMM like QEMU to connect MMIO BAR
memory from VFIO to an iommu_domain controlled by iommufd. This is used
for PCI Peer to Peer support in VMs, and is the last feature that the VFIO
type 1 container has that iommufd couldn't do.
The VFIO type1 version extracts raw PFNs from VMAs, which has no lifetime
control and is a use-after-free security problem.
Instead iommufd relies on revokable DMABUFs. Whenever VFIO thinks there
should be no access to the MMIO it can shoot down the mapping in iommufd
which will unmap it from the iommu_domain. There is no automatic remap,
this is a safety protocol so the kernel doesn't get stuck. Userspace is
expected to know it is doing something that will revoke the dmabuf and
map/unmap it around the activity. Eg when QEMU goes to issue FLR it should
do the map/unmap to iommufd.
Since DMABUF is missing some key general features for this use case it
relies on a "private interconnect" between VFIO and iommufd via the
vfio_pci_dma_buf_iommufd_map() call.
The call confirms the DMABUF has revoke semantics and delivers a phys_addr
for the memory suitable for use with iommu_map().
Medium term there is a desire to expand the supported DMABUFs to include
GPU drivers to support DPDK/SPDK type use cases so future series will work
to add a general concept of revoke and a general negotiation of
interconnect to remove vfio_pci_dma_buf_iommufd_map().
I also plan another series to modify iommufd's vfio_compat to
transparently pull a dmabuf out of a VFIO VMA to emulate more of the uAPI
of type1.
The latest series for interconnect negotation to exchange a phys_addr is:
https://lore.kernel.org/r/20251027044712.1676175-1-vivek.kasireddy@intel.com
And the discussion for design of revoke is here:
https://lore.kernel.org/dri-devel/20250114173103.GE5556@nvidia.com/
This is on github: https://github.com/jgunthorpe/linux/commits/iommufd_dmabuf
v2:
- Rebase on Leon's v9
- Fix mislocking in an iopt_fill_domain() error path
- Revise the comments around how the sub page offset works
- Remove a useless WARN_ON in iopt_pages_rw_access()
- Fixed missed memory free in the selftest
v1: https://patch.msgid.link/r/0-v1-64bed2430cdb+31b-iommufd_dmabuf_jgg@nvidia.…
Jason Gunthorpe (9):
vfio/pci: Add vfio_pci_dma_buf_iommufd_map()
iommufd: Add DMABUF to iopt_pages
iommufd: Do not map/unmap revoked DMABUFs
iommufd: Allow a DMABUF to be revoked
iommufd: Allow MMIO pages in a batch
iommufd: Have pfn_reader process DMABUF iopt_pages
iommufd: Have iopt_map_file_pages convert the fd to a file
iommufd: Accept a DMABUF through IOMMU_IOAS_MAP_FILE
iommufd/selftest: Add some tests for the dmabuf flow
drivers/iommu/iommufd/io_pagetable.c | 78 +++-
drivers/iommu/iommufd/io_pagetable.h | 54 ++-
drivers/iommu/iommufd/ioas.c | 8 +-
drivers/iommu/iommufd/iommufd_private.h | 14 +-
drivers/iommu/iommufd/iommufd_test.h | 10 +
drivers/iommu/iommufd/main.c | 10 +
drivers/iommu/iommufd/pages.c | 414 ++++++++++++++++--
drivers/iommu/iommufd/selftest.c | 143 ++++++
drivers/vfio/pci/vfio_pci_dmabuf.c | 34 ++
include/linux/vfio_pci_core.h | 4 +
tools/testing/selftests/iommu/iommufd.c | 43 ++
tools/testing/selftests/iommu/iommufd_utils.h | 44 ++
12 files changed, 786 insertions(+), 70 deletions(-)
base-commit: f836737ed56db9e2d5b047c56a31e05af0f3f116
--
2.43.0
The devpts_pts selftest has an ifdef in case an architecture does not
define TIOCGPTPEER, but the handling for this is broken since we need
errno to be set to EINVAL in order to skip the test as we should. Given
that this ioctl() has been defined since v4.15 we may as well just assume
it's there rather than write handling code which will probably never get
used.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
tools/testing/selftests/filesystems/devpts_pts.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/tools/testing/selftests/filesystems/devpts_pts.c b/tools/testing/selftests/filesystems/devpts_pts.c
index b1fc9b916ace..cad7da1bd7ca 100644
--- a/tools/testing/selftests/filesystems/devpts_pts.c
+++ b/tools/testing/selftests/filesystems/devpts_pts.c
@@ -100,7 +100,7 @@ static int resolve_procfd_symlink(int fd, char *buf, size_t buflen)
static int do_tiocgptpeer(char *ptmx, char *expected_procfd_contents)
{
int ret;
- int master = -1, slave = -1, fret = -1;
+ int master = -1, slave, fret = -1;
master = open(ptmx, O_RDWR | O_NOCTTY | O_CLOEXEC);
if (master < 0) {
@@ -119,9 +119,7 @@ static int do_tiocgptpeer(char *ptmx, char *expected_procfd_contents)
goto do_cleanup;
}
-#ifdef TIOCGPTPEER
slave = ioctl(master, TIOCGPTPEER, O_RDWR | O_NOCTTY | O_CLOEXEC);
-#endif
if (slave < 0) {
if (errno == EINVAL) {
fprintf(stderr, "TIOCGPTPEER is not supported. "
---
base-commit: ac3fd01e4c1efce8f2c054cdeb2ddd2fc0fb150d
change-id: 20251126-selftests-filesystems-devpts-tiocgptpeer-fbd30e579859
Best regards,
--
Mark Brown <broonie(a)kernel.org>
The series is separated from [1] to show the independency and compare
potential use cases easier. This use case uses the primitive revocable
APIs directly. It relies on the revocable core part [2].
It tries to fix an UAF in the fops of cros_ec_chardev after the
underlying protocol device has gone by using revocable.
The file operations make sure the resources are available when using them.
Even though it has the finest grain for accessing the resources, it makes
the user code verbose. Per feedback from the community, I'm looking for
some subsystem level helpers so that user code can be simlper.
The 1st patch converts existing protocol devices to resource providers
of cros_ec_device.
The 2nd patch converts cros_ec_chardev to a resource consumer of
cros_ec_device to fix the UAF.
[1] https://lore.kernel.org/chrome-platform/20251016054204.1523139-1-tzungbi@ke…
[2] https://lore.kernel.org/chrome-platform/20251106152330.11733-1-tzungbi@kern…
v6:
- New, separated from an existing series.
Tzung-Bi Shih (2):
platform/chrome: Protect cros_ec_device lifecycle with revocable
platform/chrome: cros_ec_chardev: Consume cros_ec_device via revocable
drivers/platform/chrome/cros_ec.c | 5 ++
drivers/platform/chrome/cros_ec_chardev.c | 71 ++++++++++++++++-----
include/linux/platform_data/cros_ec_proto.h | 4 ++
3 files changed, 65 insertions(+), 15 deletions(-)
--
2.48.1
For a while now we have supported file handles for pidfds. This has
proven to be very useful.
Extend the concept to cover namespaces as well. After this patchset it
is possible to encode and decode namespace file handles using the
commong name_to_handle_at() and open_by_handle_at() apis.
Namespaces file descriptors can already be derived from pidfds which
means they aren't subject to overmount protection bugs. IOW, it's
irrelevant if the caller would not have access to an appropriate
/proc/<pid>/ns/ directory as they could always just derive the namespace
based on a pidfd already.
It has the same advantage as pidfds. It's possible to reliably and for
the lifetime of the system refer to a namespace without pinning any
resources and to compare them.
Permission checking is kept simple. If the caller is located in the
namespace the file handle refers to they are able to open it otherwise
they must hold privilege over the owning namespace of the relevant
namespace.
Both the network namespace and the mount namespace already have an
associated cookie that isn't recycled and is fully exposed to userspace.
Move this into ns_common and use the same id space for all namespaces so
they can trivially and reliably be compared.
There's more coming based on the iterator infrastructure but the series
is large enough and focuses on file handles.
Extensive selftests included. I still have various other test-suites to
run but it holds up so far.
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
---
Christian Brauner (32):
pidfs: validate extensible ioctls
nsfs: validate extensible ioctls
block: use extensible_ioctl_valid()
ns: move to_ns_common() to ns_common.h
nsfs: add nsfs.h header
ns: uniformly initialize ns_common
mnt: use ns_common_init()
ipc: use ns_common_init()
cgroup: use ns_common_init()
pid: use ns_common_init()
time: use ns_common_init()
uts: use ns_common_init()
user: use ns_common_init()
net: use ns_common_init()
ns: remove ns_alloc_inum()
nstree: make iterator generic
mnt: support iterator
cgroup: support iterator
ipc: support iterator
net: support iterator
pid: support iterator
time: support iterator
userns: support iterator
uts: support iterator
ns: add to_<type>_ns() to respective headers
nsfs: add current_in_namespace()
nsfs: support file handles
nsfs: support exhaustive file handles
nsfs: add missing id retrieval support
tools: update nsfs.h uapi header
selftests/namespaces: add identifier selftests
selftests/namespaces: add file handle selftests
block/blk-integrity.c | 8 +-
fs/fhandle.c | 6 +
fs/internal.h | 1 +
fs/mount.h | 10 +-
fs/namespace.c | 156 +--
fs/nsfs.c | 266 +++-
fs/pidfs.c | 2 +-
include/linux/cgroup.h | 5 +
include/linux/exportfs.h | 6 +
include/linux/fs.h | 14 +
include/linux/ipc_namespace.h | 5 +
include/linux/ns_common.h | 29 +
include/linux/nsfs.h | 40 +
include/linux/nsproxy.h | 11 -
include/linux/nstree.h | 89 ++
include/linux/pid_namespace.h | 5 +
include/linux/proc_ns.h | 32 +-
include/linux/time_namespace.h | 9 +
include/linux/user_namespace.h | 5 +
include/linux/utsname.h | 5 +
include/net/net_namespace.h | 6 +
include/uapi/linux/fcntl.h | 1 +
include/uapi/linux/nsfs.h | 12 +-
init/main.c | 2 +
ipc/msgutil.c | 1 +
ipc/namespace.c | 12 +-
ipc/shm.c | 2 +
kernel/Makefile | 2 +-
kernel/cgroup/cgroup.c | 2 +
kernel/cgroup/namespace.c | 24 +-
kernel/nstree.c | 233 ++++
kernel/pid_namespace.c | 13 +-
kernel/time/namespace.c | 23 +-
kernel/user_namespace.c | 17 +-
kernel/utsname.c | 28 +-
net/core/net_namespace.c | 59 +-
tools/include/uapi/linux/nsfs.h | 23 +-
tools/testing/selftests/namespaces/.gitignore | 2 +
tools/testing/selftests/namespaces/Makefile | 7 +
tools/testing/selftests/namespaces/config | 7 +
.../selftests/namespaces/file_handle_test.c | 1410 ++++++++++++++++++++
tools/testing/selftests/namespaces/nsid_test.c | 986 ++++++++++++++
42 files changed, 3306 insertions(+), 270 deletions(-)
---
base-commit: 8f5ae30d69d7543eee0d70083daf4de8fe15d585
change-id: 20250905-work-namespace-c68826dda0d4
From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com>
Hello,
Plesae find the v5 AccECN case handling patch series, which covers
several excpetional case handling of Accurate ECN spec (RFC9768),
adds new identifiers to be used by CC modules, adds ecn_delta into
rate_sample, and keeps the ACE counter for computation, etc.
This patch series is part of the full AccECN patch series, which is available at
https://github.com/L4STeam/linux-net-next/commits/upstream_l4steam/
Best regards,
Chia-Yu
---
v6:
- Update comment in #3 to highlight RX path is only used for virtio-net (Paolo Abeni <pabeni(a)redhat.com>)
- Rename TCP_CONG_WANTS_ECT_1 to TCP_CONG_ECT_1_NEGOTIATION to distiguish from TCP_CONG_ECT_1_ESTABLISH (Paolo Abeni <pabeni(a)redhat.com>)
- Move TCP_CONG_ECT_1_ESTABLISH in #6 to latter patch series (Paolo Abeni <pabeni(a)redhat.com>)
- Add new synack_type instead of moving the increment of num_retran in #9 (Paolo Abeni <pabeni(a)redhat.com>)
- Use new synack_type TCP_SYNACK_RETRANS and num_retrans for SYN/ACK retx fallbackk for AccECN in #10 (Paolo Abeni <pabeni(a)redhat.com>)
- Do not cast const struct into non-const in #11, and set AccECN fail mode after tcp_rtx_synack() (Paolo Abeni <pabeni(a)redhat.com>)
v5:
- Move previous #11 in v4 in latter patch after discussion with RFC author.
- Add #3 to update the comments for SKB_GSO_TCP_ECN and SKB_GSO_TCP_ACCECN. (Parav Pandit <parav(a)nvidia.com>)
- Add gro self-test for TCP CWR flag in #4. (Eric Dumazet <edumazet(a)google.com>)
- Add fixes: tag into #7 (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message of #8 and if condition check (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #13 (Paolo Abeni <pabeni(a)redhat.com>)
v4:
- Add previous #13 in v2 back after dicussion with the RFC author.
- Add TCP_ACCECN_OPTION_PERSIST to tcp_ecn_option sysctl to ignore AccECN fallback policy on sending AccECN option.
v3:
- Add additional min() check if pkts_acked_ewma is not initialized in #1. (Paolo Abeni <pabeni(a)redhat.com>)
- Change TCP_CONG_WANTS_ECT_1 into individual flag add helper function INET_ECN_xmit_wants_ect_1() in #3. (Paolo Abeni <pabeni(a)redhat.com>)
- Add empty line between variable declarations and code in #4. (Paolo Abeni <pabeni(a)redhat.com>)
- Update commit message to fix old AccECN commits in #5. (Paolo Abeni <pabeni(a)redhat.com>)
- Remove unnecessary brackets in #10. (Paolo Abeni <pabeni(a)redhat.com>)
- Move patch #3 in v2 to a later Prague patch serise and remove patch #13 in v2. (Paolo Abeni <pabeni(a)redhat.com>)
---
Chia-Yu Chang (12):
net: update commnets for SKB_GSO_TCP_ECN and SKB_GSO_TCP_ACCECN
selftests/net: gro: add self-test for TCP CWR flag
tcp: ECT_1_NEGOTIATION and NEEDS_ACCECN identifiers
tcp: disable RFC3168 fallback identifier for CC modules
tcp: accecn: handle unexpected AccECN negotiation feedback
tcp: accecn: retransmit downgraded SYN in AccECN negotiation
tcp: add TCP_SYNACK_RETRANS synack_type
tcp: accecn: retransmit SYN/ACK without AccECN option or non-AccECN
SYN/ACK
tcp: accecn: unset ECT if receive or send ACE=0 in AccECN negotiaion
tcp: accecn: fallback outgoing half link to non-AccECN
tcp: accecn: detect loss ACK w/ AccECN option and add
TCP_ACCECN_OPTION_PERSIST
tcp: accecn: enable AccECN
Ilpo Järvinen (2):
tcp: try to avoid safer when ACKs are thinned
gro: flushing when CWR is set negatively affects AccECN
Documentation/networking/ip-sysctl.rst | 4 +-
.../networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/skbuff.h | 14 ++-
include/linux/tcp.h | 4 +-
include/net/inet_ecn.h | 20 +++-
include/net/tcp.h | 32 ++++++-
include/net/tcp_ecn.h | 92 ++++++++++++++-----
net/ipv4/inet_connection_sock.c | 4 +
net/ipv4/sysctl_net_ipv4.c | 4 +-
net/ipv4/tcp.c | 2 +
net/ipv4/tcp_cong.c | 5 +-
net/ipv4/tcp_input.c | 37 +++++++-
net/ipv4/tcp_minisocks.c | 46 +++++++---
net/ipv4/tcp_offload.c | 3 +-
net/ipv4/tcp_output.c | 32 ++++---
net/ipv4/tcp_timer.c | 3 +
tools/testing/selftests/net/gro.c | 80 +++++++++++-----
17 files changed, 295 insertions(+), 88 deletions(-)
--
2.34.1
This patch set introduces the BPF_F_CPU and BPF_F_ALL_CPUS flags for
percpu maps, as the requirement of BPF_F_ALL_CPUS flag for percpu_array
maps was discussed in the thread of
"[PATCH bpf-next v3 0/4] bpf: Introduce global percpu data"[1].
The goal of BPF_F_ALL_CPUS flag is to reduce data caching overhead in light
skeletons by allowing a single value to be reused to update values across all
CPUs. This avoids the M:N problem where M cached values are used to update a
map on N CPUs kernel.
The BPF_F_CPU flag is accompanied by *flags*-embedded cpu info, which
specifies the target CPU for the operation:
* For lookup operations: the flag field alongside cpu info enable querying
a value on the specified CPU.
* For update operations: the flag field alongside cpu info enable
updating value for specified CPU.
Links:
[1] https://lore.kernel.org/bpf/20250526162146.24429-1-leon.hwang@linux.dev/
Changes:
v10 -> v11:
* Support the combination of BPF_EXIST and BPF_F_CPU/BPF_F_ALL_CPUS for
update operations.
* Fix unstable lru_percpu_hash map test using the combination of
BPF_EXIST and BPF_F_CPU/BPF_F_ALL_CPUS to avoid LRU eviction
(reported by Alexei).
v9 -> v10:
* Add tests to verify array and hash maps do not support BPF_F_CPU and
BPF_F_ALL_CPUS flags.
* Address comment from Andrii:
* Copy map value using copy_map_value_long for percpu_cgroup_storage
maps in a separate patch.
v8 -> v9:
* Change value type from u64 to u32 in selftests.
* Address comments from Andrii:
* Keep value_size unaligned and update everywhere for consistency when
cpu flags are specified.
* Update value by getting pointer for percpu hash and percpu
cgroup_storage maps.
v7 -> v8:
* Address comments from Andrii:
* Check BPF_F_LOCK when update percpu_array, percpu_hash and
lru_percpu_hash maps.
* Refactor flags check in __htab_map_lookup_and_delete_batch().
* Keep value_size unaligned and copy value using copy_map_value() in
__htab_map_lookup_and_delete_batch() when BPF_F_CPU is specified.
* Update warn message in libbpf's validate_map_op().
* Update comment of libbpf's bpf_map__lookup_elem().
v6 -> v7:
* Get correct value size for percpu_hash and lru_percpu_hash in
update_batch API.
* Set 'count' as 'max_entries' in test cases for lookup_batch API.
* Address comment from Alexei:
* Move cpu flags check into bpf_map_check_op_flags().
v5 -> v6:
* Move bpf_map_check_op_flags() from 'bpf.h' to 'syscall.c'.
* Address comments from Alexei:
* Drop the refactoring code of data copying logic for percpu maps.
* Drop bpf_map_check_op_flags() wrappers.
v4 -> v5:
* Address comments from Andrii:
* Refactor data copying logic for all percpu maps.
* Drop this_cpu_ptr() micro-optimization.
* Drop cpu check in libbpf's validate_map_op().
* Enhance bpf_map_check_op_flags() using *allowed flags* instead of
'extra_flags_mask'.
v3 -> v4:
* Address comments from Andrii:
* Remove unnecessary map_type check in bpf_map_value_size().
* Reduce code churn.
* Remove unnecessary do_delete check in
__htab_map_lookup_and_delete_batch().
* Introduce bpf_percpu_copy_to_user() and bpf_percpu_copy_from_user().
* Rename check_map_flags() to bpf_map_check_op_flags() with
extra_flags_mask.
* Add human-readable pr_warn() explanations in validate_map_op().
* Use flags in bpf_map__delete_elem() and
bpf_map__lookup_and_delete_elem().
* Drop "for alignment reasons".
v3 link: https://lore.kernel.org/bpf/20250821160817.70285-1-leon.hwang@linux.dev/
v2 -> v3:
* Address comments from Alexei:
* Use BPF_F_ALL_CPUS instead of BPF_ALL_CPUS magic.
* Introduce these two cpu flags for all percpu maps.
* Address comments from Jiri:
* Reduce some unnecessary u32 cast.
* Refactor more generic map flags check function.
* A code style issue.
v2 link: https://lore.kernel.org/bpf/20250805163017.17015-1-leon.hwang@linux.dev/
v1 -> v2:
* Address comments from Andrii:
* Embed cpu info as high 32 bits of *flags* totally.
* Use ERANGE instead of E2BIG.
* Few format issues.
Leon Hwang (8):
bpf: Introduce internal bpf_map_check_op_flags helper function
bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags
bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_array
maps
bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_hash
and lru_percpu_hash maps
bpf: Copy map value using copy_map_value_long for
percpu_cgroup_storage maps
bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for
percpu_cgroup_storage maps
libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps
selftests/bpf: Add cases to test BPF_F_CPU and BPF_F_ALL_CPUS flags
include/linux/bpf-cgroup.h | 4 +-
include/linux/bpf.h | 44 ++-
include/uapi/linux/bpf.h | 2 +
kernel/bpf/arraymap.c | 32 +-
kernel/bpf/hashtab.c | 96 +++--
kernel/bpf/local_storage.c | 27 +-
kernel/bpf/syscall.c | 68 ++--
tools/include/uapi/linux/bpf.h | 2 +
tools/lib/bpf/bpf.h | 8 +
tools/lib/bpf/libbpf.c | 26 +-
tools/lib/bpf/libbpf.h | 21 +-
.../selftests/bpf/prog_tests/percpu_alloc.c | 335 ++++++++++++++++++
.../selftests/bpf/progs/percpu_alloc_array.c | 32 ++
13 files changed, 590 insertions(+), 107 deletions(-)
--
2.51.2
The bench test "trig-kernel-count" can be used as a baseline comparison
for fentry and other benchmarks, and the calling to bpf_get_numa_node_id()
should be considered as composition of the baseline. So, let's call it in
trigger_count(). Meanwhile, rename trigger_count() to
trigger_kernel_count() to make it easier understand.
Signed-off-by: Menglong Dong <dongml2(a)chinatelecom.cn>
---
tools/testing/selftests/bpf/benchs/bench_trigger.c | 4 ++--
tools/testing/selftests/bpf/progs/trigger_bench.c | 6 ++++--
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
index 1e2aff007c2a..34018fc3927f 100644
--- a/tools/testing/selftests/bpf/benchs/bench_trigger.c
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -180,10 +180,10 @@ static void trigger_kernel_count_setup(void)
{
setup_ctx();
bpf_program__set_autoload(ctx.skel->progs.trigger_driver, false);
- bpf_program__set_autoload(ctx.skel->progs.trigger_count, true);
+ bpf_program__set_autoload(ctx.skel->progs.trigger_kernel_count, true);
load_ctx();
/* override driver program */
- ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_count);
+ ctx.driver_prog_fd = bpf_program__fd(ctx.skel->progs.trigger_kernel_count);
}
static void trigger_kprobe_setup(void)
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
index 3d5f30c29ae3..2898b3749d07 100644
--- a/tools/testing/selftests/bpf/progs/trigger_bench.c
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -42,12 +42,14 @@ int bench_trigger_uprobe_multi(void *ctx)
const volatile int batch_iters = 0;
SEC("?raw_tp")
-int trigger_count(void *ctx)
+int trigger_kernel_count(void *ctx)
{
int i;
- for (i = 0; i < batch_iters; i++)
+ for (i = 0; i < batch_iters; i++) {
inc_counter();
+ bpf_get_numa_node_id();
+ }
return 0;
}
--
2.51.2
This series adds support for tests that use multiple devices, and adds
one new test, vfio_pci_device_init_perf_test, which measures parallel
device initialization time to demonstrate the improvement from commit
e908f58b6beb ("vfio/pci: Separate SR-IOV VF dev_set").
This series also breaks apart the monolithic vfio_util.h and
vfio_pci_device.c into separate files, to account for all the new code.
This required quite a bit of code motion so the diffstat looks large.
The final layout is more granular and provides a better separation of
the IOMMU code from the device code.
Final layout:
C files:
- tools/testing/selftests/vfio/lib/libvfio.c
- tools/testing/selftests/vfio/lib/iommu.c
- tools/testing/selftests/vfio/lib/iova_allocator.c
- tools/testing/selftests/vfio/lib/vfio_pci_device.c
- tools/testing/selftests/vfio/lib/vfio_pci_driver.c
H files:
- tools/testing/selftests/vfio/lib/include/libvfio.h
- tools/testing/selftests/vfio/lib/include/libvfio/assert.h
- tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
- tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h
- tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h
- tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h
Notably, vfio_util.h is now gone and replaced with libvfio.h.
This series is based on vfio/next plus Alex Mastro's series to add the
IOVA allocator [1]. It should apply cleanly to vfio/next once Alex's
series is merged by Linus into the next 6.18 rc and then merged into
vfio/next.
This series can be found on GitHub:
https://github.com/dmatlack/linux/tree/vfio/selftests/init_perf_test/v3
[1] https://lore.kernel.org/kvm/20251111-iova-ranges-v3-0-7960244642c5@fb.com/
Cc: Alex Mastro <amastro(a)fb.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: Josh Hilke <jrhilke(a)google.com>
Cc: Raghavendra Rao Ananta <rananta(a)google.com>
Cc: Vipin Sharma <vipinsh(a)google.com>
v3:
- Replace literal with NSEC_PER_SEC (Alex Mastro)
- Fix Makefile accumulate vs. assignment (Alex Mastro)
v2: https://lore.kernel.org/kvm/20251112192232.442761-1-dmatlack@google.com/
v1: https://lore.kernel.org/kvm/20251008232531.1152035-1-dmatlack@google.com/
David Matlack (18):
vfio: selftests: Move run.sh into scripts directory
vfio: selftests: Split run.sh into separate scripts
vfio: selftests: Allow passing multiple BDFs on the command line
vfio: selftests: Rename struct vfio_iommu_mode to iommu_mode
vfio: selftests: Introduce struct iommu
vfio: selftests: Support multiple devices in the same
container/iommufd
vfio: selftests: Eliminate overly chatty logging
vfio: selftests: Prefix logs with device BDF where relevant
vfio: selftests: Upgrade driver logging to dev_err()
vfio: selftests: Rename struct vfio_dma_region to dma_region
vfio: selftests: Move IOMMU library code into iommu.c
vfio: selftests: Move IOVA allocator into iova_allocator.c
vfio: selftests: Stop passing device for IOMMU operations
vfio: selftests: Rename vfio_util.h to libvfio.h
vfio: selftests: Move vfio_selftests_*() helpers into libvfio.c
vfio: selftests: Split libvfio.h into separate header files
vfio: selftests: Eliminate INVALID_IOVA
vfio: selftests: Add vfio_pci_device_init_perf_test
tools/testing/selftests/vfio/Makefile | 10 +-
.../selftests/vfio/lib/drivers/dsa/dsa.c | 36 +-
.../selftests/vfio/lib/drivers/ioat/ioat.c | 18 +-
.../selftests/vfio/lib/include/libvfio.h | 26 +
.../vfio/lib/include/libvfio/assert.h | 54 ++
.../vfio/lib/include/libvfio/iommu.h | 76 +++
.../vfio/lib/include/libvfio/iova_allocator.h | 23 +
.../lib/include/libvfio/vfio_pci_device.h | 125 ++++
.../lib/include/libvfio/vfio_pci_driver.h | 97 +++
.../selftests/vfio/lib/include/vfio_util.h | 331 -----------
tools/testing/selftests/vfio/lib/iommu.c | 465 +++++++++++++++
.../selftests/vfio/lib/iova_allocator.c | 94 +++
tools/testing/selftests/vfio/lib/libvfio.c | 78 +++
tools/testing/selftests/vfio/lib/libvfio.mk | 5 +-
.../selftests/vfio/lib/vfio_pci_device.c | 555 +-----------------
.../selftests/vfio/lib/vfio_pci_driver.c | 16 +-
tools/testing/selftests/vfio/run.sh | 109 ----
.../testing/selftests/vfio/scripts/cleanup.sh | 41 ++
tools/testing/selftests/vfio/scripts/lib.sh | 42 ++
tools/testing/selftests/vfio/scripts/run.sh | 16 +
tools/testing/selftests/vfio/scripts/setup.sh | 48 ++
.../selftests/vfio/vfio_dma_mapping_test.c | 46 +-
.../selftests/vfio/vfio_iommufd_setup_test.c | 2 +-
.../vfio/vfio_pci_device_init_perf_test.c | 168 ++++++
.../selftests/vfio/vfio_pci_device_test.c | 12 +-
.../selftests/vfio/vfio_pci_driver_test.c | 51 +-
26 files changed, 1481 insertions(+), 1063 deletions(-)
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/assert.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/iommu.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h
create mode 100644 tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h
delete mode 100644 tools/testing/selftests/vfio/lib/include/vfio_util.h
create mode 100644 tools/testing/selftests/vfio/lib/iommu.c
create mode 100644 tools/testing/selftests/vfio/lib/iova_allocator.c
create mode 100644 tools/testing/selftests/vfio/lib/libvfio.c
delete mode 100755 tools/testing/selftests/vfio/run.sh
create mode 100755 tools/testing/selftests/vfio/scripts/cleanup.sh
create mode 100755 tools/testing/selftests/vfio/scripts/lib.sh
create mode 100755 tools/testing/selftests/vfio/scripts/run.sh
create mode 100755 tools/testing/selftests/vfio/scripts/setup.sh
create mode 100644 tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c
base-commit: fa804aa4ac1b091ef2ec2981f08a1c28aaeba8e7
prerequisite-patch-id: dcf23dcc1198960bda3102eefaa21df60b2e4c54
prerequisite-patch-id: e32e56d5bf7b6c7dd40d737aa3521560407e00f5
prerequisite-patch-id: 4f79a41bf10a4c025ba5f433551b46035aa15878
prerequisite-patch-id: f903a45f0c32319138cd93a007646ab89132b18c
--
2.52.0.rc2.455.g230fcf2819-goog
Hi Shuah,
Thanks for pointing that out. Apologies for missing the mailing lists earlier.
Resending this follow-up with the correct CC list and in plain text format.
Please let me know if there’s anything else I should improve in this patch.
I’m happy to resend it as v4 if needed.
Thanks,
Sameeksha
On Mon, 24 Nov 2025 at 23:59, Shuah Khan <skhan(a)linuxfoundation.org> wrote:
>
> On 11/21/25 23:21, Sameeksha Sankpal wrote:
> > Hi,
> > Just following up on this patch.
> > It’s been a few months, so I wanted to check if there is anything else I
> > should address or improve to move it forward.
>
> I see that you didn't cc any mailing list on this email? Please keep
> everybody in the loop when you send responses.
>
> >
> > Thanks,
> > Sameeksha Sankpal
> >
> > On Fri, 30 May 2025 at 04:25, Sameeksha Sankpal <sameekshasankpal(a)gmail.com>
> > wrote:
> >
> >> Rebase the error logging enhancement for get_proc_stat() against the
> >> upstream seccomp tree with proper indentation formatting.
> >>
> >> Suggested-by: Kees Cook <kees(a)kernel.org>
> >> Signed-off-by: Sameeksha Sankpal <sameekshasankpal(a)gmail.com>
> >> ---
> >> v1 -> v2:
> >> - Used TH_LOG instead of printf for error logging
> >> - Moved variable declaration to the top of the function
> >> - Applied review suggestion by Kees Cook
> >>
> >> v2 -> v3:
> >> - Rebased against upstream seccomp tree (was previously against v1)
> >> - Fixed indentation to use tabs instead of spaces
> >> - Used scripts/checkpatch.pl to check the patch for common errors
> >> - Removed the blank line beforeS S-o-b added in v2
> >>
> >> tools/testing/selftests/seccomp/seccomp_bpf.c | 5 +++++
> >> 1 file changed, 5 insertions(+)
> >>
> >> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c
> >> b/tools/testing/selftests/seccomp/seccomp_bpf.c
> >> index 61acbd45ffaa..dbd7e705a2af 100644
> >> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> >> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> >> @@ -4508,9 +4508,14 @@ static char get_proc_stat(struct __test_metadata
> >> *_metadata, pid_t pid)
> >> char proc_path[100] = {0};
> >> char status;
> >> char *line;
> >> + int rc;
> >>
> >> snprintf(proc_path, sizeof(proc_path), "/proc/%d/stat", pid);
> >> ASSERT_EQ(get_nth(_metadata, proc_path, 3, &line), 1);
> >> + rc = get_nth(_metadata, proc_path, 3, &line);
> >> + ASSERT_EQ(rc, 1) {
> >> + TH_LOG("user_notification_fifo: failed to read stat for
> >> PID %d (rc=%d)", pid, rc);
> >> + }
> >>
> >> status = *line;
> >> free(line);
> >> --
> >> 2.43.0
> >>
> >>
> >
> thanks,
> -- Shuah
From: "Mike Rapoport (Microsoft)" <rppt(a)kernel.org>
Hi,
These patches allow guest_memfd to notify userspace about minor page
faults using userfaultfd and let userspace to resolve these page faults
using UFFDIO_CONTINUE.
To allow UFFDIO_CONTINUE outside of the core mm I added a get_shmem_folio()
callback to vm_ops that allows an address space backing a VMA to return a
folio that exists in it's page cache (patch 2)
In order for guest_memfd to notify userspace about page faults, there is a
new VM_FAULT_UFFD_MINOR that a ->fault() handler can return to inform the
page fault handler that it needs to call handle_userfault() to complete the
fault (patch 3).
Patch 4 plumbs these new goodies into guest_memfd.
This series is the minimal change I've been able to come up with to allow
integration of guest_memfd with uffd and while refactoring uffd and making
mfill_atomic() flow more linear would have been a nice improvement, it's
way out of the scope of enabling uffd with guest_memfd.
v2 changes:
* Introduce VM_FAULF_UFFD_MINOR to avoid exporting handle_userfault()
* Simplify vma_can_mfill_atomic()
* Rename get_pagecache_folio() to get_shared_folio() and use inode
instead of vma as its argument
v1: https://lore.kernel.org/all/20251117114631.2029447-1-rppt@kernel.org
Mike Rapoport (Microsoft) (4):
userfaultfd: move vma_can_userfault out of line
userfaultfd, shmem: use a VMA callback to handle UFFDIO_CONTINUE
mm: introduce VM_FAULT_UFFD_MINOR fault reason
guest_memfd: add support for userfaultfd minor mode
Nikita Kalyazin (1):
KVM: selftests: test userfaultfd minor for guest_memfd
include/linux/mm.h | 9 ++
include/linux/mm_types.h | 3 +
include/linux/userfaultfd_k.h | 36 +-----
mm/memory.c | 2 +
mm/shmem.c | 21 +++-
mm/userfaultfd.c | 80 +++++++++++---
.../testing/selftests/kvm/guest_memfd_test.c | 103 ++++++++++++++++++
virt/kvm/guest_memfd.c | 29 +++++
8 files changed, 232 insertions(+), 51 deletions(-)
base-commit: 6a23ae0a96a600d1d12557add110e0bb6e32730c
--
2.50.1
Currently the only way of excluding certain tests from a collection is by
passing all the other tests explicitly via `--test`. Therefore, if the user
wants to skip a single test the resulting command line might be too big,
depending on the collection. Add an option `--skip` that takes care of
that.
Signed-off-by: Ricardo B. Marlière <rbm(a)suse.com>
---
tools/testing/selftests/run_kselftest.sh | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh
index d4be97498b32..84d45254675c 100755
--- a/tools/testing/selftests/run_kselftest.sh
+++ b/tools/testing/selftests/run_kselftest.sh
@@ -30,6 +30,7 @@ Usage: $0 [OPTIONS]
-s | --summary Print summary with detailed log in output.log (conflict with -p)
-p | --per-test-log Print test log in /tmp with each test name (conflict with -s)
-t | --test COLLECTION:TEST Run TEST from COLLECTION
+ -S | --skip COLLECTION:TEST Skip TEST from COLLECTION
-c | --collection COLLECTION Run all tests from COLLECTION
-l | --list List the available collection:test entries
-d | --dry-run Don't actually run any tests
@@ -43,6 +44,7 @@ EOF
COLLECTIONS=""
TESTS=""
+SKIP=""
dryrun=""
kselftest_override_timeout=""
ERROR_ON_FAIL=true
@@ -58,6 +60,9 @@ while true; do
-t | --test)
TESTS="$TESTS $2"
shift 2 ;;
+ -S | --skip)
+ SKIP="$SKIP $2"
+ shift 2 ;;
-c | --collection)
COLLECTIONS="$COLLECTIONS $2"
shift 2 ;;
@@ -109,6 +114,12 @@ if [ -n "$TESTS" ]; then
done
available="$(echo "$valid" | sed -e 's/ /\n/g')"
fi
+# Remove tests to be skipped from available list
+if [ -n "$SKIP" ]; then
+ for skipped in $SKIP ; do
+ available="$(echo "$available" | grep -v "^${skipped}$")"
+ done
+fi
kselftest_failures_file="$(mktemp --tmpdir kselftest-failures-XXXXXX)"
export kselftest_failures_file
---
base-commit: a2f7990d330937a204b86b9cafbfef82f87a8693
change-id: 20251125-selftests-add_skip_opt-0f3fd24d7afa
Best regards,
--
Ricardo B. Marlière <rbm(a)suse.com>
Currently, x86, Riscv, Loongarch use the Generic Entry which makes
maintainers' work easier and codes more elegant. arm64 has already
successfully switched to the Generic IRQ Entry in commit
b3cf07851b6c ("arm64: entry: Switch to generic IRQ entry"), it is
time to completely convert arm64 to Generic Entry.
The goal is to bring arm64 in line with other architectures that already
use the generic entry infrastructure, reducing duplicated code and
making it easier to share future changes in entry/exit paths, such as
"Syscall User Dispatch".
This patch set is rebased on v6.18-rc6.
The performance benchmarks from perf bench basic syscall on
real hardware are below:
| Metric | W/O Generic Framework | With Generic Framework | Change |
| ---------- | --------------------- | ---------------------- | ------ |
| Total time | 2.813 [sec] | 2.930 [sec] | ↑4% |
| usecs/op | 0.281349 | 0.293006 | ↑4% |
| ops/sec | 3,554,299 | 3,412,894 | ↓4% |
Compared to earlier with arch specific handling, the performance decreased
by approximately 4%.
It was tested ok with following test cases on QEMU virt platform:
- Perf tests.
- Different `dynamic preempt` mode switch.
- Pseudo NMI tests.
- Stress-ng CPU stress test.
- MTE test case in Documentation/arch/arm64/memory-tagging-extension.rst
and all test cases in tools/testing/selftests/arm64/mte/*.
- "sud" selftest testcase.
- get_syscall_info, peeksiginfo in tools/testing/selftests/ptrace.
The test QEMU configuration is as follows:
qemu-system-aarch64 \
-M virt,gic-version=3,virtualization=on,mte=on \
-cpu max,pauth-impdef=on \
-kernel Image \
-smp 8,sockets=1,cores=4,threads=2 \
-m 512m \
-nographic \
-no-reboot \
-device virtio-rng-pci \
-append "root=/dev/vda rw console=ttyAMA0 kgdboc=ttyAMA0,115200 \
earlycon preempt=voluntary irqchip.gicv3_pseudo_nmi=1" \
-drive if=none,file=images/rootfs.ext4,format=raw,id=hd0 \
-device virtio-blk-device,drive=hd0 \
Chanegs in v7:
- Support "Syscall User Dispatch" by implementing
arch_syscall_is_vdso_sigreturn() as kemal suggested.
- Add aarch64 support for "sud" selftest testcase, which tested ok with
the patch series.
- Fix the kernel test robot warning for arch_ptrace_report_syscall_entry()
and arch_ptrace_report_syscall_exit() in asm/entry-common.h.
- Add perf syscall performance test.
- Link to v6: https://lore.kernel.org/all/20250916082611.2972008-1-ruanjinjie@huawei.com/
Changes in v6:
- Rebased on v6.17-rc5-next as arm64 generic irq entry has merged.
- Update the commit message.
- Link to v5: https://lore.kernel.org/all/20241206101744.4161990-1-ruanjinjie@huawei.com/
Changes in v5:
- Not change arm32 and keep inerrupts_enabled() macro for gicv3 driver.
- Move irqentry_state definition into arch/arm64/kernel/entry-common.c.
- Avoid removing the __enter_from_*() and __exit_to_*() wrappers.
- Update "irqentry_state_t ret/irq_state" to "state"
to keep it consistently.
- Use generic irq entry header for PREEMPT_DYNAMIC after split
the generic entry.
- Also refactor the ARM64 syscall code.
- Introduce arch_ptrace_report_syscall_entry/exit(), instead of
arch_pre/post_report_syscall_entry/exit() to simplify code.
- Make the syscall patches clear separation.
- Update the commit message.
- Link to v4: https://lore.kernel.org/all/20241025100700.3714552-1-ruanjinjie@huawei.com/
Changes in v4:
- Rework/cleanup split into a few patches as Mark suggested.
- Replace interrupts_enabled() macro with regs_irqs_disabled(), instead
of left it here.
- Remove rcu and lockdep state in pt_regs by using temporary
irqentry_state_t as Mark suggested.
- Remove some unnecessary intermediate functions to make it clear.
- Rework preempt irq and PREEMPT_DYNAMIC code
to make the switch more clear.
- arch_prepare_*_entry/exit() -> arch_pre_*_entry/exit().
- Expand the arch functions comment.
- Make arch functions closer to its caller.
- Declare saved_reg in for block.
- Remove arch_exit_to_kernel_mode_prepare(), arch_enter_from_kernel_mode().
- Adjust "Add few arch functions to use generic entry" patch to be
the penultimate.
- Update the commit message.
- Add suggested-by.
- Link to v3: https://lore.kernel.org/all/20240629085601.470241-1-ruanjinjie@huawei.com/
Changes in v3:
- Test the MTE test cases.
- Handle forget_syscall() in arch_post_report_syscall_entry()
- Make the arch funcs not use __weak as Thomas suggested, so move
the arch funcs to entry-common.h, and make arch_forget_syscall() folded
in arch_post_report_syscall_entry() as suggested.
- Move report_single_step() to thread_info.h for arm64
- Change __always_inline() to inline, add inline for the other arch funcs.
- Remove unused signal.h for entry-common.h.
- Add Suggested-by.
- Update the commit message.
Changes in v2:
- Add tested-by.
- Fix a bug that not call arch_post_report_syscall_entry() in
syscall_trace_enter() if ptrace_report_syscall_entry() return not zero.
- Refactor report_syscall().
- Add comment for arch_prepare_report_syscall_exit().
- Adjust entry-common.h header file inclusion to alphabetical order.
- Update the commit message.
Jinjie Ruan (10):
arm64/ptrace: Split report_syscall()
arm64/ptrace: Refactor syscall_trace_enter/exit()
arm64/ptrace: Refator el0_svc_common()
entry: Add syscall_exit_to_user_mode_prepare() helper
arm64/ptrace: Handle ptrace_report_syscall_entry() error
arm64/ptrace: Expand secure_computing() in place
arm64/ptrace: Use syscall_get_arguments() heleper
entry: Add arch_ptrace_report_syscall_entry/exit()
entry: Add has_syscall_work() helper
arm64: entry: Convert to generic entry
kemal (1):
selftests: sud_test: Support aarch64
arch/arm64/Kconfig | 2 +-
arch/arm64/include/asm/entry-common.h | 69 ++++++++++++++
arch/arm64/include/asm/syscall.h | 29 +++++-
arch/arm64/include/asm/thread_info.h | 22 +----
arch/arm64/kernel/debug-monitors.c | 7 ++
arch/arm64/kernel/ptrace.c | 90 -------------------
arch/arm64/kernel/signal.c | 2 +-
arch/arm64/kernel/syscall.c | 31 ++-----
include/linux/entry-common.h | 42 ++++++---
kernel/entry/syscall-common.c | 43 ++++++++-
.../syscall_user_dispatch/sud_test.c | 4 +
11 files changed, 188 insertions(+), 153 deletions(-)
--
2.34.1
syzkaller reported a bug [1] where a socket using sockmap, after being
unloaded, exposed incorrect copied_seq calculation. The selftest I
provided can be used to reproduce the issue reported by syzkaller.
TCP recvmsg seq # bug 2: copied E92C873, seq E68D125, rcvnxt E7CEB7C, fl 40
WARNING: CPU: 1 PID: 5997 at net/ipv4/tcp.c:2724 tcp_recvmsg_locked+0xb2f/0x2910 net/ipv4/tcp.c:2724
Call Trace:
<TASK>
receive_fallback_to_copy net/ipv4/tcp.c:1968 [inline]
tcp_zerocopy_receive+0x131a/0x2120 net/ipv4/tcp.c:2200
do_tcp_getsockopt+0xe28/0x26c0 net/ipv4/tcp.c:4713
tcp_getsockopt+0xdf/0x100 net/ipv4/tcp.c:4812
do_sock_getsockopt+0x34d/0x440 net/socket.c:2421
__sys_getsockopt+0x12f/0x260 net/socket.c:2450
__do_sys_getsockopt net/socket.c:2457 [inline]
__se_sys_getsockopt net/socket.c:2454 [inline]
__x64_sys_getsockopt+0xbd/0x160 net/socket.c:2454
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xcd/0xfa0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
A sockmap socket maintains its own receive queue (ingress_msg) which may
contain data from either its own protocol stack or forwarded from other
sockets.
FD1:read()
-- FD1->copied_seq++
| [read data]
|
[enqueue data] v
[sockmap] -> ingress to self -> ingress_msg queue
FD1 native stack ------> ^
-- FD1->rcv_nxt++ -> redirect to other | [enqueue data]
| |
| ingress to FD1
v ^
... | [sockmap]
FD2 native stack
The issue occurs when reading from ingress_msg: we update tp->copied_seq
by default, but if the data comes from other sockets (not the socket's
own protocol stack), tcp->rcv_nxt remains unchanged. Later, when
converting back to a native socket, reads may fail as copied_seq could
be significantly larger than rcv_nxt.
Additionally, FIONREAD calculation based on copied_seq and rcv_nxt is
insufficient for sockmap sockets, requiring separate field tracking.
[1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983
---
v1 -> v4: Use skmsg.sk instead of extending BPF_F_XXX macro and fix CI
failure reported by CI
v1: https://lore.kernel.org/bpf/20251117110736.293040-1-jiayuan.chen@linux.dev/
Jiayuan Chen (3):
bpf, sockmap: Fix incorrect copied_seq calculation
bpf, sockmap: Fix FIONREAD for sockmap
bpf, selftest: Add tests for FIONREAD and copied_seq
include/linux/skmsg.h | 58 ++++-
net/core/skmsg.c | 28 ++-
net/ipv4/tcp_bpf.c | 26 ++-
net/ipv4/udp_bpf.c | 25 ++-
.../selftests/bpf/prog_tests/sockmap_basic.c | 203 +++++++++++++++++-
.../bpf/progs/test_sockmap_pass_prog.c | 8 +
6 files changed, 331 insertions(+), 17 deletions(-)
--
2.43.0
Hi all,
This is v2 of a short series that adds kernel support for the ratified
Zilsd (Load/Store pair) and Zclsd (Compressed Load/Store pair) RISC-V
ISA extensions. The series enables kernel-side exposure so user-space
(for example glibc) can detect and use these extensions via hwprobe and
runtime checks.
Patches:
- Patch 1:Add device tree bindings documentation for Zilsd and Zclsd.
- Patch 2: Extend RISC-V ISA extension string parsing to recognize them.
- Patch 3: Export Zilsd and Zclsd via riscv_hwprobe.
- Patch 4: Allow KVM guests to use them.
- Patch 5: Add KVM selftests.
Changes in v2:
- Device-tree schema: simplified the rv64 validation for Zilsd by
removing a redundant `contais: const: zilsd` in the `if` clause; the
simpler `if (riscv, isa-base contains rv64i) then (riscv,
isa-extension not contains zilsd)` form is used instead. Behaviour is
unchanged, and the logic is cleaner.
- Device-tree schema: corrected Zclsd dependency to require both Zilsd
and Zca (previous `anyOf` was incorrect; now both are enforced).
- Commit message typo fixed: "dt-bidings" -> "dt-bindings" in the Patch
1 commit subject.
The v2 changes are documentation/schema corrections in extensions.yaml.
No functional changes were made to ISA parsing, hwprobe syscall, KVM
guest support or the selftests beyond ensuring the binding correctly
documents and validates the extension relationships.
Please review v2 and advise if futher changes are needed.
Thanks,
Pincheng Wang
Pincheng Wang (5):
dt-bindings: riscv: add Zilsd and Zclsd extension descriptions
riscv: add ISA extension parsing for Zilsd and Zclsd
riscv: hwprobe: export Zilsd and Zclsd ISA extensions
riscv: KVM: allow Zilsd and Zclsd extensions for Guest/VM
KVM: riscv: selftests: add Zilsd and Zclsd extension to get-reg-list
test
Documentation/arch/riscv/hwprobe.rst | 8 +++++
.../devicetree/bindings/riscv/extensions.yaml | 36 +++++++++++++++++++
arch/riscv/include/asm/hwcap.h | 2 ++
arch/riscv/include/uapi/asm/hwprobe.h | 2 ++
arch/riscv/include/uapi/asm/kvm.h | 2 ++
arch/riscv/kernel/cpufeature.c | 24 +++++++++++++
arch/riscv/kernel/sys_hwprobe.c | 2 ++
arch/riscv/kvm/vcpu_onereg.c | 2 ++
.../selftests/kvm/riscv/get-reg-list.c | 6 ++++
9 files changed, 84 insertions(+)
--
2.39.5
This series adds namespace support to vhost-vsock and loopback. It does
not add namespaces to any of the other guest transports (virtio-vsock,
hyperv, or vmci).
The current revision supports two modes: local and global. Local
mode is complete isolation of namespaces, while global mode is complete
sharing between namespaces of CIDs (the original behavior).
The mode is set using /proc/sys/net/vsock/ns_mode.
Modes are per-netns and write-once. This allows a system to configure
namespaces independently (some may share CIDs, others are completely
isolated). This also supports future possible mixed use cases, where
there may be namespaces in global mode spinning up VMs while there are
mixed mode namespaces that provide services to the VMs, but are not
allowed to allocate from the global CID pool (this mode is not
implemented in this series).
If a socket or VM is created when a namespace is global but the
namespace changes to local, the socket or VM will continue working
normally. That is, the socket or VM assumes the mode behavior of the
namespace at the time the socket/VM was created. The original mode is
captured in vsock_create() and so occurs at the time of socket(2) and
accept(2) for sockets and open(2) on /dev/vhost-vsock for VMs. This
prevents a socket/VM connection from suddenly breaking due to a
namespace mode change. Any new sockets/VMs created after the mode change
will adopt the new mode's behavior.
Additionally, added tests for the new namespace features:
tools/testing/selftests/vsock/vmtest.sh
1..29
ok 1 vm_server_host_client
ok 2 vm_client_host_server
ok 3 vm_loopback
ok 4 ns_vm_local_mode_rejected
ok 5 ns_host_vsock_ns_mode_ok
ok 6 ns_host_vsock_ns_mode_write_once_ok
ok 7 ns_global_same_cid_fails
ok 8 ns_local_same_cid_ok
ok 9 ns_global_local_same_cid_ok
ok 10 ns_local_global_same_cid_ok
ok 11 ns_diff_global_host_connect_to_global_vm_ok
ok 12 ns_diff_global_host_connect_to_local_vm_fails
ok 13 ns_diff_global_vm_connect_to_global_host_ok
ok 14 ns_diff_global_vm_connect_to_local_host_fails
ok 15 ns_diff_local_host_connect_to_local_vm_fails
ok 16 ns_diff_local_vm_connect_to_local_host_fails
ok 17 ns_diff_global_to_local_loopback_local_fails
ok 18 ns_diff_local_to_global_loopback_fails
ok 19 ns_diff_local_to_local_loopback_fails
ok 20 ns_diff_global_to_global_loopback_ok
ok 21 ns_same_local_loopback_ok
ok 22 ns_same_local_host_connect_to_local_vm_ok
ok 23 ns_same_local_vm_connect_to_local_host_ok
ok 24 ns_mode_change_connection_continue_vm_ok
ok 25 ns_mode_change_connection_continue_host_ok
ok 26 ns_mode_change_connection_continue_both_ok
ok 27 ns_delete_vm_ok
ok 28 ns_delete_host_ok
ok 29 ns_delete_both_ok
SUMMARY: PASS=29 SKIP=0 FAIL=0
Dependent on series:
https://lore.kernel.org/all/20251108-vsock-selftests-fixes-and-improvements…
Thanks again for everyone's help and reviews!
Suggested-by: Sargun Dhillon <sargun(a)sargun.me>
Signed-off-by: Bobby Eshleman <bobbyeshleman(a)gmail.com>
Changes in v11:
- vmtest: add a patch to use ss in wait_for_listener functions and
support vsock, tcp, and unix. Change all patches to use the new
functions.
- vmtest: add a patch to re-use vm dmesg / warn counting functions
- Link to v10: https://lore.kernel.org/r/20251117-vsock-vmtest-v10-0-df08f165bf3e@meta.com
Changes in v10:
- Combine virtio common patches into one (Stefano)
- Resolve vsock_loopback virtio_transport_reset_no_sock() issue
with info->vsk setting. This eliminates the need for skb->cb,
so remove skb->cb patches.
- many line width 80 fixes
- Link to v9: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-0-852787a37bed@meta.com
Changes in v9:
- reorder loopback patch after patch for virtio transport common code
- remove module ordering tests patch because loopback no longer depends
on pernet ops
- major simplifications in vsock_loopback
- added a new patch for blocking local mode for guests, added test case
to check
- add net ref tracking to vsock_loopback patch
- Link to v8: https://lore.kernel.org/r/20251023-vsock-vmtest-v8-0-dea984d02bb0@meta.com
Changes in v8:
- Break generic cleanup/refactoring patches into standalone series,
remove those from this series
- Link to dependency: https://lore.kernel.org/all/20251022-vsock-selftests-fixes-and-improvements…
- Link to v7: https://lore.kernel.org/r/20251021-vsock-vmtest-v7-0-0661b7b6f081@meta.com
Changes in v7:
- fix hv_sock build
- break out vmtest patches into distinct, more well-scoped patches
- change `orig_net_mode` to `net_mode`
- many fixes and style changes in per-patch change sets (see individual
patches for specific changes)
- optimize `virtio_vsock_skb_cb` layout
- update commit messages with more useful descriptions
- vsock_loopback: use orig_net_mode instead of current net mode
- add tests for edge cases (ns deletion, mode changing, loopback module
load ordering)
- Link to v6: https://lore.kernel.org/r/20250916-vsock-vmtest-v6-0-064d2eb0c89d@meta.com
Changes in v6:
- define behavior when mode changes to local while socket/VM is alive
- af_vsock: clarify description of CID behavior
- af_vsock: use stronger langauge around CID rules (dont use "may")
- af_vsock: improve naming of buf/buffer
- af_vsock: improve string length checking on proc writes
- vsock_loopback: add space in struct to clarify lock protection
- vsock_loopback: do proper cleanup/unregister on vsock_loopback_exit()
- vsock_loopback: use virtio_vsock_skb_net() instead of sock_net()
- vsock_loopback: set loopback to NULL after kfree()
- vsock_loopback: use pernet_operations and remove callback mechanism
- vsock_loopback: add macros for "global" and "local"
- vsock_loopback: fix length checking
- vmtest.sh: check for namespace support in vmtest.sh
- Link to v5: https://lore.kernel.org/r/20250827-vsock-vmtest-v5-0-0ba580bede5b@meta.com
Changes in v5:
- /proc/net/vsock_ns_mode -> /proc/sys/net/vsock/ns_mode
- vsock_global_net -> vsock_global_dummy_net
- fix netns lookup in vhost_vsock to respect pid namespaces
- add callbacks for vsock_loopback to avoid circular dependency
- vmtest.sh loads vsock_loopback module
- remove vsock_net_mode_can_set()
- change vsock_net_write_mode() to return true/false based on success
- make vsock_net_mode enum instead of u8
- Link to v4: https://lore.kernel.org/r/20250805-vsock-vmtest-v4-0-059ec51ab111@meta.com
Changes in v4:
- removed RFC tag
- implemented loopback support
- renamed new tests to better reflect behavior
- completed suite of tests with permutations of ns modes and vsock_test
as guest/host
- simplified socat bridging with unix socket instead of tcp + veth
- only use vsock_test for success case, socat for failure case (context
in commit message)
- lots of cleanup
Changes in v3:
- add notion of "modes"
- add procfs /proc/net/vsock_ns_mode
- local and global modes only
- no /dev/vhost-vsock-netns
- vmtest.sh already merged, so new patch just adds new tests for NS
- Link to v2:
https://lore.kernel.org/kvm/20250312-vsock-netns-v2-0-84bffa1aa97a@gmail.com
Changes in v2:
- only support vhost-vsock namespaces
- all g2h namespaces retain old behavior, only common API changes
impacted by vhost-vsock changes
- add /dev/vhost-vsock-netns for "opt-in"
- leave /dev/vhost-vsock to old behavior
- removed netns module param
- Link to v1:
https://lore.kernel.org/r/20200116172428.311437-1-sgarzare@redhat.com
Changes in v1:
- added 'netns' module param to vsock.ko to enable the
network namespace support (disabled by default)
- added 'vsock_net_eq()' to check the "net" assigned to a socket
only when 'netns' support is enabled
- Link to RFC: https://patchwork.ozlabs.org/cover/1202235/
---
Bobby Eshleman (13):
vsock: a per-net vsock NS mode state
vsock: add netns to vsock core
vsock: reject bad VSOCK_NET_MODE_LOCAL configuration for G2H
virtio: set skb owner of virtio_transport_reset_no_sock() reply
vsock: add netns support to virtio transports
selftests/vsock: add namespace helpers to vmtest.sh
selftests/vsock: prepare vm management helpers for namespaces
selftests/vsock: add vm_dmesg_{warn,oops}_count() helpers
selftests/vsock: use ss to wait for listeners instead of /proc/net
selftests/vsock: add tests for proc sys vsock ns_mode
selftests/vsock: add namespace tests for CID collisions
selftests/vsock: add tests for host <-> vm connectivity with namespaces
selftests/vsock: add tests for namespace deletion and mode changes
MAINTAINERS | 1 +
drivers/vhost/vsock.c | 57 +-
include/linux/virtio_vsock.h | 8 +-
include/net/af_vsock.h | 64 +-
include/net/net_namespace.h | 4 +
include/net/netns/vsock.h | 17 +
net/vmw_vsock/af_vsock.c | 290 ++++++++-
net/vmw_vsock/hyperv_transport.c | 6 +
net/vmw_vsock/virtio_transport.c | 29 +-
net/vmw_vsock/virtio_transport_common.c | 69 +-
net/vmw_vsock/vmci_transport.c | 12 +
net/vmw_vsock/vsock_loopback.c | 20 +-
tools/testing/selftests/vsock/vmtest.sh | 1087 +++++++++++++++++++++++++++++--
13 files changed, 1560 insertions(+), 104 deletions(-)
---
base-commit: 962ac5ca99a5c3e7469215bf47572440402dfd59
change-id: 20250325-vsock-vmtest-b3a21d2102c2
prerequisite-message-id: <20251022-vsock-selftests-fixes-and-improvements-v1-0-edeb179d6463(a)meta.com>
prerequisite-patch-id: a2eecc3851f2509ed40009a7cab6990c6d7cfff5
prerequisite-patch-id: 501db2100636b9c8fcb3b64b8b1df797ccbede85
prerequisite-patch-id: ba1a2f07398a035bc48ef72edda41888614be449
prerequisite-patch-id: fd5cc5445aca9355ce678e6d2bfa89fab8a57e61
prerequisite-patch-id: 795ab4432ffb0843e22b580374782e7e0d99b909
prerequisite-patch-id: 1499d263dc933e75366c09e045d2125ca39f7ddd
prerequisite-patch-id: f92d99bb1d35d99b063f818a19dcda999152d74c
prerequisite-patch-id: e3296f38cdba6d903e061cff2bbb3e7615e8e671
prerequisite-patch-id: bc4662b4710d302d4893f58708820fc2a0624325
prerequisite-patch-id: f8991f2e98c2661a706183fde6b35e2b8d9aedcf
prerequisite-patch-id: 44bf9ed69353586d284e5ee63d6fffa30439a698
prerequisite-patch-id: d50621bc630eeaf608bbaf260370c8dabf6326df
Best regards,
--
Bobby Eshleman <bobbyeshleman(a)meta.com>
'head' is defined as 'static struct robust_list_head' that stores the
local variable of 'struct lock_struct a' raising the Wdangling-pointer
warning.
robust_list.c: In function ���child_circular_list���:
robust_list.c:522:24: warning: storing the address of local variable
���a��� in ���head.list.next��� [-Wdangling-pointer=]
522 | head.list.next = &a.list;
| ~~~~~~~~~~~~~~~^~~~~~~~~
robust_list.c:513:28: note: ���a��� declared here
513 | struct lock_struct a, b, c;
| ^
robust_list.c:512:40: note: ���head��� declared here
512 | static struct robust_list_head head;
| ^~~~
Since 'head' doesn't need static storge duration, removing the static
keyword of it to fix this.
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux(a)gmail.com>
---
v3:
Updated the patch name and msg as suggested by Andr��.
v2: https://lore.kernel.org/all/20251118170907.108832-1-ankitkhushwaha.linux@gm…
Added changes suggested by Andr��.
v1: https://lore.kernel.org/all/20251118162619.50586-1-ankitkhushwaha.linux@gma…
---
tools/testing/selftests/futex/functional/robust_list.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/futex/functional/robust_list.c b/tools/testing/selftests/futex/functional/robust_list.c
index e7d1254e18ca..ef21a7ec9def 100644
--- a/tools/testing/selftests/futex/functional/robust_list.c
+++ b/tools/testing/selftests/futex/functional/robust_list.c
@@ -509,7 +509,7 @@ TEST(test_robust_list_multiple_elements)
static int child_circular_list(void *arg)
{
- static struct robust_list_head head;
+ struct robust_list_head head;
struct lock_struct a, b, c;
int ret;
--
2.52.0
netdev CI reserves SKIP in selftests for cases which can't be executed
due to setup issues, like missing or old commands. Tests which are
expected to fail must use XFAIL.
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
CC: kuniyu(a)google.com
CC: adelodunolaoluwa(a)yahoo.com
CC: shuah(a)kernel.org
CC: linux-kselftest(a)vger.kernel.org
---
tools/testing/selftests/net/af_unix/unix_connreset.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/af_unix/unix_connreset.c b/tools/testing/selftests/net/af_unix/unix_connreset.c
index bffef2b54bfd..6eb936207b31 100644
--- a/tools/testing/selftests/net/af_unix/unix_connreset.c
+++ b/tools/testing/selftests/net/af_unix/unix_connreset.c
@@ -161,8 +161,12 @@ TEST_F(unix_sock, reset_closed_embryo)
char buf[16] = {};
ssize_t n;
- if (variant->socket_type == SOCK_DGRAM)
- SKIP(return, "This test only applies to SOCK_STREAM and SOCK_SEQPACKET");
+ if (variant->socket_type == SOCK_DGRAM) {
+ snprintf(_metadata->results->reason,
+ sizeof(_metadata->results->reason),
+ "Test only applies to SOCK_STREAM and SOCK_SEQPACKET");
+ exit(KSFT_XFAIL);
+ }
/* Close server without accept()ing */
close(self->server);
--
2.51.1
This commit ensures that the required log level is set at the start of
the test iteration.
Part of the cleanup performed at the end of each test iteration resets
the log level (do_cleanup in lib_netcons.sh) to the values defined at the
time test script started. This may cause further test iterations to fail
if the default values are not sufficient.
Signed-off-by: Andre Carvalho <asantostc(a)gmail.com>
---
tools/testing/selftests/drivers/net/netcons_basic.sh | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
index a3446b569976..2022f3061738 100755
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ b/tools/testing/selftests/drivers/net/netcons_basic.sh
@@ -28,8 +28,6 @@ OUTPUT_FILE="/tmp/${TARGET}"
# Check for basic system dependency and exit if not found
check_for_dependencies
-# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
-echo "6 5" > /proc/sys/kernel/printk
# Remove the namespace, interfaces and netconsole target on exit
trap cleanup EXIT
@@ -39,6 +37,9 @@ do
for IP_VERSION in "ipv6" "ipv4"
do
echo "Running with target mode: ${FORMAT} (${IP_VERSION})"
+ # Set current loglevel to KERN_INFO(6), and default to
+ # KERN_NOTICE(5)
+ echo "6 5" > /proc/sys/kernel/printk
# Create one namespace and two interfaces
set_network "${IP_VERSION}"
# Create a dynamic target for netconsole
---
base-commit: e2c20036a8879476c88002730d8a27f4e3c32d4b
change-id: 20251121-netcons-basic-loglevel-69e2715c1029
Best regards,
--
Andre Carvalho <asantostc(a)gmail.com>
First patch here tries to auto-disable building the iouring sample.
Our CI will still run the iouring test(s), of course, but it looks
like the liburing updates aren't very quick in distroes and having
to hack around it when developing unrelated tests is a bit annoying.
Remaining 4 patches iron out running the Toeplitz hash test against
real NICs. I tested mlx5, bnxt and fbnic, they all pass now.
I switched to using YNL directly in the C code, can't see a reason
to get the info in Python and pass it to C via argv. The old code
likely did this because it predates YNL.
Jakub Kicinski (5):
selftests: hw-net: auto-disable building the iouring C code
selftests: hw-net: toeplitz: make sure NICs have pure Toeplitz
configured
selftests: hw-net: toeplitz: read the RSS key directly from C
selftests: hw-net: toeplitz: read indirection table from the device
selftests: hw-net: toeplitz: give the test up to 4 seconds
.../testing/selftests/drivers/net/hw/Makefile | 23 ++++++-
.../selftests/drivers/net/hw/toeplitz.c | 65 ++++++++++++++++++-
.../selftests/drivers/net/hw/toeplitz.py | 28 ++++----
3 files changed, 98 insertions(+), 18 deletions(-)
--
2.51.1
Since commit 31158ad02ddb ("rqspinlock: Add deadlock detection
and recovery") the updated path on re-entrancy now reports deadlock
via -EDEADLK instead of the previous -EBUSY.
Also, the way reentrancy was exercised (via fentry/lookup_elem_raw)
has been fragile because lookup_elem_raw may be inlined
(find_kernel_btf_id() will return -ESRCH).
To fix this fentry is attached to bpf_obj_free_fields() instead of
lookup_elem_raw() and:
- The htab map is made to use a BTF-described struct val with a
struct bpf_timer so that check_and_free_fields() reliably calls
bpf_obj_free_fields() on element replacement.
- The selftest is updated to do two updates to the same key (insert +
replace) in prog_test.
- The selftest is updated to align with expected errno with the
kernel’s current behavior.
Signed-off-by: Saket Kumar Bhaskar <skb99(a)linux.ibm.com>
---
Changes since v2:
Addressed CI failures:
* Initialize key to 0 before the first update.
* Used pointer value to pass for update and memset rather than
&value.
v2: https://lore.kernel.org/all/20251114152653.356782-1-skb99@linux.ibm.com/
Changes since v1:
Addressed comments from Alexei:
* Fixed the scenario where test may fail when lookup_elem_raw()
is inlined.
v1: https://lore.kernel.org/all/20251106052628.349117-1-skb99@linux.ibm.com/
.../selftests/bpf/prog_tests/htab_update.c | 37 ++++++++++++++-----
.../testing/selftests/bpf/progs/htab_update.c | 19 +++++++---
2 files changed, 41 insertions(+), 15 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/htab_update.c b/tools/testing/selftests/bpf/prog_tests/htab_update.c
index 2bc85f4814f4..d0b405eb2966 100644
--- a/tools/testing/selftests/bpf/prog_tests/htab_update.c
+++ b/tools/testing/selftests/bpf/prog_tests/htab_update.c
@@ -15,17 +15,17 @@ struct htab_update_ctx {
static void test_reenter_update(void)
{
struct htab_update *skel;
- unsigned int key, value;
+ void *value = NULL;
+ unsigned int key, value_size;
int err;
skel = htab_update__open();
if (!ASSERT_OK_PTR(skel, "htab_update__open"))
return;
- /* lookup_elem_raw() may be inlined and find_kernel_btf_id() will return -ESRCH */
- bpf_program__set_autoload(skel->progs.lookup_elem_raw, true);
+ bpf_program__set_autoload(skel->progs.bpf_obj_free_fields, true);
err = htab_update__load(skel);
- if (!ASSERT_TRUE(!err || err == -ESRCH, "htab_update__load") || err)
+ if (!ASSERT_TRUE(!err, "htab_update__load") || err)
goto out;
skel->bss->pid = getpid();
@@ -33,14 +33,33 @@ static void test_reenter_update(void)
if (!ASSERT_OK(err, "htab_update__attach"))
goto out;
- /* Will trigger the reentrancy of bpf_map_update_elem() */
+ value_size = bpf_map__value_size(skel->maps.htab);
+
+ value = calloc(1, value_size);
+ if (!ASSERT_OK_PTR(value, "calloc value"))
+ goto out;
+ /*
+ * First update: plain insert. This should NOT trigger the re-entrancy
+ * path, because there is no old element to free yet.
+ */
key = 0;
- value = 0;
- err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, &value, 0);
- if (!ASSERT_OK(err, "add element"))
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY);
+ if (!ASSERT_OK(err, "first update (insert)"))
+ goto out;
+
+ /*
+ * Second update: replace existing element with same key and trigger
+ * the reentrancy of bpf_map_update_elem().
+ * check_and_free_fields() calls bpf_obj_free_fields() on the old
+ * value, which is where fentry program runs and performs a nested
+ * bpf_map_update_elem(), triggering -EDEADLK.
+ */
+ memset(value, 0, value_size);
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.htab), &key, value, BPF_ANY);
+ if (!ASSERT_OK(err, "second update (replace)"))
goto out;
- ASSERT_EQ(skel->bss->update_err, -EBUSY, "no reentrancy");
+ ASSERT_EQ(skel->bss->update_err, -EDEADLK, "no reentrancy");
out:
htab_update__destroy(skel);
}
diff --git a/tools/testing/selftests/bpf/progs/htab_update.c b/tools/testing/selftests/bpf/progs/htab_update.c
index 7481bb30b29b..195d3b2fba00 100644
--- a/tools/testing/selftests/bpf/progs/htab_update.c
+++ b/tools/testing/selftests/bpf/progs/htab_update.c
@@ -6,24 +6,31 @@
char _license[] SEC("license") = "GPL";
+/* Map value type: has BTF-managed field (bpf_timer) */
+struct val {
+ struct bpf_timer t;
+ __u64 payload;
+};
+
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1);
- __uint(key_size, sizeof(__u32));
- __uint(value_size, sizeof(__u32));
+ __type(key, __u32);
+ __type(value, struct val);
} htab SEC(".maps");
int pid = 0;
int update_err = 0;
-SEC("?fentry/lookup_elem_raw")
-int lookup_elem_raw(void *ctx)
+SEC("?fentry/bpf_obj_free_fields")
+int bpf_obj_free_fields(void *ctx)
{
- __u32 key = 0, value = 1;
+ __u32 key = 0;
+ struct val value = { .payload = 1 };
if ((bpf_get_current_pid_tgid() >> 32) != pid)
return 0;
- update_err = bpf_map_update_elem(&htab, &key, &value, 0);
+ update_err = bpf_map_update_elem(&htab, &key, &value, BPF_ANY);
return 0;
}
--
2.51.0