Due to several issues which are unlikely to be fixed in the near future,
the access_tracking_perf_test sanity check for how many pages are still clean
after an iteration is not reliable when NUMA balancing is active.
This patch series refactors this test to skip this check by default automatically.
V2: adopted Sean's suggestions.
Best regards,
Maxim Levitsky
Maxim Levitsky (1):
KVM: selftests: access_tracking_perf_test: add option to skip the
sanity check
Sean Christopherson (1):
KVM: selftests: Extract guts of THP accessor to standalone sysfs
helpers
.../selftests/kvm/access_tracking_perf_test.c | 33 +++++++++++++--
.../testing/selftests/kvm/include/test_util.h | 1 +
tools/testing/selftests/kvm/lib/test_util.c | 42 ++++++++++++++-----
3 files changed, 61 insertions(+), 15 deletions(-)
--
2.26.3
With the switch over to nolibc the source file vdso_standalone_test_x86.c
was intended to be replaced with a symlink to vdso_test_gettimeofday.c.
This was the patch that was submitted to LKML, but during application the
symlink was replaced by a textual copy of the linked-to file.
Having two copies introduces the possibility of divergence and increases
maintenance burden, switch back to a symlink.
Link: https://lore.kernel.org/lkml/20250226-parse_vdso-nolibc-v2-16-28e14e031ed8@…
Fixes: 8770a9183fe1 ("selftests: vDSO: vdso_standalone_test_x86: Switch to nolibc")
Signed-off-by: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
---
If symlinks are problematic an #include shim would also work.
These are not handled really well by the kselftests build system though,
as #include dependencies are not tracked by it.
---
.../selftests/vDSO/vdso_standalone_test_x86.c | 59 +---------------------
1 file changed, 1 insertion(+), 58 deletions(-)
diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
deleted file mode 100644
index 9ce795b806f0992b83cef78c7e16fac0e54750da..0000000000000000000000000000000000000000
--- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and
- * vDSO gettimeofday()
- * Copyright (c) 2014 Andy Lutomirski
- *
- * Compile with:
- * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso_gettimeofday.c
- *
- * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too.
- */
-
-#include <stdio.h>
-#ifndef NOLIBC
-#include <sys/auxv.h>
-#include <sys/time.h>
-#endif
-
-#include "../kselftest.h"
-#include "parse_vdso.h"
-#include "vdso_config.h"
-#include "vdso_call.h"
-
-int main(int argc, char **argv)
-{
- const char *version = versions[VDSO_VERSION];
- const char **name = (const char **)&names[VDSO_NAMES];
-
- unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR);
- if (!sysinfo_ehdr) {
- printf("AT_SYSINFO_EHDR is not present!\n");
- return KSFT_SKIP;
- }
-
- vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR));
-
- /* Find gettimeofday. */
- typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
- gtod_t gtod = (gtod_t)vdso_sym(version, name[0]);
-
- if (!gtod) {
- printf("Could not find %s\n", name[0]);
- return KSFT_SKIP;
- }
-
- struct timeval tv;
- long ret = VDSO_CALL(gtod, 2, &tv, 0);
-
- if (ret == 0) {
- printf("The time is %lld.%06lld\n",
- (long long)tv.tv_sec, (long long)tv.tv_usec);
- } else {
- printf("%s failed\n", name[0]);
- return KSFT_FAIL;
- }
-
- return 0;
-}
diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
new file mode 120000
index 0000000000000000000000000000000000000000..4d3d96f1e440c965474681a6f35375a60b3921be
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
@@ -0,0 +1 @@
+vdso_test_gettimeofday.c
\ No newline at end of file
---
base-commit: 1e26c5e28ca5821a824e90dd359556f5e9e7b89f
change-id: 20250326-vdso-selftests-fix-vdso_standalone_test_x86-c3a77b57ccbd
Best regards,
--
Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
Context
=======
We've observed within Red Hat that isolated, NOHZ_FULL CPUs running a
pure-userspace application get regularly interrupted by IPIs sent from
housekeeping CPUs. Those IPIs are caused by activity on the housekeeping CPUs
leading to various on_each_cpu() calls, e.g.:
64359.052209596 NetworkManager 0 1405 smp_call_function_many_cond (cpu=0, func=do_kernel_range_flush)
smp_call_function_many_cond+0x1
smp_call_function+0x39
on_each_cpu+0x2a
flush_tlb_kernel_range+0x7b
__purge_vmap_area_lazy+0x70
_vm_unmap_aliases.part.42+0xdf
change_page_attr_set_clr+0x16a
set_memory_ro+0x26
bpf_int_jit_compile+0x2f9
bpf_prog_select_runtime+0xc6
bpf_prepare_filter+0x523
sk_attach_filter+0x13
sock_setsockopt+0x92c
__sys_setsockopt+0x16a
__x64_sys_setsockopt+0x20
do_syscall_64+0x87
entry_SYSCALL_64_after_hwframe+0x65
The heart of this series is the thought that while we cannot remove NOHZ_FULL
CPUs from the list of CPUs targeted by these IPIs, they may not have to execute
the callbacks immediately. Anything that only affects kernelspace can wait
until the next user->kernel transition, providing it can be executed "early
enough" in the entry code.
The original implementation is from Peter [1]. Nicolas then added kernel TLB
invalidation deferral to that [2], and I picked it up from there.
Deferral approach
=================
Storing each and every callback, like a secondary call_single_queue turned out
to be a no-go: the whole point of deferral is to keep NOHZ_FULL CPUs in
userspace for as long as possible - no signal of any form would be sent when
deferring an IPI. This means that any form of queuing for deferred callbacks
would end up as a convoluted memory leak.
Deferred IPIs must thus be coalesced, which this series achieves by assigning
IPIs a "type" and having a mapping of IPI type to callback, leveraged upon
kernel entry.
What about IPIs whose callback take a parameter, you may ask?
Peter suggested during OSPM23 [3] that since on_each_cpu() targets
housekeeping CPUs *and* isolated CPUs, isolated CPUs can access either global or
housekeeping-CPU-local state to "reconstruct" the data that would have been sent
via the IPI.
This series does not affect any IPI callback that requires an argument, but the
approach would remain the same (one coalescable callback executed on kernel
entry).
Kernel entry vs execution of the deferred operation
===================================================
This is what I've referred to as the "Danger Zone" during my LPC24 talk [4].
There is a non-zero length of code that is executed upon kernel entry before the
deferred operation can be itself executed (i.e. before we start getting into
context_tracking.c proper), i.e.:
idtentry_func_foo() <--- we're in the kernel
irqentry_enter()
enter_from_user_mode()
__ct_user_exit()
ct_kernel_enter_state()
ct_work_flush() <--- deferred operation is executed here
This means one must take extra care to what can happen in the early entry code,
and that <bad things> cannot happen. For instance, we really don't want to hit
instructions that have been modified by a remote text_poke() while we're on our
way to execute a deferred sync_core(). Patches doing the actual deferral have
more detail on this.
Patches
=======
o Patches 1-2 are standalone objtool cleanups.
o Patches 3-4 add an RCU testing feature.
o Patches 5-6 add infrastructure for annotating static keys and static calls
that may be used in noinstr code (courtesy of Josh).
o Patches 7-19 use said annotations on relevant keys / calls.
o Patch 20 enforces proper usage of said annotations (courtesy of Josh).
o Patches 21-23 fiddle with CT_STATE* within context tracking
o Patches 24-29 add the actual IPI deferral faff
o Patch 30 adds a freebie: deferring IPIs for NOHZ_IDLE. Not tested that much!
if you care about battery-powered devices and energy consumption, go give it
a try!
Patches are also available at:
https://gitlab.com/vschneid/linux.git -b redhat/isolirq/defer/v4
Stuff I'd like eyes and neurons on
==================================
Context-tracking vs idle. Patch 22 "improves" the situation by adding an
IDLE->KERNEL transition when getting an IRQ while idle, but it leaves the
following window:
~> IRQ
ct_nmi_enter()
state = state + CT_STATE_KERNEL - CT_STATE_IDLE
[...]
ct_nmi_exit()
state = state - CT_STATE_KERNEL + CT_STATE_IDLE
[...] /!\ CT_STATE_IDLE here while we're really in kernelspace! /!\
ct_cpuidle_exit()
state = state + CT_STATE_KERNEL - CT_STATE_IDLE
Said window is contained within cpu_idle_poll() and the cpuidle call within
cpuidle_enter_state(), both being noinstr (the former is __cpuidle which is
noinstr itself). Thus objtool will consider it as early entry and will warn
accordingly of any static key / call misuse, so the damage is somewhat
contained, but it's not ideal.
I tried fiddling with this but idle polling likes being annoying, as it is
shaped like so:
ct_cpuidle_enter();
raw_local_irq_enable();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
raw_local_irq_disable();
ct_cpuidle_exit();
IOW, getting an IRQ that doesn't end up setting NEED_RESCHED while idle-polling
doesn't come near ct_cpuidle_exit(), which prevents me from having the outermost
ct_nmi_exit() leave the state as CT_STATE_KERNEL (rather than CT_STATE_IDLE).
Testing
=======
Xeon E5-2699 system with SMToff, NOHZ_FULL, isolated CPUs.
RHEL9 userspace.
Workload is using rteval (kernel compilation + hackbench) on housekeeping CPUs
and a dummy stay-in-userspace loop on the isolated CPUs. The main invocation is:
$ trace-cmd record -e "csd_queue_cpu" -f "cpu & CPUS{$ISOL_CPUS}" \
-e "ipi_send_cpumask" -f "cpumask & CPUS{$ISOL_CPUS}" \
-e "ipi_send_cpu" -f "cpu & CPUS{$ISOL_CPUS}" \
rteval --onlyload --loads-cpulist=$HK_CPUS \
--hackbench-runlowmem=True --duration=$DURATION
This only records IPIs sent to isolated CPUs, so any event there is interference
(with a bit of fuzz at the start/end of the workload when spawning the
processes). All tests were done with a duration of 6 hours.
v6.13-rc6
# This is the actual IPI count
$ trace-cmd report | grep callback | awk '{ print $(NF) }' | sort | uniq -c | sort -nr
531 callback=generic_smp_call_function_single_interrupt+0x0
# These are the different CSD's that caused IPIs
$ trace-cmd report | grep csd_queue | awk '{ print $(NF-1) }' | sort | uniq -c | sort -nr
12818 func=do_flush_tlb_all
910 func=do_kernel_range_flush
78 func=do_sync_core
v6.13-rc6 + patches:
# This is the actual IPI count
$ trace-cmd report | grep callback | awk '{ print $(NF) }' | sort | uniq -c | sort -nr
# Zilch!
# These are the different CSD's that caused IPIs
$ trace-cmd report | grep csd_queue | awk '{ print $(NF-1) }' | sort | uniq -c | sort -nr
# Nada!
Note that tlb_remove_table_smp_sync() showed up during testing of v3, and has
gone as mysteriously as it showed up. Yair had a series adressing this [5] which
would be worth revisiting.
Acknowledgements
================
Special thanks to:
o Clark Williams for listening to my ramblings about this and throwing ideas my way
o Josh Poimboeuf for all his help with everything objtool-related
o All of the folks who attended various (too many?) talks about this and
provided precious feedback.
Links
=====
[1]: https://lore.kernel.org/all/20210929151723.162004989@infradead.org/
[2]: https://github.com/vianpl/linux.git -b ct-work-defer-wip
[3]: https://youtu.be/0vjE6fjoVVE
[4]: https://lpc.events/event/18/contributions/1889/
[5]: https://lore.kernel.org/lkml/20230620144618.125703-1-ypodemsk@redhat.com/
Revisions
=========
RFCv3 -> v4
++++++++++++++
o Rebased onto v6.13-rc6
o New objtool patches from Josh
o More .noinstr static key/call patches
o Static calls now handled as well (again thanks to Josh)
o Fixed clearing the work bits on kernel exit
o Messed with IRQ hitting an idle CPU vs context tracking
o Various comment and naming cleanups
o Made RCU_DYNTICKS_TORTURE depend on !COMPILE_TEST (PeterZ)
o Fixed the CT_STATE_KERNEL check when setting a deferred work (Frederic)
o Cleaned up the __flush_tlb_all() mess thanks to PeterZ
RFCv2 -> RFCv3
++++++++++++++
o Rebased onto v6.12-rc6
o Added objtool documentation for the new warning (Josh)
o Added low-size RCU watching counter to TREE04 torture scenario (Paul)
o Added FORCEFUL jump label and static key types
o Added noinstr-compliant helpers for tlb flush deferral
RFCv1 -> RFCv2
++++++++++++++
o Rebased onto v6.5-rc1
o Updated the trace filter patches (Steven)
o Fixed __ro_after_init keys used in modules (Peter)
o Dropped the extra context_tracking atomic, squashed the new bits in the
existing .state field (Peter, Frederic)
o Added an RCU_EXPERT config for the RCU dynticks counter size, and added an
rcutorture case for a low-size counter (Paul)
o Fixed flush_tlb_kernel_range_deferrable() definition
Josh Poimboeuf (3):
jump_label: Add annotations for validating noinstr usage
static_call: Add read-only-after-init static calls
objtool: Add noinstr validation for static branches/calls
Peter Zijlstra (1):
x86,tlb: Make __flush_tlb_global() noinstr-compliant
Valentin Schneider (26):
objtool: Make validate_call() recognize indirect calls to pv_ops[]
objtool: Flesh out warning related to pv_ops[] calls
rcu: Add a small-width RCU watching counter debug option
rcutorture: Make TREE04 use CONFIG_RCU_DYNTICKS_TORTURE
x86/paravirt: Mark pv_sched_clock static call as __ro_after_init
x86/idle: Mark x86_idle static call as __ro_after_init
x86/paravirt: Mark pv_steal_clock static call as __ro_after_init
riscv/paravirt: Mark pv_steal_clock static call as __ro_after_init
loongarch/paravirt: Mark pv_steal_clock static call as __ro_after_init
arm64/paravirt: Mark pv_steal_clock static call as __ro_after_init
arm/paravirt: Mark pv_steal_clock static call as __ro_after_init
perf/x86/amd: Mark perf_lopwr_cb static call as __ro_after_init
sched/clock: Mark sched_clock_running key as __ro_after_init
x86/speculation/mds: Mark mds_idle_clear key as allowed in .noinstr
sched/clock, x86: Mark __sched_clock_stable key as allowed in .noinstr
x86/kvm/vmx: Mark vmx_l1d_should flush and vmx_l1d_flush_cond keys as
allowed in .noinstr
stackleack: Mark stack_erasing_bypass key as allowed in .noinstr
context_tracking: Explicitely use CT_STATE_KERNEL where it is missing
context_tracking: Exit CT_STATE_IDLE upon irq/nmi entry
context_tracking: Turn CT_STATE_* into bits
context-tracking: Introduce work deferral infrastructure
context_tracking,x86: Defer kernel text patching IPIs
x86/tlb: Make __flush_tlb_local() noinstr-compliant
x86/tlb: Make __flush_tlb_all() noinstr
x86/mm, mm/vmalloc: Defer flush_tlb_kernel_range() targeting NOHZ_FULL
CPUs
context-tracking: Add a Kconfig to enable IPI deferral for NO_HZ_IDLE
arch/Kconfig | 9 ++
arch/arm/kernel/paravirt.c | 2 +-
arch/arm64/kernel/paravirt.c | 2 +-
arch/loongarch/kernel/paravirt.c | 2 +-
arch/riscv/kernel/paravirt.c | 2 +-
arch/x86/Kconfig | 1 +
arch/x86/events/amd/brs.c | 2 +-
arch/x86/include/asm/context_tracking_work.h | 22 ++++
arch/x86/include/asm/invpcid.h | 13 +--
arch/x86/include/asm/paravirt.h | 4 +-
arch/x86/include/asm/text-patching.h | 1 +
arch/x86/include/asm/tlbflush.h | 3 +-
arch/x86/include/asm/xen/hypercall.h | 11 +-
arch/x86/kernel/alternative.c | 38 ++++++-
arch/x86/kernel/cpu/bugs.c | 9 +-
arch/x86/kernel/kprobes/core.c | 4 +-
arch/x86/kernel/kprobes/opt.c | 4 +-
arch/x86/kernel/module.c | 2 +-
arch/x86/kernel/paravirt.c | 4 +-
arch/x86/kernel/process.c | 2 +-
arch/x86/kvm/vmx/vmx.c | 11 +-
arch/x86/mm/tlb.c | 46 ++++++--
arch/x86/xen/mmu_pv.c | 10 +-
arch/x86/xen/xen-ops.h | 12 +-
include/asm-generic/sections.h | 15 +++
include/linux/context_tracking.h | 21 ++++
include/linux/context_tracking_state.h | 64 +++++++++--
include/linux/context_tracking_work.h | 28 +++++
include/linux/jump_label.h | 30 ++++-
include/linux/objtool.h | 7 ++
include/linux/static_call.h | 19 ++++
kernel/context_tracking.c | 98 ++++++++++++++--
kernel/rcu/Kconfig.debug | 15 +++
kernel/sched/clock.c | 7 +-
kernel/stackleak.c | 6 +-
kernel/time/Kconfig | 19 ++++
mm/vmalloc.c | 35 +++++-
tools/objtool/Documentation/objtool.txt | 34 ++++++
tools/objtool/check.c | 106 +++++++++++++++---
tools/objtool/include/objtool/check.h | 1 +
tools/objtool/include/objtool/elf.h | 1 +
tools/objtool/include/objtool/special.h | 1 +
tools/objtool/special.c | 18 ++-
.../selftests/rcutorture/configs/rcu/TREE04 | 1 +
44 files changed, 635 insertions(+), 107 deletions(-)
create mode 100644 arch/x86/include/asm/context_tracking_work.h
create mode 100644 include/linux/context_tracking_work.h
--
2.43.0
This patch set convert iptables to nftables for wireguard testing, as
iptables is deparated and nftables is the default framework of most releases.
v5: remove the counter in nft rules and link nft statically (Jason A. Donenfeld)
v4: no update, just re-send
v3: drop iptables directly (Jason A. Donenfeld)
Also convert to using nft for qemu testing (Jason A. Donenfeld)
v2: use one nft table for testing (Phil Sutter)
Hangbin Liu (2):
wireguard: selftests: convert iptables to nft
wireguard: selftests: update to using nft for qemu test
tools/testing/selftests/wireguard/netns.sh | 29 +++++++++------
.../testing/selftests/wireguard/qemu/Makefile | 36 ++++++++++++++-----
.../selftests/wireguard/qemu/kernel.config | 7 ++--
3 files changed, 49 insertions(+), 23 deletions(-)
--
2.46.0
Hello
I am keen to exchange thoughts on financial opportunities with you. If it
aligns with your preferences, I would appreciate your WhatsApp contact
details for a more fluid dialogue. Should that not suit, please let me know
a convenient time for us to correspond directly.
Thank you for your kind consideration. I eagerly anticipate your reply.
Warm regards,
General Johanne
This started with a patch that enabled `clippy::ptr_as_ptr`. Benno
Lossin suggested I also look into `clippy::ptr_cast_constness` and I
discovered `clippy::as_ptr_cast_mut`. This series now enables all 3
lints. It also enables `clippy::as_underscore` which ensures other
pointer casts weren't missed. The first commit reduces the need for
pointer casts and is shared with another series[1].
The final patch also enables pointer provenance lints and fixes
violations. See that commit message for details. The build system
portion of that commit is pretty messy but I couldn't find a better way
to convincingly ensure that these lints were applied globally.
Suggestions would be very welcome.
Link: https://lore.kernel.org/all/20250307-no-offset-v1-0-0c728f63b69c@gmail.com/ [1]
Signed-off-by: Tamir Duberstein <tamird(a)gmail.com>
---
Changes in v5:
- Use `pointer::addr` in OF. (Boqun Feng)
- Add documentation on stubs. (Benno Lossin)
- Mark stubs `#[inline]`.
- Pick up Alice's RB on a shared commit from
https://lore.kernel.org/all/Z9f-3Aj3_FWBZRrm@google.com/.
- Link to v4: https://lore.kernel.org/r/20250315-ptr-as-ptr-v4-0-b2d72c14dc26@gmail.com
Changes in v4:
- Add missing SoB. (Benno Lossin)
- Use `without_provenance_mut` in alloc. (Boqun Feng)
- Limit strict provenance lints to the `kernel` crate to avoid complex
logic in the build system. This can be revisited on MSRV >= 1.84.0.
- Rebase on rust-next.
- Link to v3: https://lore.kernel.org/r/20250314-ptr-as-ptr-v3-0-e7ba61048f4a@gmail.com
Changes in v3:
- Fixed clippy warning in rust/kernel/firmware.rs. (kernel test robot)
Link: https://lore.kernel.org/all/202503120332.YTCpFEvv-lkp@intel.com/
- s/as u64/as bindings::phys_addr_t/g. (Benno Lossin)
- Use strict provenance APIs and enable lints. (Benno Lossin)
- Link to v2: https://lore.kernel.org/r/20250309-ptr-as-ptr-v2-0-25d60ad922b7@gmail.com
Changes in v2:
- Fixed typo in first commit message.
- Added additional patches, converted to series.
- Link to v1: https://lore.kernel.org/r/20250307-ptr-as-ptr-v1-1-582d06514c98@gmail.com
---
Tamir Duberstein (6):
rust: retain pointer mut-ness in `container_of!`
rust: enable `clippy::ptr_as_ptr` lint
rust: enable `clippy::ptr_cast_constness` lint
rust: enable `clippy::as_ptr_cast_mut` lint
rust: enable `clippy::as_underscore` lint
rust: use strict provenance APIs
Makefile | 4 ++
init/Kconfig | 3 +
rust/bindings/lib.rs | 1 +
rust/kernel/alloc.rs | 2 +-
rust/kernel/alloc/allocator_test.rs | 2 +-
rust/kernel/alloc/kvec.rs | 4 +-
rust/kernel/block/mq/operations.rs | 2 +-
rust/kernel/block/mq/request.rs | 7 +-
rust/kernel/device.rs | 5 +-
rust/kernel/device_id.rs | 2 +-
rust/kernel/devres.rs | 19 +++---
rust/kernel/error.rs | 2 +-
rust/kernel/firmware.rs | 3 +-
rust/kernel/fs/file.rs | 2 +-
rust/kernel/io.rs | 16 ++---
rust/kernel/kunit.rs | 15 ++---
rust/kernel/lib.rs | 113 ++++++++++++++++++++++++++++++++-
rust/kernel/list/impl_list_item_mod.rs | 2 +-
rust/kernel/miscdevice.rs | 2 +-
rust/kernel/of.rs | 6 +-
rust/kernel/pci.rs | 15 +++--
rust/kernel/platform.rs | 6 +-
rust/kernel/print.rs | 11 ++--
rust/kernel/rbtree.rs | 23 +++----
rust/kernel/seq_file.rs | 3 +-
rust/kernel/str.rs | 18 ++----
rust/kernel/sync/poll.rs | 2 +-
rust/kernel/uaccess.rs | 12 ++--
rust/kernel/workqueue.rs | 12 ++--
rust/uapi/lib.rs | 1 +
30 files changed, 218 insertions(+), 97 deletions(-)
---
base-commit: 498f7ee4773f22924f00630136da8575f38954e8
change-id: 20250307-ptr-as-ptr-21b1867fc4d4
Best regards,
--
Tamir Duberstein <tamird(a)gmail.com>
Hi,
"kci_test_bridge_parent_id" test failed with error "as device can not be
enslaved while up".
Here is Error log.
-------------------
> ./rtnetlink.sh -t kci_test_bridge_parent_id -v
COMMAND: ip link add name test-dummy0 type dummy
COMMAND: ip link set test-dummy0 up
COMMAND: modprobe -q netdevsim
COMMAND: ip link add name test-bond0 type bond mode 802.3ad
COMMAND: ip link set dev eni10np1 master test-bond0
Error: Device can not be enslaved while up.
COMMAND: ip link set dev eni20np1 master test-bond0
Error: Device can not be enslaved while up.
COMMAND: ip link add name test-br0 type bridge
COMMAND: ip link set dev test-bond0 master test-br0
FAIL: bridge_parent_id
-------------------
upstream commit ec4ffd100ffb ("Revert "net: rtnetlink: Enslave device
before bringing it up""), suggest the following scenario!
$ ip link set dummy0 up
$ ip link set dummy0 master bond0 down
According to last commit, do we need to modify
"kci_test_bridge_parent_id" test set to down.
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -1129,8 +1129,8 @@ kci_test_bridge_parent_id()
dev10=`ls ${sysfsnet}10/net/`
dev20=`ls ${sysfsnet}20/net/`
run_cmd ip link add name test-bond0 type bond mode 802.3ad
- run_cmd ip link set dev $dev10 master test-bond0
- run_cmd ip link set dev $dev20 master test-bond0
+ run_cmd ip link set dev $dev10 master test-bond0 down
+ run_cmd ip link set dev $dev20 master test-bond0 down
run_cmd ip link add name test-br0 type bridge
Success log with modified test case
> ./rtnetlink.sh -t kci_test_bridge_parent_id -v
COMMAND: ip link add name test-dummy0 type dummy
COMMAND: ip link set test-dummy0 up
COMMAND: modprobe -q netdevsim
COMMAND: ip link add name test-bond0 type bond mode 802.3ad
COMMAND: ip link set dev eni10np1 master test-bond0 down
COMMAND: ip link set dev eni20np1 master test-bond0 down
COMMAND: ip link add name test-br0 type bridge
COMMAND: ip link set dev test-bond0 master test-br0
PASS: bridge_parent_id
Thanks,
Alok
Set ret = 0 on successful completion of the processing loop in
cs_dsp_load() and cs_dsp_load_coeff() to ensure that the function
returns 0 on success.
All normal firmware files will have at least one data block, and
processing this block will set ret == 0, from the result of either
regmap_raw_write() or cs_dsp_parse_coeff().
The kunit tests create a dummy firmware file that contains only the
header, without any data blocks. This gives cs_dsp a file to "load"
that will not cause any side-effects. As there aren't any data blocks,
the processing loop will not set ret == 0.
Originally there was a line after the processing loop:
ret = regmap_async_complete(regmap);
which would set ret == 0 before the function returned.
Commit fe08b7d5085a ("firmware: cs_dsp: Remove async regmap writes")
changed the regmap write to a normal sync write, so the call to
regmap_async_complete() wasn't necessary and was removed. It was
overlooked that the ret here wasn't only to check the result of
regmap_async_complete(), it also set the final return value of the
function.
Fixes: fe08b7d5085a ("firmware: cs_dsp: Remove async regmap writes")
Signed-off-by: Richard Fitzgerald <rf(a)opensource.cirrus.com>
---
drivers/firmware/cirrus/cs_dsp.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index 42433c19eb30..560724ce21aa 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -1631,6 +1631,7 @@ static int cs_dsp_load(struct cs_dsp *dsp, const struct firmware *firmware,
cs_dsp_debugfs_save_wmfwname(dsp, file);
+ ret = 0;
out_fw:
cs_dsp_buf_free(&buf_list);
@@ -2338,6 +2339,7 @@ static int cs_dsp_load_coeff(struct cs_dsp *dsp, const struct firmware *firmware
cs_dsp_debugfs_save_binname(dsp, file);
+ ret = 0;
out_fw:
cs_dsp_buf_free(&buf_list);
--
2.43.0
This patchset add ftrace helpers functions and
add a new test makes sure that ftrace can trace
a function that was introduced by a livepatch.
Signed-off-by: Filipe Xavier <felipeaggger(a)gmail.com>
Suggested-by: Marcos Paulo de Souza <mpdesouza(a)suse.com>
Reviewed-by: Marcos Paulo de Souza <mpdesouza(a)suse.com>
Acked-by: Miroslav Benes <mbenes(a)suse.cz>
---
Changes in v3:
- functions.sh: fixed sed to remove warning from shellcheck and add grep -Fw params.
- test-ftrace.sh: change constant to use common SYSFS_KLP_DIR.
- Link to v2: https://lore.kernel.org/r/20250318-ftrace-sftest-livepatch-v2-0-60cb0aa95cc…
Changes in v2:
- functions.sh: change check traced function to accept a list of functions.
- Link to v1: https://lore.kernel.org/r/20250306-ftrace-sftest-livepatch-v1-0-a6f1dfc30e1…
---
Filipe Xavier (2):
selftests: livepatch: add new ftrace helpers functions
selftests: livepatch: test if ftrace can trace a livepatched function
tools/testing/selftests/livepatch/functions.sh | 49 ++++++++++++++++++++++++
tools/testing/selftests/livepatch/test-ftrace.sh | 34 ++++++++++++++++
2 files changed, 83 insertions(+)
---
base-commit: 848e076317446f9c663771ddec142d7c2eb4cb43
change-id: 20250306-ftrace-sftest-livepatch-60d9dc472235
Best regards,
--
Filipe Xavier <felipeaggger(a)gmail.com>
Bugfixes for a few issues I ran into.
Signed-off-by: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
---
Thomas Weißschuh (3):
selftests: vDSO: chacha: Correctly skip test if necessary
selftests: vDSO: chacha: Include asm/hwcap.h for arm64
selftests: vDSO: chacha: Provide default definition of HWCAP_S390_VXRS
tools/testing/selftests/vDSO/vdso_test_chacha.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
---
base-commit: 586de92313fcab8ed84ac5f78f4d2aae2db92c59
change-id: 20250324-s390-vdso-hwcap-0914c0f7c7f1
Best regards,
--
Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
This started with a patch that enabled `clippy::ptr_as_ptr`. Benno
Lossin suggested I also look into `clippy::ptr_cast_constness` and I
discovered `clippy::as_ptr_cast_mut`. This series now enables all 3
lints. It also enables `clippy::as_underscore` which ensures other
pointer casts weren't missed. The first commit reduces the need for
pointer casts and is shared with another series[1].
As a late addition, `clippy::cast_lossless` is also enabled.
Link: https://lore.kernel.org/all/20250307-no-offset-v1-0-0c728f63b69c@gmail.com/ [1]
Signed-off-by: Tamir Duberstein <tamird(a)gmail.com>
---
Changes in v6:
- Drop strict provenance patch.
- Fix URLs in doc comments.
- Add patch to enable `clippy::cast_lossless`.
- Rebase on rust-next.
- Link to v5: https://lore.kernel.org/r/20250317-ptr-as-ptr-v5-0-5b5f21fa230a@gmail.com
Changes in v5:
- Use `pointer::addr` in OF. (Boqun Feng)
- Add documentation on stubs. (Benno Lossin)
- Mark stubs `#[inline]`.
- Pick up Alice's RB on a shared commit from
https://lore.kernel.org/all/Z9f-3Aj3_FWBZRrm@google.com/.
- Link to v4: https://lore.kernel.org/r/20250315-ptr-as-ptr-v4-0-b2d72c14dc26@gmail.com
Changes in v4:
- Add missing SoB. (Benno Lossin)
- Use `without_provenance_mut` in alloc. (Boqun Feng)
- Limit strict provenance lints to the `kernel` crate to avoid complex
logic in the build system. This can be revisited on MSRV >= 1.84.0.
- Rebase on rust-next.
- Link to v3: https://lore.kernel.org/r/20250314-ptr-as-ptr-v3-0-e7ba61048f4a@gmail.com
Changes in v3:
- Fixed clippy warning in rust/kernel/firmware.rs. (kernel test robot)
Link: https://lore.kernel.org/all/202503120332.YTCpFEvv-lkp@intel.com/
- s/as u64/as bindings::phys_addr_t/g. (Benno Lossin)
- Use strict provenance APIs and enable lints. (Benno Lossin)
- Link to v2: https://lore.kernel.org/r/20250309-ptr-as-ptr-v2-0-25d60ad922b7@gmail.com
Changes in v2:
- Fixed typo in first commit message.
- Added additional patches, converted to series.
- Link to v1: https://lore.kernel.org/r/20250307-ptr-as-ptr-v1-1-582d06514c98@gmail.com
---
Tamir Duberstein (6):
rust: retain pointer mut-ness in `container_of!`
rust: enable `clippy::ptr_as_ptr` lint
rust: enable `clippy::ptr_cast_constness` lint
rust: enable `clippy::as_ptr_cast_mut` lint
rust: enable `clippy::as_underscore` lint
rust: enable `clippy::cast_lossless` lint
Makefile | 5 +++++
drivers/gpu/drm/drm_panic_qr.rs | 10 +++++-----
rust/bindings/lib.rs | 1 +
rust/kernel/alloc/allocator_test.rs | 2 +-
rust/kernel/alloc/kvec.rs | 4 ++--
rust/kernel/block/mq/operations.rs | 2 +-
rust/kernel/block/mq/request.rs | 7 ++++---
rust/kernel/device.rs | 5 +++--
rust/kernel/device_id.rs | 2 +-
rust/kernel/devres.rs | 19 ++++++++++---------
rust/kernel/dma.rs | 6 +++---
rust/kernel/error.rs | 2 +-
rust/kernel/firmware.rs | 3 ++-
rust/kernel/fs/file.rs | 2 +-
rust/kernel/io.rs | 18 +++++++++---------
rust/kernel/kunit.rs | 15 +++++++--------
rust/kernel/lib.rs | 5 ++---
rust/kernel/list/impl_list_item_mod.rs | 2 +-
rust/kernel/miscdevice.rs | 2 +-
rust/kernel/net/phy.rs | 4 ++--
rust/kernel/of.rs | 6 +++---
rust/kernel/pci.rs | 13 ++++++++-----
rust/kernel/platform.rs | 6 ++++--
rust/kernel/print.rs | 11 +++++------
rust/kernel/rbtree.rs | 23 ++++++++++-------------
rust/kernel/seq_file.rs | 3 ++-
rust/kernel/str.rs | 10 +++++-----
rust/kernel/sync/poll.rs | 2 +-
rust/kernel/workqueue.rs | 12 ++++++------
rust/uapi/lib.rs | 1 +
30 files changed, 107 insertions(+), 96 deletions(-)
---
base-commit: 28bb48c4cb34f65a9aa602142e76e1426da31293
change-id: 20250307-ptr-as-ptr-21b1867fc4d4
Best regards,
--
Tamir Duberstein <tamird(a)gmail.com>
This patchset add ftrace helpers functions and
add a new test makes sure that ftrace can trace
a function that was introduced by a livepatch.
Signed-off-by: Filipe Xavier <felipeaggger(a)gmail.com>
Suggested-by: Marcos Paulo de Souza <mpdesouza(a)suse.com>
Reviewed-by: Marcos Paulo de Souza <mpdesouza(a)suse.com>
---
Changes in v2:
- functions.sh: change check traced function to accept a list of functions.
- Link to v1: https://lore.kernel.org/r/20250306-ftrace-sftest-livepatch-v1-0-a6f1dfc30e1…
---
Filipe Xavier (2):
selftests: livepatch: add new ftrace helpers functions
selftests: livepatch: test if ftrace can trace a livepatched function
tools/testing/selftests/livepatch/functions.sh | 49 ++++++++++++++++++++++++
tools/testing/selftests/livepatch/test-ftrace.sh | 34 ++++++++++++++++
2 files changed, 83 insertions(+)
---
base-commit: 848e076317446f9c663771ddec142d7c2eb4cb43
change-id: 20250306-ftrace-sftest-livepatch-60d9dc472235
Best regards,
--
Filipe Xavier <felipeaggger(a)gmail.com>
v7: https://lore.kernel.org/netdev/20250227041209.2031104-1-almasrymina@google.…
===
Changelog:
- Check the dmabuf net_iov binding belongs to the device the TX is going
out on. (Jakub)
- Provide detailed inspection of callsites of
__skb_frag_ref/skb_page_unref in patch 2's changelog (Jakub)
v6: https://lore.kernel.org/netdev/20250222191517.743530-1-almasrymina@google.c…
===
v6 has no major changes. Addressed a few issues from Paolo and David,
and collected Acks from Stan. Thank you everyone for the review!
Changes:
- retain behavior to process MSG_FASTOPEN even if the provided cmsg is
invalid (Paolo).
- Rework the freeing of tx_vec slightly (it now has its own err label).
(Paolo).
- Squash the commit that makes dmabuf unbinding scheduled work into the
same one which implements the TX path so we don't run into future
errors on bisecting (Paolo).
- Fix/add comments to explain how dmabuf binding refcounting works
(David).
v5: https://lore.kernel.org/netdev/20250220020914.895431-1-almasrymina@google.c…
===
v5 has no major changes; it clears up the relatively minor issues
pointed out to in v4, and rebases the series on top of net-next to
resolve the conflict with a patch that raced to the tree. It also
collects the review tags from v4.
Changes:
- Rebase to net-next
- Fix issues in selftest (Stan).
- Address comments in the devmem and netmem driver docs (Stan and Bagas)
- Fix zerocopy_fill_skb_from_devmem return error code (Stan).
v4: https://lore.kernel.org/netdev/20250203223916.1064540-1-almasrymina@google.…
===
v4 mainly addresses the critical driver support issue surfaced in v3 by
Paolo and Stan. Drivers aiming to support netmem_tx should make sure not
to pass the netmem dma-addrs to the dma-mapping APIs, as these dma-addrs
may come from dma-bufs.
Additionally other feedback from v3 is addressed.
Major changes:
- Add helpers to handle netmem dma-addrs. Add GVE support for
netmem_tx.
- Fix binding->tx_vec not being freed on error paths during the
tx binding.
- Add a minimal devmem_tx test to devmem.py.
- Clean up everything obsolete from the cover letter (Paolo).
v3: https://patchwork.kernel.org/project/netdevbpf/list/?series=929401&state=*
===
Address minor comments from RFCv2 and fix a few build warnings and
ynl-regen issues. No major changes.
RFC v2: https://patchwork.kernel.org/project/netdevbpf/list/?series=920056&state=*
=======
RFC v2 addresses much of the feedback from RFC v1. I plan on sending
something close to this as net-next reopens, sending it slightly early
to get feedback if any.
Major changes:
--------------
- much improved UAPI as suggested by Stan. We now interpret the iov_base
of the passed in iov from userspace as the offset into the dmabuf to
send from. This removes the need to set iov.iov_base = NULL which may
be confusing to users, and enables us to send multiple iovs in the
same sendmsg() call. ncdevmem and the docs show a sample use of that.
- Removed the duplicate dmabuf iov_iter in binding->iov_iter. I think
this is good improvment as it was confusing to keep track of
2 iterators for the same sendmsg, and mistracking both iterators
caused a couple of bugs reported in the last iteration that are now
resolved with this streamlining.
- Improved test coverage in ncdevmem. Now multiple sendmsg() are tested,
and sending multiple iovs in the same sendmsg() is tested.
- Fixed issue where dmabuf unmapping was happening in invalid context
(Stan).
====================================================================
The TX path had been dropped from the Device Memory TCP patch series
post RFCv1 [1], to make that series slightly easier to review. This
series rebases the implementation of the TX path on top of the
net_iov/netmem framework agreed upon and merged. The motivation for
the feature is thoroughly described in the docs & cover letter of the
original proposal, so I don't repeat the lengthy descriptions here, but
they are available in [1].
Full outline on usage of the TX path is detailed in the documentation
included with this series.
Test example is available via the kselftest included in the series as well.
The series is relatively small, as the TX path for this feature largely
piggybacks on the existing MSG_ZEROCOPY implementation.
Patch Overview:
---------------
1. Documentation & tests to give high level overview of the feature
being added.
1. Add netmem refcounting needed for the TX path.
2. Devmem TX netlink API.
3. Devmem TX net stack implementation.
4. Make dma-buf unbinding scheduled work to handle TX cases where it gets
freed from contexts where we can't sleep.
5. Add devmem TX documentation.
6. Add scaffolding enabling driver support for netmem_tx. Add helpers, driver
feature flag, and docs to enable drivers to declare netmem_tx support.
7. Guard netmem_tx against being enabled against drivers that don't
support it.
8. Add devmem_tx selftests. Add TX path to ncdevmem and add a test to
devmem.py.
Testing:
--------
Testing is very similar to devmem TCP RX path. The ncdevmem test used
for the RX path is now augemented with client functionality to test TX
path.
* Test Setup:
Kernel: net-next with this RFC and memory provider API cherry-picked
locally.
Hardware: Google Cloud A3 VMs.
NIC: GVE with header split & RSS & flow steering support.
Performance results are not included with this version, unfortunately.
I'm having issues running the dma-buf exporter driver against the
upstream kernel on my test setup. The issues are specific to that
dma-buf exporter and do not affect this patch series. I plan to follow
up this series with perf fixes if the tests point to issues once they're
up and running.
Special thanks to Stan who took a stab at rebasing the TX implementation
on top of the netmem/net_iov framework merged. Parts of his proposal [2]
that are reused as-is are forked off into their own patches to give full
credit.
[1] https://lore.kernel.org/netdev/20240909054318.1809580-1-almasrymina@google.…
[2] https://lore.kernel.org/netdev/20240913150913.1280238-2-sdf@fomichev.me/T/#…
Cc: sdf(a)fomichev.me
Cc: asml.silence(a)gmail.com
Cc: dw(a)davidwei.uk
Cc: Jamal Hadi Salim <jhs(a)mojatatu.com>
Cc: Victor Nogueira <victor(a)mojatatu.com>
Cc: Pedro Tammela <pctammela(a)mojatatu.com>
Cc: Samiullah Khawaja <skhawaja(a)google.com>
Mina Almasry (8):
netmem: add niov->type attribute to distinguish different net_iov
types
net: add get_netmem/put_netmem support
net: devmem: Implement TX path
net: add devmem TCP TX documentation
net: enable driver support for netmem TX
gve: add netmem TX support to GVE DQO-RDA mode
net: check for driver support in netmem TX
selftests: ncdevmem: Implement devmem TCP TX
Stanislav Fomichev (1):
net: devmem: TCP tx netlink api
Documentation/netlink/specs/netdev.yaml | 12 +
Documentation/networking/devmem.rst | 150 ++++++++-
.../networking/net_cachelines/net_device.rst | 1 +
Documentation/networking/netdev-features.rst | 5 +
Documentation/networking/netmem.rst | 23 +-
drivers/net/ethernet/google/gve/gve_main.c | 4 +
drivers/net/ethernet/google/gve/gve_tx_dqo.c | 8 +-
include/linux/netdevice.h | 2 +
include/linux/skbuff.h | 17 +-
include/linux/skbuff_ref.h | 4 +-
include/net/netmem.h | 38 ++-
include/net/sock.h | 1 +
include/uapi/linux/netdev.h | 1 +
net/core/datagram.c | 48 ++-
net/core/dev.c | 33 ++
net/core/devmem.c | 118 ++++++-
net/core/devmem.h | 83 ++++-
net/core/netdev-genl-gen.c | 13 +
net/core/netdev-genl-gen.h | 1 +
net/core/netdev-genl.c | 73 ++++-
net/core/skbuff.c | 48 ++-
net/core/sock.c | 6 +
net/ipv4/ip_output.c | 3 +-
net/ipv4/tcp.c | 50 ++-
net/ipv6/ip6_output.c | 3 +-
net/vmw_vsock/virtio_transport_common.c | 5 +-
tools/include/uapi/linux/netdev.h | 1 +
.../selftests/drivers/net/hw/devmem.py | 26 +-
.../selftests/drivers/net/hw/ncdevmem.c | 300 +++++++++++++++++-
29 files changed, 1002 insertions(+), 75 deletions(-)
base-commit: 8ef890df4031121a94407c84659125cbccd3fdbe
--
2.49.0.rc0.332.g42c0ae87b1-goog
The test_rss_context_dump() test assumes the indirection table is always
supported, which is not true for all drivers, e.g., virtio_net when
VIRTIO_NET_F_RSS is disabled.
Skip the check if 'indir' is not present.
Reviewed-by: Nimrod Oren <noren(a)nvidia.com>
Signed-off-by: Gal Pressman <gal(a)nvidia.com>
---
tools/testing/selftests/drivers/net/hw/rss_ctx.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
index d6e69d7d5e43..ca60ae325c22 100755
--- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py
+++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py
@@ -392,7 +392,7 @@ def test_rss_context_dump(cfg):
# Sanity-check the results
for data in ctxs:
- ksft_ne(set(data['indir']), {0}, "indir table is all zero")
+ ksft_ne(set(data.get('indir', [1])), {0}, "indir table is all zero")
ksft_ne(set(data.get('hkey', [1])), {0}, "key is all zero")
# More specific checks
--
2.40.1
virtio-net have two usage of hashes: one is RSS and another is hash
reporting. Conventionally the hash calculation was done by the VMM.
However, computing the hash after the queue was chosen defeats the
purpose of RSS.
Another approach is to use eBPF steering program. This approach has
another downside: it cannot report the calculated hash due to the
restrictive nature of eBPF.
Introduce the code to compute hashes to the kernel in order to overcome
thse challenges.
An alternative solution is to extend the eBPF steering program so that it
will be able to report to the userspace, but it is based on context
rewrites, which is in feature freeze. We can adopt kfuncs, but they will
not be UAPIs. We opt to ioctl to align with other relevant UAPIs (KVM
and vhost_net).
The patches for QEMU to use this new feature was submitted as RFC and
is available at:
https://patchew.org/QEMU/20250313-hash-v4-0-c75c494b495e@daynix.com/
This work was presented at LPC 2024:
https://lpc.events/event/18/contributions/1963/
V1 -> V2:
Changed to introduce a new BPF program type.
Signed-off-by: Akihiko Odaki <akihiko.odaki(a)daynix.com>
---
Changes in v11:
- Added the missing code to free vnet_hash in patch
"tap: Introduce virtio-net hash feature".
- Link to v10: https://lore.kernel.org/r/20250313-rss-v10-0-3185d73a9af0@daynix.com
Changes in v10:
- Split common code and TUN/TAP-specific code into separate patches.
- Reverted a spurious style change in patch "tun: Introduce virtio-net
hash feature".
- Added a comment explaining disable_ipv6 in tests.
- Used AF_PACKET for patch "selftest: tun: Add tests for
virtio-net hashing". I also added the usage of FIXTURE_VARIANT() as
the testing function now needs access to more variant-specific
variables.
- Corrected the message of patch "selftest: tun: Add tests for
virtio-net hashing"; it mentioned validation of configuration but
it is not scope of this patch.
- Expanded the description of patch "selftest: tun: Add tests for
virtio-net hashing".
- Added patch "tun: Allow steering eBPF program to fall back".
- Changed to handle TUNGETVNETHASHCAP before taking the rtnl lock.
- Removed redundant tests for tun_vnet_ioctl().
- Added patch "selftest: tap: Add tests for virtio-net ioctls".
- Added a design explanation of ioctls for extensibility and migration.
- Removed a few branches in patch
"vhost/net: Support VIRTIO_NET_F_HASH_REPORT".
- Link to v9: https://lore.kernel.org/r/20250307-rss-v9-0-df76624025eb@daynix.com
Changes in v9:
- Added a missing return statement in patch
"tun: Introduce virtio-net hash feature".
- Link to v8: https://lore.kernel.org/r/20250306-rss-v8-0-7ab4f56ff423@daynix.com
Changes in v8:
- Disabled IPv6 to eliminate noises in tests.
- Added a branch in tap to avoid unnecessary dissection when hash
reporting is disabled.
- Removed unnecessary rtnl_lock().
- Extracted code to handle new ioctls into separate functions to avoid
adding extra NULL checks to the code handling other ioctls.
- Introduced variable named "fd" to __tun_chr_ioctl().
- s/-/=/g in a patch message to avoid confusing Git.
- Link to v7: https://lore.kernel.org/r/20250228-rss-v7-0-844205cbbdd6@daynix.com
Changes in v7:
- Ensured to set hash_report to VIRTIO_NET_HASH_REPORT_NONE for
VHOST_NET_F_VIRTIO_NET_HDR.
- s/4/sizeof(u32)/ in patch "virtio_net: Add functions for hashing".
- Added tap_skb_cb type.
- Rebased.
- Link to v6: https://lore.kernel.org/r/20250109-rss-v6-0-b1c90ad708f6@daynix.com
Changes in v6:
- Extracted changes to fill vnet header holes into another series.
- Squashed patches "skbuff: Introduce SKB_EXT_TUN_VNET_HASH", "tun:
Introduce virtio-net hash reporting feature", and "tun: Introduce
virtio-net RSS" into patch "tun: Introduce virtio-net hash feature".
- Dropped the RFC tag.
- Link to v5: https://lore.kernel.org/r/20241008-rss-v5-0-f3cf68df005d@daynix.com
Changes in v5:
- Fixed a compilation error with CONFIG_TUN_VNET_CROSS_LE.
- Optimized the calculation of the hash value according to:
https://git.dpdk.org/dpdk/commit/?id=3fb1ea032bd6ff8317af5dac9af901f1f324ca…
- Added patch "tun: Unify vnet implementation".
- Dropped patch "tap: Pad virtio header with zero".
- Added patch "selftest: tun: Test vnet ioctls without device".
- Reworked selftests to skip for older kernels.
- Documented the case when the underlying device is deleted and packets
have queue_mapping set by TC.
- Reordered test harness arguments.
- Added code to handle fragmented packets.
- Link to v4: https://lore.kernel.org/r/20240924-rss-v4-0-84e932ec0e6c@daynix.com
Changes in v4:
- Moved tun_vnet_hash_ext to if_tun.h.
- Renamed virtio_net_toeplitz() to virtio_net_toeplitz_calc().
- Replaced htons() with cpu_to_be16().
- Changed virtio_net_hash_rss() to return void.
- Reordered variable declarations in virtio_net_hash_rss().
- Removed virtio_net_hdr_v1_hash_from_skb().
- Updated messages of "tap: Pad virtio header with zero" and
"tun: Pad virtio header with zero".
- Fixed vnet_hash allocation size.
- Ensured to free vnet_hash when destructing tun_struct.
- Link to v3: https://lore.kernel.org/r/20240915-rss-v3-0-c630015db082@daynix.com
Changes in v3:
- Reverted back to add ioctl.
- Split patch "tun: Introduce virtio-net hashing feature" into
"tun: Introduce virtio-net hash reporting feature" and
"tun: Introduce virtio-net RSS".
- Changed to reuse hash values computed for automq instead of performing
RSS hashing when hash reporting is requested but RSS is not.
- Extracted relevant data from struct tun_struct to keep it minimal.
- Added kernel-doc.
- Changed to allow calling TUNGETVNETHASHCAP before TUNSETIFF.
- Initialized num_buffers with 1.
- Added a test case for unclassified packets.
- Fixed error handling in tests.
- Changed tests to verify that the queue index will not overflow.
- Rebased.
- Link to v2: https://lore.kernel.org/r/20231015141644.260646-1-akihiko.odaki@daynix.com
---
Akihiko Odaki (10):
virtio_net: Add functions for hashing
net: flow_dissector: Export flow_keys_dissector_symmetric
tun: Allow steering eBPF program to fall back
tun: Add common virtio-net hash feature code
tun: Introduce virtio-net hash feature
tap: Introduce virtio-net hash feature
selftest: tun: Test vnet ioctls without device
selftest: tun: Add tests for virtio-net hashing
selftest: tap: Add tests for virtio-net ioctls
vhost/net: Support VIRTIO_NET_F_HASH_REPORT
Documentation/networking/tuntap.rst | 7 +
drivers/net/Kconfig | 1 +
drivers/net/ipvlan/ipvtap.c | 2 +-
drivers/net/macvtap.c | 2 +-
drivers/net/tap.c | 78 +++++-
drivers/net/tun.c | 90 +++++--
drivers/net/tun_vnet.h | 155 ++++++++++-
drivers/vhost/net.c | 68 ++---
include/linux/if_tap.h | 4 +-
include/linux/skbuff.h | 3 +
include/linux/virtio_net.h | 188 ++++++++++++++
include/net/flow_dissector.h | 1 +
include/uapi/linux/if_tun.h | 82 ++++++
net/core/flow_dissector.c | 3 +-
net/core/skbuff.c | 4 +
tools/testing/selftests/net/Makefile | 2 +-
tools/testing/selftests/net/tap.c | 97 ++++++-
tools/testing/selftests/net/tun.c | 491 ++++++++++++++++++++++++++++++++++-
18 files changed, 1194 insertions(+), 84 deletions(-)
---
base-commit: dd83757f6e686a2188997cb58b5975f744bb7786
change-id: 20240403-rss-e737d89efa77
prerequisite-change-id: 20241230-tun-66e10a49b0c7:v6
prerequisite-patch-id: 871dc5f146fb6b0e3ec8612971a8e8190472c0fb
prerequisite-patch-id: 2797ed249d32590321f088373d4055ff3f430a0e
prerequisite-patch-id: ea3370c72d4904e2f0536ec76ba5d26784c0cede
prerequisite-patch-id: 837e4cf5d6b451424f9b1639455e83a260c4440d
prerequisite-patch-id: ea701076f57819e844f5a35efe5cbc5712d3080d
prerequisite-patch-id: 701646fb43ad04cc64dd2bf13c150ccbe6f828ce
prerequisite-patch-id: 53176dae0c003f5b6c114d43f936cf7140d31bb5
prerequisite-change-id: 20250116-buffers-96e14bf023fc:v2
prerequisite-patch-id: 25fd4f99d4236a05a5ef16ab79f3e85ee57e21cc
Best regards,
--
Akihiko Odaki <akihiko.odaki(a)daynix.com>
The SBI Firmware Feature extension allows the S-mode to request some
specific features (either hardware or software) to be enabled. This
series uses this extension to request misaligned access exception
delegation to S-mode in order to let the kernel handle it. It also adds
support for the KVM FWFT SBI extension based on the misaligned access
handling infrastructure.
FWFT SBI extension is part of the SBI V3.0 specifications [1]. It can be
tested using the qemu provided at [2] which contains the series from
[3]. kvm-unit-tests [4] can be used inside kvm to tests the correct
delegation of misaligned exceptions. Upstream OpenSBI can be used.
Note: Since SBI V3.0 is not yet ratified, FWFT extension API is split
between interface only and implementation, allowing to pick only the
interface which do not have hard dependencies on SBI.
The tests can be run using the included kselftest:
$ qemu-system-riscv64 \
-cpu rv64,trap-misaligned-access=true,v=true \
-M virt \
-m 1024M \
-bios fw_dynamic.bin \
-kernel Image
...
# ./misaligned
TAP version 13
1..23
# Starting 23 tests from 1 test cases.
# RUN global.gp_load_lh ...
# OK global.gp_load_lh
ok 1 global.gp_load_lh
# RUN global.gp_load_lhu ...
# OK global.gp_load_lhu
ok 2 global.gp_load_lhu
# RUN global.gp_load_lw ...
# OK global.gp_load_lw
ok 3 global.gp_load_lw
# RUN global.gp_load_lwu ...
# OK global.gp_load_lwu
ok 4 global.gp_load_lwu
# RUN global.gp_load_ld ...
# OK global.gp_load_ld
ok 5 global.gp_load_ld
# RUN global.gp_load_c_lw ...
# OK global.gp_load_c_lw
ok 6 global.gp_load_c_lw
# RUN global.gp_load_c_ld ...
# OK global.gp_load_c_ld
ok 7 global.gp_load_c_ld
# RUN global.gp_load_c_ldsp ...
# OK global.gp_load_c_ldsp
ok 8 global.gp_load_c_ldsp
# RUN global.gp_load_sh ...
# OK global.gp_load_sh
ok 9 global.gp_load_sh
# RUN global.gp_load_sw ...
# OK global.gp_load_sw
ok 10 global.gp_load_sw
# RUN global.gp_load_sd ...
# OK global.gp_load_sd
ok 11 global.gp_load_sd
# RUN global.gp_load_c_sw ...
# OK global.gp_load_c_sw
ok 12 global.gp_load_c_sw
# RUN global.gp_load_c_sd ...
# OK global.gp_load_c_sd
ok 13 global.gp_load_c_sd
# RUN global.gp_load_c_sdsp ...
# OK global.gp_load_c_sdsp
ok 14 global.gp_load_c_sdsp
# RUN global.fpu_load_flw ...
# OK global.fpu_load_flw
ok 15 global.fpu_load_flw
# RUN global.fpu_load_fld ...
# OK global.fpu_load_fld
ok 16 global.fpu_load_fld
# RUN global.fpu_load_c_fld ...
# OK global.fpu_load_c_fld
ok 17 global.fpu_load_c_fld
# RUN global.fpu_load_c_fldsp ...
# OK global.fpu_load_c_fldsp
ok 18 global.fpu_load_c_fldsp
# RUN global.fpu_store_fsw ...
# OK global.fpu_store_fsw
ok 19 global.fpu_store_fsw
# RUN global.fpu_store_fsd ...
# OK global.fpu_store_fsd
ok 20 global.fpu_store_fsd
# RUN global.fpu_store_c_fsd ...
# OK global.fpu_store_c_fsd
ok 21 global.fpu_store_c_fsd
# RUN global.fpu_store_c_fsdsp ...
# OK global.fpu_store_c_fsdsp
ok 22 global.fpu_store_c_fsdsp
# RUN global.gen_sigbus ...
[12797.988647] misaligned[618]: unhandled signal 7 code 0x1 at 0x0000000000014dc0 in misaligned[4dc0,10000+76000]
[12797.988990] CPU: 0 UID: 0 PID: 618 Comm: misaligned Not tainted 6.13.0-rc6-00008-g4ec4468967c9-dirty #51
[12797.989169] Hardware name: riscv-virtio,qemu (DT)
[12797.989264] epc : 0000000000014dc0 ra : 0000000000014d00 sp : 00007fffe165d100
[12797.989407] gp : 000000000008f6e8 tp : 0000000000095760 t0 : 0000000000000008
[12797.989544] t1 : 00000000000965d8 t2 : 000000000008e830 s0 : 00007fffe165d160
[12797.989692] s1 : 000000000000001a a0 : 0000000000000000 a1 : 0000000000000002
[12797.989831] a2 : 0000000000000000 a3 : 0000000000000000 a4 : ffffffffdeadbeef
[12797.989964] a5 : 000000000008ef61 a6 : 626769735f6e0000 a7 : fffffffffffff000
[12797.990094] s2 : 0000000000000001 s3 : 00007fffe165d838 s4 : 00007fffe165d848
[12797.990238] s5 : 000000000000001a s6 : 0000000000010442 s7 : 0000000000010200
[12797.990391] s8 : 000000000000003a s9 : 0000000000094508 s10: 0000000000000000
[12797.990526] s11: 0000555567460668 t3 : 00007fffe165d070 t4 : 00000000000965d0
[12797.990656] t5 : fefefefefefefeff t6 : 0000000000000073
[12797.990756] status: 0000000200004020 badaddr: 000000000008ef61 cause: 0000000000000006
[12797.990911] Code: 8793 8791 3423 fcf4 3783 fc84 c737 dead 0713 eef7 (c398) 0001
# OK global.gen_sigbus
ok 23 global.gen_sigbus
# PASSED: 23 / 23 tests passed.
# Totals: pass:23 fail:0 xfail:0 xpass:0 skip:0 error:0
With kvm-tools:
# lkvm run -k sbi.flat -m 128
Info: # lkvm run -k sbi.flat -m 128 -c 1 --name guest-97
Info: Removed ghost socket file "/root/.lkvm//guest-97.sock".
##########################################################################
# kvm-unit-tests
##########################################################################
... [test messages elided]
PASS: sbi: fwft: FWFT extension probing no error
PASS: sbi: fwft: get/set reserved feature 0x6 error == SBI_ERR_DENIED
PASS: sbi: fwft: get/set reserved feature 0x3fffffff error == SBI_ERR_DENIED
PASS: sbi: fwft: get/set reserved feature 0x80000000 error == SBI_ERR_DENIED
PASS: sbi: fwft: get/set reserved feature 0xbfffffff error == SBI_ERR_DENIED
PASS: sbi: fwft: misaligned_deleg: Get misaligned deleg feature no error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature invalid value error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature invalid value error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value no error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value 0
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value no error
PASS: sbi: fwft: misaligned_deleg: Set misaligned deleg feature value 1
PASS: sbi: fwft: misaligned_deleg: Verify misaligned load exception trap in supervisor
SUMMARY: 50 tests, 2 unexpected failures, 12 skipped
This series is available at [6].
Link: https://github.com/riscv-non-isa/riscv-sbi-doc/releases/download/vv3.0-rc2/… [1]
Link: https://github.com/rivosinc/qemu/tree/dev/cleger/misaligned [2]
Link: https://lore.kernel.org/all/20241211211933.198792-3-fkonrad@amd.com/T/ [3]
Link: https://github.com/clementleger/kvm-unit-tests/tree/dev/cleger/fwft_v1 [4]
Link: https://github.com/clementleger/unaligned_test [5]
Link: https://github.com/rivosinc/linux/tree/dev/cleger/fwft_v1 [6]
---
V4:
- Check SBI version 3.0 instead of 2.0 for FWFT presence
- Use long for kvm_sbi_fwft operation return value
- Init KVM sbi extension even if default_disabled
- Remove revert_on_fail parameter for sbi_fwft_feature_set().
- Fix comments for sbi_fwft_set/get()
- Only handle local features (there are no globals yet in the spec)
- Add new SBI errors to sbi_err_map_linux_errno()
V3:
- Added comment about kvm sbi fwft supported/set/get callback
requirements
- Move struct kvm_sbi_fwft_feature in kvm_sbi_fwft.c
- Add a FWFT interface
V2:
- Added Kselftest for misaligned testing
- Added get_user() usage instead of __get_user()
- Reenable interrupt when possible in misaligned access handling
- Document that riscv supports unaligned-traps
- Fix KVM extension state when an init function is present
- Rework SBI misaligned accesses trap delegation code
- Added support for CPU hotplugging
- Added KVM SBI reset callback
- Added reset for KVM SBI FWFT lock
- Return SBI_ERR_DENIED_LOCKED when LOCK flag is set
Clément Léger (18):
riscv: add Firmware Feature (FWFT) SBI extensions definitions
riscv: sbi: add new SBI error mappings
riscv: sbi: add FWFT extension interface
riscv: sbi: add SBI FWFT extension calls
riscv: misaligned: request misaligned exception from SBI
riscv: misaligned: use on_each_cpu() for scalar misaligned access
probing
riscv: misaligned: use correct CONFIG_ ifdef for
misaligned_access_speed
riscv: misaligned: move emulated access uniformity check in a function
riscv: misaligned: add a function to check misalign trap delegability
riscv: misaligned: factorize trap handling
riscv: misaligned: enable IRQs while handling misaligned accesses
riscv: misaligned: use get_user() instead of __get_user()
Documentation/sysctl: add riscv to unaligned-trap supported archs
selftests: riscv: add misaligned access testing
RISC-V: KVM: add SBI extension init()/deinit() functions
RISC-V: KVM: add SBI extension reset callback
RISC-V: KVM: add support for FWFT SBI extension
RISC-V: KVM: add support for SBI_FWFT_MISALIGNED_DELEG
Documentation/admin-guide/sysctl/kernel.rst | 4 +-
arch/riscv/include/asm/cpufeature.h | 8 +-
arch/riscv/include/asm/kvm_host.h | 5 +-
arch/riscv/include/asm/kvm_vcpu_sbi.h | 12 +
arch/riscv/include/asm/kvm_vcpu_sbi_fwft.h | 29 ++
arch/riscv/include/asm/sbi.h | 62 +++++
arch/riscv/include/uapi/asm/kvm.h | 1 +
arch/riscv/kernel/sbi.c | 95 +++++++
arch/riscv/kernel/traps.c | 57 ++--
arch/riscv/kernel/traps_misaligned.c | 118 +++++++-
arch/riscv/kernel/unaligned_access_speed.c | 11 +-
arch/riscv/kvm/Makefile | 1 +
arch/riscv/kvm/vcpu.c | 7 +-
arch/riscv/kvm/vcpu_sbi.c | 54 ++++
arch/riscv/kvm/vcpu_sbi_fwft.c | 252 +++++++++++++++++
arch/riscv/kvm/vcpu_sbi_sta.c | 3 +-
.../selftests/riscv/misaligned/.gitignore | 1 +
.../selftests/riscv/misaligned/Makefile | 12 +
.../selftests/riscv/misaligned/common.S | 33 +++
.../testing/selftests/riscv/misaligned/fpu.S | 180 +++++++++++++
tools/testing/selftests/riscv/misaligned/gp.S | 103 +++++++
.../selftests/riscv/misaligned/misaligned.c | 254 ++++++++++++++++++
22 files changed, 1255 insertions(+), 47 deletions(-)
create mode 100644 arch/riscv/include/asm/kvm_vcpu_sbi_fwft.h
create mode 100644 arch/riscv/kvm/vcpu_sbi_fwft.c
create mode 100644 tools/testing/selftests/riscv/misaligned/.gitignore
create mode 100644 tools/testing/selftests/riscv/misaligned/Makefile
create mode 100644 tools/testing/selftests/riscv/misaligned/common.S
create mode 100644 tools/testing/selftests/riscv/misaligned/fpu.S
create mode 100644 tools/testing/selftests/riscv/misaligned/gp.S
create mode 100644 tools/testing/selftests/riscv/misaligned/misaligned.c
--
2.47.2
Replacing all occurrences of `addr_of!(place)` with
`&raw const place`.
This will allow us to reduce macro complexity, and improve consistency
with existing reference syntax as `&raw const` is similar to `&` making
it fit more naturally with other existing code.
Suggested-by: Benno Lossin <benno.lossin(a)proton.me>
Link: https://github.com/Rust-for-Linux/linux/issues/1148
Signed-off-by: Antonio Hickey <contact(a)antoniohickey.com>
---
rust/kernel/kunit.rs | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs
index 824da0e9738a..a17ef3b2e860 100644
--- a/rust/kernel/kunit.rs
+++ b/rust/kernel/kunit.rs
@@ -128,9 +128,9 @@ unsafe impl Sync for UnaryAssert {}
unsafe {
$crate::bindings::__kunit_do_failed_assertion(
kunit_test,
- core::ptr::addr_of!(LOCATION.0),
+ &raw const LOCATION.0,
$crate::bindings::kunit_assert_type_KUNIT_ASSERTION,
- core::ptr::addr_of!(ASSERTION.0.assert),
+ &raw const ASSERTION.0.assert,
Some($crate::bindings::kunit_unary_assert_format),
core::ptr::null(),
);
This patch set convert iptables to nftables for wireguard testing, as
iptables is deparated and nftables is the default framework of most releases.
v3: drop iptables directly (Jason A. Donenfeld)
Also convert to using nft for qemu testing (Jason A. Donenfeld)
v2: use one nft table for testing (Phil Sutter)
Hangbin Liu (2):
selftests: wireguards: convert iptables to nft
selftests: wireguard: update to using nft for qemu test
tools/testing/selftests/wireguard/netns.sh | 29 +++++++++-----
.../testing/selftests/wireguard/qemu/Makefile | 40 ++++++++++++++-----
.../selftests/wireguard/qemu/kernel.config | 7 ++--
3 files changed, 53 insertions(+), 23 deletions(-)
--
2.46.0
Greetings:
Welcome to the RFC.
Currently, when a user app uses sendfile the user app has no way to know
if the bytes were transmit; sendfile simply returns, but it is possible
that a slow client on the other side may take time to receive and ACK
the bytes. In the meantime, the user app which called sendfile has no
way to know whether it can overwrite the data on disk that it just
sendfile'd.
One way to fix this is to add zerocopy notifications to sendfile similar
to how MSG_ZEROCOPY works with sendmsg. This is possible thanks to the
extensive work done by Pavel [1].
To support this, two important user ABI changes are proposed:
- A new splice flag, SPLICE_F_ZC, which allows users to signal that
splice should generate zerocopy notifications if possible.
- A new system call, sendfile2, which is similar to sendfile64 except
that it takes an additional argument, flags, which allows the user
to specify either a "regular" sendfile or a sendfile with zerocopy
notifications enabled.
In either case, user apps can read notifications from the error queue
(like they would with MSG_ZEROCOPY) to determine when their call to
sendfile has completed.
I tested this RFC using the selftest modified in the last patch and also
by using the selftest between two different physical hosts:
# server
./msg_zerocopy -4 -i eth0 -t 2 -v -r tcp
# client (does the sendfiling)
dd if=/dev/zero of=sendfile_data bs=1M count=8
./msg_zerocopy -4 -i eth0 -D $SERVER_IP -v -l 1 -t 2 -z -f sendfile_data tcp
I would love to get high level feedback from folks on a few things:
- Is this functionality, at a high level, something that would be
desirable / useful? I think so, but I'm of course I am biased ;)
- Is this approach generally headed in the right direction? Are the
proposed user ABI changes reasonable?
If the above two points are generally agreed upon then I'd welcome
feedback on the patches themselves :)
This is kind of a net thing, but also kind of a splice thing so hope I
am sending this to right places to get appropriate feedback. I based my
code on the vfs/for-next tree, but am happy to rebase on another tree if
desired. The cc-list got a little out of control, so I manually trimmed
it down quite a bit; sorry if I missed anyone I should have CC'd in the
process.
Thanks,
Joe
[1]: https://lore.kernel.org/netdev/cover.1657643355.git.asml.silence@gmail.com/
Joe Damato (10):
splice: Add ubuf_info to prepare for ZC
splice: Add helper that passes through splice_desc
splice: Factor splice_socket into a helper
splice: Add SPLICE_F_ZC and attach ubuf
fs: Add splice_write_sd to file operations
fs: Extend do_sendfile to take a flags argument
fs: Add sendfile2 which accepts a flags argument
fs: Add sendfile flags for sendfile2
fs: Add sendfile2 syscall
selftests: Add sendfile zerocopy notification test
arch/alpha/kernel/syscalls/syscall.tbl | 1 +
arch/arm/tools/syscall.tbl | 1 +
arch/arm64/tools/syscall_32.tbl | 1 +
arch/m68k/kernel/syscalls/syscall.tbl | 1 +
arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
arch/mips/kernel/syscalls/syscall_n64.tbl | 1 +
arch/mips/kernel/syscalls/syscall_o32.tbl | 1 +
arch/parisc/kernel/syscalls/syscall.tbl | 1 +
arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
arch/s390/kernel/syscalls/syscall.tbl | 1 +
arch/sh/kernel/syscalls/syscall.tbl | 1 +
arch/sparc/kernel/syscalls/syscall.tbl | 1 +
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
fs/read_write.c | 40 +++++++---
fs/splice.c | 87 +++++++++++++++++----
include/linux/fs.h | 2 +
include/linux/sendfile.h | 10 +++
include/linux/splice.h | 7 +-
include/linux/syscalls.h | 2 +
include/uapi/asm-generic/unistd.h | 4 +-
net/socket.c | 1 +
scripts/syscall.tbl | 1 +
tools/testing/selftests/net/msg_zerocopy.c | 54 ++++++++++++-
tools/testing/selftests/net/msg_zerocopy.sh | 5 ++
27 files changed, 200 insertions(+), 29 deletions(-)
create mode 100644 include/linux/sendfile.h
base-commit: 2e72b1e0aac24a12f3bf3eec620efaca7ab7d4de
--
2.43.0
This is one of just 3 remaining "Test Module" kselftests (the others
being printf and scanf), the rest having been converted to KUnit.
I tested this using:
$ tools/testing/kunit/kunit.py run --arch arm64 --make_options LLVM=1 bitmap.
I've already sent out a conversion series for each of printf[0] and scanf[1].
There was a previous attempt[2] to do this in July 2024. Please bear
with me as I try to understand and address the objections from that
time. I've spoken with Muhammad Usama Anjum, the author of that series,
and received their approval to "take over" this work. Here we go...
On 7/26/24 11:45 PM, John Hubbard wrote:
>
> This changes the situation from "works for Linus' tab completion
> case", to "causes a tab completion problem"! :)
>
> I think a tests/ subdir is how we eventually decided to do this [1],
> right?
>
> So:
>
> lib/tests/bitmap_kunit.c
>
> [1] https://lore.kernel.org/20240724201354.make.730-kees@kernel.org
This is true and unfortunate, but not trivial to fix because new
kallsyms tests were placed in lib/tests in commit 84b4a51fce4c
("selftests: add new kallsyms selftests") *after* the KUnit filename
best practices were adopted.
I propose that the KUnit maintainers blaze this trail using
`string_kunit.c` which currently still lives in lib/ despite the KUnit
docs giving it as an example at lib/tests/.
On 7/27/24 12:24 AM, Shuah Khan wrote:
>
> This change will take away the ability to run bitmap tests during
> boot on a non-kunit kernel.
>
> Nack on this change. I wan to see all tests that are being removed
> from lib because they have been converted - also it doesn't make
> sense to convert some tests like this one that add the ability test
> during boot.
This point was also discussed in another thread[3] in which:
On 7/27/24 12:35 AM, Shuah Khan wrote:
>
> Please make sure you aren't taking away the ability to run these tests during
> boot.
>
> It doesn't make sense to convert every single test especially when it
> is intended to be run during boot without dependencies - not as a kunit test
> but a regression test during boot.
>
> bitmap is one example - pay attention to the config help test - bitmap
> one clearly states it runs regression testing during boot. Any test that
> says that isn't a candidate for conversion.
>
> I am going to nack any such conversions.
The crux of the argument seems to be that the config help text is taken
to describe the author's intent with the fragment "at boot". I think
this may be a case of confirmation bias: I see at least the following
KUnit tests with "at boot" in their help text:
- CPUMASK_KUNIT_TEST
- BITFIELD_KUNIT
- CHECKSUM_KUNIT
- UTIL_MACROS_KUNIT
It seems to me that the inference being made is that any test that runs
"at boot" is intended to be run by both developers and users, but I find
no evidence that bitmap in particular would ever provide additional
value when run by users.
There's further discussion about KUnit not being "ideal for cases where
people would want to check a subsystem on a running kernel", but I find
no evidence that bitmap in particular is actually testing the running
kernel; it is a unit test of the bitmap functions, which is also stated
in the config help text.
David Gow made many of the same points in his final reply[4], which was
never replied to.
Link: https://lore.kernel.org/all/20250207-printf-kunit-convert-v2-0-057b23860823… [0]
Link: https://lore.kernel.org/all/20250207-scanf-kunit-convert-v4-0-a23e2afaede8@… [1]
Link: https://lore.kernel.org/all/20240726110658.2281070-1-usama.anjum@collabora.… [2]
Link: https://lore.kernel.org/all/327831fb-47ab-4555-8f0b-19a8dbcaacd7@collabora.… [3]
Link: https://lore.kernel.org/all/CABVgOSmMoPD3JfzVd4VTkzGL2fZCo8LfwzaVSzeFimPrhg… [4]
Thanks for your attention.
Signed-off-by: Tamir Duberstein <tamird(a)gmail.com>
---
Tamir Duberstein (3):
bitmap: remove _check_eq_u32_array
bitmap: convert self-test to KUnit
bitmap: break kunit into test cases
MAINTAINERS | 2 +-
arch/m68k/configs/amiga_defconfig | 1 -
arch/m68k/configs/apollo_defconfig | 1 -
arch/m68k/configs/atari_defconfig | 1 -
arch/m68k/configs/bvme6000_defconfig | 1 -
arch/m68k/configs/hp300_defconfig | 1 -
arch/m68k/configs/mac_defconfig | 1 -
arch/m68k/configs/multi_defconfig | 1 -
arch/m68k/configs/mvme147_defconfig | 1 -
arch/m68k/configs/mvme16x_defconfig | 1 -
arch/m68k/configs/q40_defconfig | 1 -
arch/m68k/configs/sun3_defconfig | 1 -
arch/m68k/configs/sun3x_defconfig | 1 -
arch/powerpc/configs/ppc64_defconfig | 1 -
lib/Kconfig.debug | 24 +-
lib/Makefile | 2 +-
lib/{test_bitmap.c => bitmap_kunit.c} | 454 +++++++++++++---------------------
tools/testing/selftests/lib/bitmap.sh | 3 -
tools/testing/selftests/lib/config | 1 -
19 files changed, 195 insertions(+), 304 deletions(-)
---
base-commit: 2014c95afecee3e76ca4a56956a936e23283f05b
change-id: 20250207-bitmap-kunit-convert-92d3147b2eee
Best regards,
--
Tamir Duberstein <tamird(a)gmail.com>
Replacing all occurrences of `addr_of!(place)` and `addr_of_mut!(place)`
with `&raw const place` and `&raw mut place` respectively.
This will allow us to reduce macro complexity, and improve consistency
with existing reference syntax as `&raw const`, `&raw mut` are similar
to `&`, `&mut` making it fit more naturally with other existing code.
Suggested-by: Benno Lossin <benno.lossin(a)proton.me>
Link: https://github.com/Rust-for-Linux/linux/issues/1148
Signed-off-by: Antonio Hickey <contact(a)antoniohickey.com>
---
rust/kernel/kunit.rs | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs
index 824da0e9738a..a17ef3b2e860 100644
--- a/rust/kernel/kunit.rs
+++ b/rust/kernel/kunit.rs
@@ -128,9 +128,9 @@ unsafe impl Sync for UnaryAssert {}
unsafe {
$crate::bindings::__kunit_do_failed_assertion(
kunit_test,
- core::ptr::addr_of!(LOCATION.0),
+ &raw const LOCATION.0,
$crate::bindings::kunit_assert_type_KUNIT_ASSERTION,
- core::ptr::addr_of!(ASSERTION.0.assert),
+ &raw const ASSERTION.0.assert,
Some($crate::bindings::kunit_unary_assert_format),
core::ptr::null(),
);
--
2.48.1
I am submitting a series of patches that introduce a new feature for the
netconsole subsystem, specifically the addition of the 'release' field
to the sysdata structure. This feature allows the kernel release/version
to be appended to the userdata dictionary in every message sent,
enhancing the information available for debugging and monitoring
purposes.
This complements the already supported release prepend feature, which
was added some time ago. The release prepend appends the release
information at the message header, which is not ideal for two reasons:
1) It is difficult to determine if a message includes this information,
making it hard and resource-intensive to parse.
2) When a message is fragmented, the release information is appended to
every message fragment, consuming valuable space in the packet.
The "release prepend" feature was created before the concept of userdata
and sysdata. Now that this format has proven successful, we are
implementing the release feature as part of this enhanced structure.
This patch series aims to improve the netconsole subsystem by providing
a more efficient and user-friendly way to include kernel release
information in messages. I believe these changes will significantly aid
in system analysis and troubleshooting.
Suggested-by: Manu Bretelle <chantr4(a)gmail.com>
Signed-off-by: Breno Leitao <leitao(a)debian.org>
---
Breno Leitao (6):
netconsole: introduce 'release' as a new sysdata field
netconsole: implement configfs for release_enabled
netconsole: add 'sysdata' suffix to related functions
netconsole: append release to sysdata
selftests: netconsole: Add tests for 'release' feature in sysdata
docs: netconsole: document release feature
Documentation/networking/netconsole.rst | 25 ++++++++
drivers/net/netconsole.c | 71 ++++++++++++++++++++--
.../selftests/drivers/net/netcons_sysdata.sh | 44 +++++++++++++-
3 files changed, 133 insertions(+), 7 deletions(-)
---
base-commit: 941defcea7e11ad7ff8f0d4856716dd637d757dd
change-id: 20250314-netcons_release-dc1f1f5ca0f7
Best regards,
--
Breno Leitao <leitao(a)debian.org>
Lei Chen raised an issue with CLOCK_MONOTONIC_COARSE seeing
time inconsistencies.
Lei tracked down that this was being caused by the adjustment
tk->tkr_mono.xtime_nsec -= offset;
which is made to compensate for the unaccumulated cycles in
offset when the mult value is adjusted forward, so that
the non-_COARSE clockids don't see inconsistencies.
However, the _COARSE clockids don't use the mult*offset value
in their calculations, so this subtraction can cause the
_COARSE clock ids to jump back a bit.
Now, by design, this negative adjustment should be fine, because
the logic run from timekeeping_adjust() is done after we
accumulate approx mult*interval_cycles into xtime_nsec.
The accumulated (mult*interval_cycles) will be larger then the
(mult_adj*offset) value subtracted from xtime_nsec, and both
operations are done together under the tk_core.lock, so the net
change to xtime_nsec should always be positive.
However, do_adjtimex() calls into timekeeping_advance() as well,
since we want to apply the ntp freq adjustment immediately.
In this case, we don't return early when the offset is smaller
then interval_cycles, so we don't end up accumulating any time
into xtime_nsec. But we do go on to call timekeeping_adjust(),
which modifies the mult value, and subtracts from xtime_nsec
to correct for the new mult value.
Here because we did not accumulate anything, we have a window
where the _COARSE clockids that don't utilize the mult*offset
value, can see an inconsistency.
So to fix this, rework the timekeeping_advance() logic a bit
so that when we are called from do_adjtimex() and the offset
is smaller then cycle_interval, that we call
timekeeping_forward(), to first accumulate the sub-interval
time into xtime_nsec. Then with no unaccumulated cycles in
offset, we can do the mult adjustment without worry of the
subtraction having an impact.
NOTE: This was implemented as a potential alternative to
Thomas' approach here:
https://lore.kernel.org/lkml/87cyej5rid.ffs@tglx/
And similarly, it needs some additional review and testing,
as it was developed while packing for conference travel.
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Stephen Boyd <sboyd(a)kernel.org>
Cc: Anna-Maria Behnsen <anna-maria(a)linutronix.de>
Cc: Frederic Weisbecker <frederic(a)kernel.org>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Miroslav Lichvar <mlichvar(a)redhat.com>
Cc: linux-kselftest(a)vger.kernel.org
Cc: kernel-team(a)android.com
Cc: Lei Chen <lei.chen(a)smartx.com>
Fixes: da15cfdae033 ("time: Introduce CLOCK_REALTIME_COARSE")
Reported-by: Lei Chen <lei.chen(a)smartx.com>
Closes: https://lore.kernel.org/lkml/20250310030004.3705801-1-lei.chen@smartx.com/
Diagnosed-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: John Stultz <jstultz(a)google.com>
---
kernel/time/timekeeping.c | 87 ++++++++++++++++++++++++++++-----------
1 file changed, 62 insertions(+), 25 deletions(-)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f1955..6f3a145e7b113 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -682,18 +682,18 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
}
/**
- * timekeeping_forward_now - update clock to the current time
+ * timekeeping_forward - update clock to given cycle now value
* @tk: Pointer to the timekeeper to update
+ * @cycle_now: Current clocksource read value
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void timekeeping_forward_now(struct timekeeper *tk)
+static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now)
{
- u64 cycle_now, delta;
+ u64 delta;
- cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
tk->tkr_mono.clock->max_raw_delta);
tk->tkr_mono.cycle_last = cycle_now;
@@ -710,6 +710,21 @@ static void timekeeping_forward_now(struct timekeeper *tk)
}
}
+/**
+ * timekeeping_forward_now - update clock to the current time
+ * @tk: Pointer to the timekeeper to update
+ *
+ * Forward the current clock to update its state since the last call to
+ * update_wall_time(). This is useful before significant clock changes,
+ * as it avoids having to deal with this time offset explicitly.
+ */
+static void timekeeping_forward_now(struct timekeeper *tk)
+{
+ u64 cycle_now = tk_clock_read(&tk->tkr_mono);
+
+ timekeeping_forward(tk, cycle_now);
+}
+
/**
* ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
@@ -2151,6 +2166,45 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
return offset;
}
+static u64 timekeeping_accumulate(struct timekeeper *tk, u64 now, u64 offset,
+ unsigned int *clock_set)
+{
+ struct timekeeper *real_tk = &tk_core.timekeeper;
+ int shift = 0, maxshift;
+
+ /*
+ * If we have a sub-cycle_interval offset, we
+ * are likely doing a TK_FREQ_ADJ, so accumulate
+ * everything so we don't have a remainder offset
+ * when later adjusting the multiplier
+ */
+ if (offset < real_tk->cycle_interval) {
+ timekeeping_forward(tk, now);
+ *clock_set = 1;
+ return 0;
+ }
+
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
+ */
+ shift = ilog2(offset) - ilog2(tk->cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1;
+ shift = min(shift, maxshift);
+ while (offset >= tk->cycle_interval) {
+ offset = logarithmic_accumulation(tk, offset, shift, clock_set);
+ if (offset < tk->cycle_interval << shift)
+ shift--;
+ }
+ return offset;
+}
+
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
@@ -2160,8 +2214,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
struct timekeeper *tk = &tk_core.shadow_timekeeper;
struct timekeeper *real_tk = &tk_core.timekeeper;
unsigned int clock_set = 0;
- int shift = 0, maxshift;
- u64 offset;
+ u64 cycle_now, offset;
guard(raw_spinlock_irqsave)(&tk_core.lock);
@@ -2169,7 +2222,8 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
if (unlikely(timekeeping_suspended))
return false;
- offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
+ cycle_now = tk_clock_read(&tk->tkr_mono);
+ offset = clocksource_delta(cycle_now,
tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
tk->tkr_mono.clock->max_raw_delta);
@@ -2177,24 +2231,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
return false;
- /*
- * With NO_HZ we may have to accumulate many cycle_intervals
- * (think "ticks") worth of time at once. To do this efficiently,
- * we calculate the largest doubling multiple of cycle_intervals
- * that is smaller than the offset. We then accumulate that
- * chunk in one go, and then try to consume the next smaller
- * doubled multiple.
- */
- shift = ilog2(offset) - ilog2(tk->cycle_interval);
- shift = max(0, shift);
- /* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
- shift = min(shift, maxshift);
- while (offset >= tk->cycle_interval) {
- offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
- if (offset < tk->cycle_interval<<shift)
- shift--;
- }
+ offset = timekeeping_accumulate(tk, cycle_now, offset, &clock_set);
/* Adjust the multiplier to correct NTP error */
timekeeping_adjust(tk, offset);
--
2.49.0.rc1.451.g8f38331e32-goog
Hi all,
This is v8 of the Rust/KUnit integration patch. I think all of the
suggestions have at least been responded to (even if there are a few I'm
leaving as either future projects or matters of taste). Hopefully this
is good-to-go for 6.15, so we can start using it concurrently with
making any additional improvements we may wish.
This series was originally written by José Expósito, and has been
modified and updated by Matt Gilbride, Miguel Ojeda, and myself. The
original version can be found here:
https://github.com/Rust-for-Linux/linux/pull/950
Add support for writing KUnit tests in Rust. While Rust doctests are
already converted to KUnit tests and run, they're really better suited
for examples, rather than as first-class unit tests.
This series implements a series of direct Rust bindings for KUnit tests,
as well as a new macro which allows KUnit tests to be written using a
close variant of normal Rust unit test syntax. The only change required
is replacing '#[cfg(test)]' with '#[kunit_tests(kunit_test_suite_name)]'
An example test would look like:
#[kunit_tests(rust_kernel_hid_driver)]
mod tests {
use super::*;
use crate::{c_str, driver, hid, prelude::*};
use core::ptr;
struct SimpleTestDriver;
impl Driver for SimpleTestDriver {
type Data = ();
}
#[test]
fn rust_test_hid_driver_adapter() {
let mut hid = bindings::hid_driver::default();
let name = c_str!("SimpleTestDriver");
static MODULE: ThisModule = unsafe { ThisModule::from_ptr(ptr::null_mut()) };
let res = unsafe {
<hid::Adapter<SimpleTestDriver> as driver::DriverOps>::register(&mut hid, name, &MODULE)
};
assert_eq!(res, Err(ENODEV)); // The mock returns -19
}
}
Please give this a go, and make sure I haven't broken it! There's almost
certainly a lot of improvements which can be made -- and there's a fair
case to be made for replacing some of this with generated C code which
can use the C macros -- but this is hopefully an adequate implementation
for now, and the interface can (with luck) remain the same even if the
implementation changes.
A few small notable missing features:
- Attributes (like the speed of a test) are hardcoded to the default
value.
- Similarly, the module name attribute is hardcoded to NULL. In C, we
use the KBUILD_MODNAME macro, but I couldn't find a way to use this
from Rust which wasn't more ugly than just disabling it.
- Assertions are not automatically rewritten to use KUnit assertions.
---
Changes since v7:
https://lore.kernel.org/rust-for-linux/20250214074051.1619256-1-davidgow@go…
- Reworked the SAFETY comment for addr_of_mut! use with statics in
kunit_unsafe_test_suite!() (again)
- Removed the second mocking example, which was causing confusion.
The first example of in_kunit_test() should be clear enough.
Changes since v6:
https://lore.kernel.org/rust-for-linux/20250214074051.1619256-1-davidgow@go…
- Fixed an [allow(unused_unsafe)] which ended up in patch 2 instead of
patch 1. (Thanks, Tamir!)
- Doc comments now have several useful links. (Thanks, Tamir!)
- Fix a potential compile error under macos. (Thanks, Tamir!)
- Several small tidy-ups to limit unsafe usage. (Thanks, Tamir!)
Changes since v5:
https://lore.kernel.org/all/20241213081035.2069066-1-davidgow@google.com/
- Rebased against 6.14-rc1
- Fixed a bunch of warnings / clippy lints introduced in Rust 1.83 and
1.84.
- No longer needs static_mut_refs / const_mut_refs, and is much cleaned
up as a result. (Thanks, Miguel)
- Major documentation and example fixes. (Thanks, Miguel)
Changes since v4:
https://lore.kernel.org/linux-kselftest/20241101064505.3820737-1-davidgow@g…
- Rebased against 6.13-rc1
- Allowed an unused_unsafe warning after the behaviour of addr_of_mut!()
changed in Rust 1.82. (Thanks Boqun, Miguel)
- "Expect" that the sample assert_eq!(1+1, 2) produces a clippy warning
due to a redundant assertion. (Thanks Boqun, Miguel)
- Fix some missing safety comments, and remove some unneeded 'unsafe'
blocks. (Thanks Boqun)
- Fix a couple of minor rustfmt issues which were triggering checkpatch
warnings.
Changes since v3:
https://lore.kernel.org/linux-kselftest/20241030045719.3085147-2-davidgow@g…
- The kunit_unsafe_test_suite!() macro now panic!s if the suite name is
too long, triggering a compile error. (Thanks, Alice!)
- The #[kunit_tests()] macro now preserves span information, so
errors can be better reported. (Thanks, Boqun!)
- The example tests have been updated to no longer use assert_eq!() with
a constant bool argument (which triggered a clippy warning now we
have the span info).
Changes since v2:
https://lore.kernel.org/linux-kselftest/20241029092422.2884505-1-davidgow@g…
- Include missing rust/macros/kunit.rs file from v2. (Thanks Boqun!)
- The kunit_unsafe_test_suite!() macro will truncate the name of the
suite if it is too long. (Thanks Alice!)
- The proc macro now emits an error if the suite name is too long.
- We no longer needlessly use UnsafeCell<> in
kunit_unsafe_test_suite!(). (Thanks Alice!)
Changes since v1:
https://lore.kernel.org/lkml/20230720-rustbind-v1-0-c80db349e3b5@google.com…
- Rebase on top of the latest rust-next (commit 718c4069896c)
- Make kunit_case a const fn, rather than a macro (Thanks Boqun)
- As a result, the null terminator is now created with
kernel::kunit::kunit_case_null()
- Use the C kunit_get_current_test() function to implement
in_kunit_test(), rather than re-implementing it (less efficiently)
ourselves.
Changes since the GitHub PR:
- Rebased on top of kselftest/kunit
- Add const_mut_refs feature
This may conflict with https://lore.kernel.org/lkml/20230503090708.2524310-6-nmi@metaspace.dk/
- Add rust/macros/kunit.rs to the KUnit MAINTAINERS entry
---
José Expósito (3):
rust: kunit: add KUnit case and suite macros
rust: macros: add macro to easily run KUnit tests
rust: kunit: allow to know if we are in a test
MAINTAINERS | 1 +
rust/kernel/kunit.rs | 171 +++++++++++++++++++++++++++++++++++++++++++
rust/macros/kunit.rs | 161 ++++++++++++++++++++++++++++++++++++++++
rust/macros/lib.rs | 29 ++++++++
4 files changed, 362 insertions(+)
create mode 100644 rust/macros/kunit.rs
--
2.49.0.rc0.332.g42c0ae87b1-goog
Signal delivery during connect() may disconnect an already established
socket. Problem is that such socket might have been placed in a sockmap
before the connection was closed.
PATCH 1 ensures this race won't lead to an unconnected vsock staying in the
sockmap. PATCH 2 selftests it.
PATCH 3 fixes a related race. Note that selftest in PATCH 2 does test this
code as well, but winning this race variant may take more than 2 seconds,
so I'm not advertising it.
Signed-off-by: Michal Luczaj <mhal(a)rbox.co>
---
Changes in v4:
- Selftest: send signal to only our own process
- Link to v3: https://lore.kernel.org/r/20250316-vsock-trans-signal-race-v3-0-17a6862277c…
Changes in v3:
- Selftest: drop unnecessary variable initialization and reorder the calls
- Link to v2: https://lore.kernel.org/r/20250314-vsock-trans-signal-race-v2-0-421a41f60f4…
Changes in v2:
- Handle one more path of tripping the warning
- Add a selftest
- Collect R-b [Stefano]
- Link to v1: https://lore.kernel.org/r/20250307-vsock-trans-signal-race-v1-1-3aca3f771fb…
---
Michal Luczaj (3):
vsock/bpf: Fix EINTR connect() racing sockmap update
selftest/bpf: Add test for AF_VSOCK connect() racing sockmap update
vsock/bpf: Fix bpf recvmsg() racing transport reassignment
net/vmw_vsock/af_vsock.c | 10 ++-
net/vmw_vsock/vsock_bpf.c | 24 ++++--
.../selftests/bpf/prog_tests/sockmap_basic.c | 99 ++++++++++++++++++++++
3 files changed, 124 insertions(+), 9 deletions(-)
---
base-commit: da9e8efe7ee10e8425dc356a9fc593502c8e3933
change-id: 20250305-vsock-trans-signal-race-d62f7718d099
Best regards,
--
Michal Luczaj <mhal(a)rbox.co>
From: Björn Töpel <bjorn(a)rivosinc.com>
There are scenarios where env.{sub,}test_state->stdout_saved, can be
NULL, e.g. sometimes when the watchdog timeout kicks in, or if the
open_memstream syscall is not available.
Avoid crashing test_progs by adding an explicit NULL check prior the
fclose() call.
Signed-off-by: Björn Töpel <bjorn(a)rivosinc.com>
---
tools/testing/selftests/bpf/test_progs.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index d4ec9586b98c..309d9d4a8ace 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -103,12 +103,14 @@ static void stdio_restore(void)
pthread_mutex_lock(&stdout_lock);
if (env.subtest_state) {
- fclose(env.subtest_state->stdout_saved);
+ if (env.subtest_state->stdout_saved)
+ fclose(env.subtest_state->stdout_saved);
env.subtest_state->stdout_saved = NULL;
stdout = env.test_state->stdout_saved;
stderr = env.test_state->stdout_saved;
} else {
- fclose(env.test_state->stdout_saved);
+ if (env.test_state->stdout_saved)
+ fclose(env.test_state->stdout_saved);
env.test_state->stdout_saved = NULL;
stdout = env.stdout_saved;
stderr = env.stderr_saved;
base-commit: f3f8649585a445414521a6d5b76f41b51205086d
--
2.45.2
Hi all, this is the v3 version.
===
Syzkaller reported this issue [1].
The current sockmap has a dependency on sk_socket in both read and write
stages, but there is a possibility that sk->sk_socket is released during
the process, leading to panic situations. For a detailed reproduction,
please refer to the description in the v2:
https://lore.kernel.org/bpf/20250228055106.58071-1-jiayuan.chen@linux.dev/
The corresponding fix approaches are described in the commit messages of
each patch.
By the way, the current sockmap lacks statistical information, especially
global statistics, such as the number of successful or failed rx and tx
operations. These statistics cannot be obtained from the socket interface
itself.
These data will be of great help in troubleshooting issues and observing
sockmap behavior.
If the maintainer/reviewer does not object, I think we can provide these
statistical information in the future, either through proc/trace/bpftool.
[1] https://syzkaller.appspot.com/bug?extid=dd90a702f518e0eac072
---
v2 -> v3:
1. Michal Luczaj reported similar race issue under sockmap sending path.
2. Rcu lock is conflict with mutex_lock in unix socket read implementation.
https://lore.kernel.org/bpf/20250228055106.58071-1-jiayuan.chen@linux.dev/
v1 -> v2:
1. Add Fixes tag.
2. Extend selftest of edge case for TCP/UDP sockets.
3. Add Reviewed-by and Acked-by tag.
https://lore.kernel.org/bpf/20250226132242.52663-1-jiayuan.chen@linux.dev/T…
Jiayuan Chen (3):
bpf, sockmap: avoid using sk_socket after free when sending
bpf, sockmap: avoid using sk_socket after free when reading
selftests/bpf: Add edge case tests for sockmap
net/core/skmsg.c | 22 ++++++-
.../selftests/bpf/prog_tests/socket_helpers.h | 13 +++-
.../selftests/bpf/prog_tests/sockmap_basic.c | 60 +++++++++++++++++++
3 files changed, 91 insertions(+), 4 deletions(-)
--
2.47.1
Fix test count with late test plan.
For example,
TAP version 13
ok 1 test1
1..4
Returns a count of 1 passed, 1 crashed (because it expects tests after
the test plan): returning the total count of 2 tests
Change this to be 1 passed, 1 error: total count of 1 test
Signed-off-by: Rae Moar <rmoar(a)google.com>
---
tools/testing/kunit/kunit_parser.py | 4 ++++
tools/testing/kunit/kunit_tool_test.py | 4 ++--
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index da53a709773a..c176487356e6 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -809,6 +809,10 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
test.log.extend(parse_diagnostic(lines))
if test.name != "" and not peek_test_name_match(lines, test):
test.add_error(printer, 'missing subtest result line!')
+ elif not lines:
+ print_log(test.log, printer)
+ test.status = TestStatus.NO_TESTS
+ test.add_error(printer, 'No more test results!')
else:
parse_test_result(lines, test, expected_num, printer)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 5ff4f6ffd873..bbba921e0eac 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -371,8 +371,8 @@ class KUnitParserTest(unittest.TestCase):
"""
result = kunit_parser.parse_run_tests(output.splitlines(), stdout)
# Missing test results after test plan should alert a suspected test crash.
- self.assertEqual(kunit_parser.TestStatus.TEST_CRASHED, result.status)
- self.assertEqual(result.counts, kunit_parser.TestCounts(passed=1, crashed=1, errors=1))
+ self.assertEqual(kunit_parser.TestStatus.SUCCESS, result.status)
+ self.assertEqual(result.counts, kunit_parser.TestCounts(passed=1, errors=2))
def line_stream_from_strs(strs: Iterable[str]) -> kunit_parser.LineStream:
return kunit_parser.LineStream(enumerate(strs, start=1))
base-commit: 2e0cf2b32f72b20b0db5cc665cd8465d0f257278
--
2.49.0.395.g12beb8f557-goog
A bug was identified where the KTAP below caused an infinite loop:
TAP version 13
ok 4 test_case
1..4
The infinite loop was caused by the parser not parsing a test plan
if following a test result line.
Fix this bug by parsing test plan line to avoid the infinite loop.
Signed-off-by: Rae Moar <rmoar(a)google.com>
---
Changes since v2:
- None, adds test in second patch
tools/testing/kunit/kunit_parser.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 29fc27e8949b..da53a709773a 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -759,7 +759,7 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
# If parsing the main/top-level test, parse KTAP version line and
# test plan
test.name = "main"
- ktap_line = parse_ktap_header(lines, test, printer)
+ parse_ktap_header(lines, test, printer)
test.log.extend(parse_diagnostic(lines))
parse_test_plan(lines, test)
parent_test = True
@@ -768,13 +768,12 @@ def parse_test(lines: LineStream, expected_num: int, log: List[str], is_subtest:
# the KTAP version line and/or subtest header line
ktap_line = parse_ktap_header(lines, test, printer)
subtest_line = parse_test_header(lines, test)
+ test.log.extend(parse_diagnostic(lines))
+ parse_test_plan(lines, test)
parent_test = (ktap_line or subtest_line)
if parent_test:
- # If KTAP version line and/or subtest header is found, attempt
- # to parse test plan and print test header
- test.log.extend(parse_diagnostic(lines))
- parse_test_plan(lines, test)
print_test_header(test, printer)
+
expected_count = test.expected_count
subtests = []
test_num = 1
base-commit: 0619a4868fc1b32b07fb9ed6c69adc5e5cf4e4b2
--
2.49.0.rc1.451.g8f38331e32-goog
Here are a few cleanups, preparation work for the new PM ops, and sysctl
knobs.
- Patch 1: reorg: move generic NL code used by all PMs to pm_netlink.c.
- Patch 2: use kmemdup() instead of kmalloc + copy.
- Patch 3: small cleanup to use pm var instead of msk->pm.
- Patch 4: reorg: id_avail_bitmap is only used by the in-kernel PM.
- Patch 5: use struct_group to easily reset a subset of PM data vars.
- Patch 6: introduce the minimal skeleton for the new PM ops.
- Patch 7: register in-kernel and userspace PM ops.
- Patch 8: new net.mptcp.path_manager sysctl knob, deprecating pm_type.
- Patch 9: map the new path_manager sysctl knob with pm_type.
- Patch 10: map the old pm_type sysctl knob with path_manager.
- Patch 11: new net.mptcp.available_path_managers sysctl knob.
- Patch 12: new test to validate path_manager and pm_type mapping.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Geliang Tang (11):
mptcp: pm: in-kernel: use kmemdup helper
mptcp: pm: use pm variable instead of msk->pm
mptcp: pm: only fill id_avail_bitmap for in-kernel pm
mptcp: pm: add struct_group in mptcp_pm_data
mptcp: pm: define struct mptcp_pm_ops
mptcp: pm: register in-kernel and userspace PM
mptcp: sysctl: set path manager by name
mptcp: sysctl: map path_manager to pm_type
mptcp: sysctl: map pm_type to path_manager
mptcp: sysctl: add available_path_managers
selftests: mptcp: add pm sysctl mapping tests
Matthieu Baerts (NGI0) (1):
mptcp: pm: split netlink and in-kernel init
Documentation/networking/mptcp-sysctl.rst | 23 +++++
include/net/mptcp.h | 14 +++
net/mptcp/ctrl.c | 113 +++++++++++++++++++++-
net/mptcp/pm.c | 97 ++++++++++++++++---
net/mptcp/pm_kernel.c | 16 +--
net/mptcp/pm_netlink.c | 6 ++
net/mptcp/pm_userspace.c | 10 ++
net/mptcp/protocol.h | 17 ++++
tools/testing/selftests/net/mptcp/userspace_pm.sh | 30 +++++-
9 files changed, 301 insertions(+), 25 deletions(-)
---
base-commit: e016cf5f39e9c53e274a7b7122a949d8839b8782
change-id: 20250312-net-next-mptcp-pm-ops-intro-01510135cd5e
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
For platforms on powerpc architecture with a default page size greater
than 4096, there was an inconsistency in fragment size calculation.
This caused the BPF selftest xdp_adjust_tail/xdp_adjust_frags_tail_grow
to fail on powerpc.
The issue occurred because the fragment buffer size in
bpf_prog_test_run_xdp() was set to 4096, while the actual data size in
the fragment within the shared skb was checked against PAGE_SIZE
(65536 on powerpc) in min_t, causing it to exceed 4096 and be set
accordingly. This discrepancy led to an overflow when
bpf_xdp_frags_increase_tail() checked for tailroom, as skb_frag_size(frag)
could be greater than rxq->frag_size (when PAGE_SIZE > 4096).
This change fixes:
1. test_run by getting the correct arch dependent PAGE_SIZE.
2. selftest by caculating tailroom and getting correct PAGE_SIZE.
Changes:
v1 -> v2:
* Address comments from Alexander
* Use dynamic page size, cacheline size and size of
struct skb_shared_info to calculate parameters.
* Fixed both test_run and selftest.
v1: https://lore.kernel.org/all/20250122183720.1411176-1-skb99@linux.ibm.com/
Saket Kumar Bhaskar (2):
bpf, test_run: Replace hardcoded page size with dynamic PAGE_SIZE in
test_run
selftests/bpf: Refactor xdp_adjust_tail selftest with dynamic sizing
.../bpf/prog_tests/xdp_adjust_tail.c | 160 +++++++++++++-----
.../bpf/progs/test_xdp_adjust_tail_grow.c | 41 +++--
2 files changed, 149 insertions(+), 52 deletions(-)
--
2.43.5
Currently there is no means of determining whether a give page in a mapping
range is designated a guard region (as installed via madvise() using the
MADV_GUARD_INSTALL flag).
This is generally not an issue, but in some instances users may wish to
determine whether this is the case.
This series adds this ability via /proc/$pid/pagemap, updates the
documentation and adds a self test to assert that this functions correctly.
Lorenzo Stoakes (2):
fs/proc/task_mmu: add guard region bit to pagemap
tools/selftests: add guard region test for /proc/$pid/pagemap
Documentation/admin-guide/mm/pagemap.rst | 3 +-
fs/proc/task_mmu.c | 6 ++-
tools/testing/selftests/mm/guard-regions.c | 47 ++++++++++++++++++++++
tools/testing/selftests/mm/vm_util.h | 1 +
4 files changed, 55 insertions(+), 2 deletions(-)
--
2.48.1
Hi all,
This patch series continues the work to migrate the script tests into
prog_tests.
test_xdp_vlan.sh tests the ability of an XDP program to modify the VLAN
ids on the fly. This isn't currently covered by an other test in the
test_progs framework so I add a new file prog_tests/xdp_vlan.c that does
the exact same tests (same network topology, same BPF programs) and
remove the script.
Signed-off-by: Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
---
Bastien Curutchet (eBPF Foundation) (2):
selftests/bpf: test_xdp_vlan: Rename BPF sections
selftests/bpf: Migrate test_xdp_vlan.sh into test_progs
tools/testing/selftests/bpf/Makefile | 4 +-
tools/testing/selftests/bpf/prog_tests/xdp_vlan.c | 175 ++++++++++++++++
tools/testing/selftests/bpf/progs/test_xdp_vlan.c | 20 +-
tools/testing/selftests/bpf/test_xdp_vlan.sh | 233 ---------------------
.../selftests/bpf/test_xdp_vlan_mode_generic.sh | 9 -
.../selftests/bpf/test_xdp_vlan_mode_native.sh | 9 -
6 files changed, 186 insertions(+), 264 deletions(-)
---
base-commit: a814b9be27fb3c3f49343aee4b015b76f5875558
change-id: 20250130-xdp_vlan-e825cc4df14a
Best regards,
--
Bastien Curutchet (eBPF Foundation) <bastien.curutchet(a)bootlin.com>
When the rseq UAPI header is included 'union rseq' clashes with 'struct
rseq', it's not the case in the rseq selftests but it does break the KVM
selftests that also include this file.
Rename 'union rseq' to 'union rseq_tls' to fix this.
Fixes: e6644c967d3c ("rseq/selftests: Ensure the rseq ABI TLS is actually 1024 bytes")
Reported-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Michael Jeanson <mjeanson(a)efficios.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
---
tools/testing/selftests/rseq/rseq.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 6d8997d815ef..663a9cef1952 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -75,13 +75,13 @@ static int rseq_ownership;
* Use a union to ensure we allocate a TLS area of 1024 bytes to accomodate an
* rseq registration that is larger than the current rseq ABI.
*/
-union rseq {
+union rseq_tls {
struct rseq_abi abi;
char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE];
};
static
-__thread union rseq __rseq __attribute__((tls_model("initial-exec"))) = {
+__thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = {
.abi = {
.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
},
--
2.43.0
Conntrack bridge only tracks untagged and 802.1q.
To make the bridge-fastpath experience more similar to the
forward-fastpath experience, add double vlan, pppoe and pppoe-in-q
tagged packets to bridge conntrack and to bridge filter chain.
Split from patch-set: bridge-fastpath and related improvements v9
Eric Woudstra (3):
netfilter: bridge: Add conntrack double vlan and pppoe
netfilter: nft_chain_filter: Add bridge double vlan and pppoe
selftests: netfilter: Add conntrack_bridge.sh
net/bridge/netfilter/nf_conntrack_bridge.c | 83 +++++++--
net/netfilter/nft_chain_filter.c | 20 +-
.../testing/selftests/net/netfilter/Makefile | 1 +
.../net/netfilter/conntrack_bridge.sh | 176 ++++++++++++++++++
4 files changed, 267 insertions(+), 13 deletions(-)
create mode 100755 tools/testing/selftests/net/netfilter/conntrack_bridge.sh
--
2.47.1
Make sure the test cleans up after itself. The XDP off statements
at the end of the test may not be reached.
Fixes: 75cc19c8ff89 ("selftests: drv-net: add xdp cases for ping.py")
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
CC: shuah(a)kernel.org
CC: ap420073(a)gmail.com
CC: linux-kselftest(a)vger.kernel.org
---
tools/testing/selftests/drivers/net/ping.py | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py
index 93f4b411b378..fc69bfcc37c4 100755
--- a/tools/testing/selftests/drivers/net/ping.py
+++ b/tools/testing/selftests/drivers/net/ping.py
@@ -7,7 +7,7 @@ from lib.py import ksft_run, ksft_exit
from lib.py import ksft_eq, KsftSkipEx, KsftFailEx
from lib.py import EthtoolFamily, NetDrvEpEnv
from lib.py import bkg, cmd, wait_port_listen, rand_port
-from lib.py import ethtool, ip
+from lib.py import defer, ethtool, ip
remote_ifname=""
no_sleep=False
@@ -60,6 +60,7 @@ no_sleep=False
prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o"
cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote)
cmd(f"ip link set dev {cfg.ifname} mtu 1500 xdpgeneric obj {prog} sec xdp", shell=True)
+ defer(cmd, f"ip link set dev {cfg.ifname} xdpgeneric off")
if no_sleep != True:
time.sleep(10)
@@ -68,7 +69,9 @@ no_sleep=False
test_dir = os.path.dirname(os.path.realpath(__file__))
prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o"
cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote)
+ defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote)
ip("link set dev %s mtu 9000 xdpgeneric obj %s sec xdp.frags" % (cfg.ifname, prog))
+ defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdpgeneric off")
if no_sleep != True:
time.sleep(10)
@@ -78,6 +81,7 @@ no_sleep=False
prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o"
cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote)
cmd(f"ip -j link set dev {cfg.ifname} mtu 1500 xdp obj {prog} sec xdp", shell=True)
+ defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off")
xdp_info = ip("-d link show %s" % (cfg.ifname), json=True)[0]
if xdp_info['xdp']['mode'] != 1:
"""
@@ -94,10 +98,11 @@ no_sleep=False
test_dir = os.path.dirname(os.path.realpath(__file__))
prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o"
cmd(f"ip link set dev {remote_ifname} mtu 9000", shell=True, host=cfg.remote)
+ defer(ip, f"link set dev {remote_ifname} mtu 1500", host=cfg.remote)
try:
cmd(f"ip link set dev {cfg.ifname} mtu 9000 xdp obj {prog} sec xdp.frags", shell=True)
+ defer(ip, f"link set dev {cfg.ifname} mtu 1500 xdp off")
except Exception as e:
- cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote)
raise KsftSkipEx('device does not support native-multi-buffer XDP')
if no_sleep != True:
@@ -111,6 +116,7 @@ no_sleep=False
cmd(f"ip link set dev {cfg.ifname} xdpoffload obj {prog} sec xdp", shell=True)
except Exception as e:
raise KsftSkipEx('device does not support offloaded XDP')
+ defer(ip, f"link set dev {cfg.ifname} xdpoffload off")
cmd(f"ip link set dev {remote_ifname} mtu 1500", shell=True, host=cfg.remote)
if no_sleep != True:
@@ -157,7 +163,6 @@ no_sleep=False
_test_v4(cfg)
_test_v6(cfg)
_test_tcp(cfg)
- ip("link set dev %s xdpgeneric off" % cfg.ifname)
def test_xdp_generic_mb(cfg, netnl) -> None:
_set_xdp_generic_mb_on(cfg)
@@ -169,7 +174,6 @@ no_sleep=False
_test_v4(cfg)
_test_v6(cfg)
_test_tcp(cfg)
- ip("link set dev %s xdpgeneric off" % cfg.ifname)
def test_xdp_native_sb(cfg, netnl) -> None:
_set_xdp_native_sb_on(cfg)
@@ -181,7 +185,6 @@ no_sleep=False
_test_v4(cfg)
_test_v6(cfg)
_test_tcp(cfg)
- ip("link set dev %s xdp off" % cfg.ifname)
def test_xdp_native_mb(cfg, netnl) -> None:
_set_xdp_native_mb_on(cfg)
@@ -193,14 +196,12 @@ no_sleep=False
_test_v4(cfg)
_test_v6(cfg)
_test_tcp(cfg)
- ip("link set dev %s xdp off" % cfg.ifname)
def test_xdp_offload(cfg, netnl) -> None:
_set_xdp_offload_on(cfg)
_test_v4(cfg)
_test_v6(cfg)
_test_tcp(cfg)
- ip("link set dev %s xdpoffload off" % cfg.ifname)
def main() -> None:
with NetDrvEpEnv(__file__) as cfg:
@@ -213,7 +214,6 @@ no_sleep=False
test_xdp_native_mb,
test_xdp_offload],
args=(cfg, EthtoolFamily()))
- set_interface_init(cfg)
ksft_exit()
--
2.48.1