- Linux-kselftest-mirror - lists.linaro.org

[PATCH v2] selftests/mm: Use generic read_sysfs in thuge-gen test

by Pu Lehui

From: Pu Lehui <pulehui(a)huawei.com> As generic read_sysfs is available in vm_utils, let's use it in thuge-gen test. Signed-off-by: Pu Lehui <pulehui(a)huawei.com> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com> --- v2: - Explicit warning when ps != getpagesize(). (Lorenzo) tools/testing/selftests/mm/thuge-gen.c | 38 +++++++------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 95b6f043a3cb..8e2b08dc5762 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -77,40 +77,20 @@ void show(unsigned long ps) system(buf); } -unsigned long thuge_read_sysfs(int warn, char *fmt, ...) +unsigned long read_free(unsigned long ps) { - char *line = NULL; - size_t linelen = 0; - char buf[100]; - FILE *f; - va_list ap; unsigned long val = 0; + char buf[100]; - va_start(ap, fmt); - vsnprintf(buf, sizeof buf, fmt, ap); - va_end(ap); + snprintf(buf, sizeof(buf), + "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); + if (read_sysfs(buf, &val) && ps != getpagesize()) + ksft_print_msg("missing %s\n", buf); - f = fopen(buf, "r"); - if (!f) { - if (warn) - ksft_print_msg("missing %s\n", buf); - return 0; - } - if (getline(&line, &linelen, f) > 0) { - sscanf(line, "%lu", &val); - } - fclose(f); - free(line); return val; } -unsigned long read_free(unsigned long ps) -{ - return thuge_read_sysfs(ps != getpagesize(), - "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); -} - void test_mmap(unsigned long size, unsigned flags) { char *map; @@ -173,6 +153,7 @@ void test_shmget(unsigned long size, unsigned flags) void find_pagesizes(void) { unsigned long largest = getpagesize(); + unsigned long shmmax_val = 0; int i; glob_t g; @@ -195,7 +176,8 @@ void find_pagesizes(void) } globfree(&g); - if (thuge_read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) + 
read_sysfs("/proc/sys/kernel/shmmax", &shmmax_val); + if (shmmax_val < NUM_PAGES * largest) ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); -- 2.34.1

7 months

1
0
0 0

[PATCH] selftests/mm: Use generic read_sysfs in thuge-gen test

by Pu Lehui

From: Pu Lehui <pulehui(a)huawei.com> As generic read_sysfs is available in vm_utils, let's use it in thuge-gen test. Signed-off-by: Pu Lehui <pulehui(a)huawei.com> --- tools/testing/selftests/mm/thuge-gen.c | 37 +++++++------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 95b6f043a3cb..e11dfbfa661b 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -77,40 +77,19 @@ void show(unsigned long ps) system(buf); } -unsigned long thuge_read_sysfs(int warn, char *fmt, ...) +unsigned long read_free(unsigned long ps) { - char *line = NULL; - size_t linelen = 0; - char buf[100]; - FILE *f; - va_list ap; unsigned long val = 0; + char buf[100]; - va_start(ap, fmt); - vsnprintf(buf, sizeof buf, fmt, ap); - va_end(ap); + snprintf(buf, sizeof(buf), + "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); + read_sysfs(buf, &val); - f = fopen(buf, "r"); - if (!f) { - if (warn) - ksft_print_msg("missing %s\n", buf); - return 0; - } - if (getline(&line, &linelen, f) > 0) { - sscanf(line, "%lu", &val); - } - fclose(f); - free(line); return val; } -unsigned long read_free(unsigned long ps) -{ - return thuge_read_sysfs(ps != getpagesize(), - "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); -} - void test_mmap(unsigned long size, unsigned flags) { char *map; @@ -173,6 +152,7 @@ void test_shmget(unsigned long size, unsigned flags) void find_pagesizes(void) { unsigned long largest = getpagesize(); + unsigned long shmmax_val = 0; int i; glob_t g; @@ -195,7 +175,8 @@ void find_pagesizes(void) } globfree(&g); - if (thuge_read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) + read_sysfs("/proc/sys/kernel/shmmax", &shmmax_val); + if (shmmax_val < NUM_PAGES * largest) ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES); -- 2.34.1

7 months

2
4
0 0

[PATCH] Documentation: kunit: fix argument of MODULE_IMPORT_NS()

by Thomas Weißschuh

The argument to MODULE_IMPORT_NS() should be a string literal. See commit cdd30ebb1b9f ("module: Convert symbol namespace to string literal") Fixes: d208025db6d6 ("Documentation: kunit: improve example on testing static functions") Signed-off-by: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de> --- Documentation/dev-tools/kunit/usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index 038f480074fd7aaa1f8e1b344bc74bf3426cc173..066ecda1dd98e73a01d50545e79c38a99a3e05a2 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -699,7 +699,7 @@ the template below. #include <kunit/visibility.h> #include <my_file.h> ... - MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); + MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); ... // Use do_interesting_thing() in tests --- base-commit: f09079bd04a924c72d555cd97942d5f8d7eca98c change-id: 20250611-kunit-example-d195e41c0b34 Best regards, -- Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>

7 months

1
0
0 0

[PATCH net-next 0/7] netpoll: Untangle netconsole and netpoll

by Breno Leitao

Initially netpoll and netconsole were created together, and some functions are in the wrong file. Separate netconsole-only functions in netconsole, avoiding exports. 1. Expose netpoll logging macros in the public header to enable consistent log formatting across netpoll consumers. 2. Relocate netconsole-specific functions from netpoll to the netconsole module where they are actually used, reducing unnecessary coupling. 3. Remove unnecessary function exports 4. Rename netpoll parsing functions in netconsole to better reflect their specific usage. 5. Create a test to check that cmdline works fine. This was in my todo list since [1], this was a good time to add it here to make sure this patchset doesn't regress. PS: The code was split in a way that it is easy to review. When copying the functions from netpoll to netconsole, I do not change them other than adding `static`. This will make checkpatch unhappy, but, further patches will address the issues. It is done this way to make it easy for reviewers. 
Link: https://lore.kernel.org/netdev/Z36TlACdNMwFD7wv@dev-ushankar.dev.purestorag… [1] Signed-off-by: Breno Leitao <leitao(a)debian.org> --- Breno Leitao (7): netpoll: remove __netpoll_cleanup from exported API netpoll: expose netpoll logging macros in public header netpoll: relocate netconsole-specific functions to netconsole module netpoll: move netpoll_print_options to netconsole netconsole: rename functions to better reflect their purpose netconsole: improve code style in parser function selftest: netconsole: add test for cmdline configuration drivers/net/netconsole.c | 137 ++++++++++++++++++++- include/linux/netpoll.h | 10 +- net/core/netpoll.c | 136 +------------------- tools/testing/selftests/drivers/net/Makefile | 1 + .../selftests/drivers/net/lib/sh/lib_netcons.sh | 39 +++++- .../selftests/drivers/net/netcons_cmdline.sh | 52 ++++++++ 6 files changed, 228 insertions(+), 147 deletions(-) --- base-commit: 2c7e4a2663a1ab5a740c59c31991579b6b865a26 change-id: 20250603-rework-c175cad8d22e Best regards, -- Breno Leitao <leitao(a)debian.org>

7 months

1
8
0 0

[PATCH net-next] selftests/net: packetdrill: more xfail changes

by Jakub Kicinski

Most of the packetdrill tests have not flaked once last week. Add the few which did to the XFAIL list. Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: shuah(a)kernel.org CC: willemb(a)google.com CC: matttbe(a)kernel.org CC: linux-kselftest(a)vger.kernel.org Every time I sit down to add more I plan to just XFAIL all of packetdrill on slow machines, but then I convince myself otherwise. One last time? --- tools/testing/selftests/net/packetdrill/ksft_runner.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/net/packetdrill/ksft_runner.sh b/tools/testing/selftests/net/packetdrill/ksft_runner.sh index ef8b25a606d8..c5b01e1bd4c7 100755 --- a/tools/testing/selftests/net/packetdrill/ksft_runner.sh +++ b/tools/testing/selftests/net/packetdrill/ksft_runner.sh @@ -39,11 +39,15 @@ if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then # xfail tests that are known flaky with dbg config, not fixable. # still run them for coverage (and expect 100% pass without dbg). declare -ar xfail_list=( + "tcp_blocking_blocking-connect.pkt" + "tcp_blocking_blocking-read.pkt" "tcp_eor_no-coalesce-retrans.pkt" "tcp_fast_recovery_prr-ss.*.pkt" + "tcp_sack_sack-route-refresh-ip-tos.pkt" "tcp_slow_start_slow-start-after-win-update.pkt" "tcp_timestamping.*.pkt" "tcp_user_timeout_user-timeout-probe.pkt" + "tcp_zerocopy_cl.*.pkt" "tcp_zerocopy_epoll_.*.pkt" "tcp_tcp_info_tcp-info-.*-limited.pkt" ) -- 2.49.0

7 months

4
3
0 0

[PATCH] selftests: Add version file to kselftest installation dir

by Tianyi Cui

As titled, adding version file to kselftest installation dir, so the user of the tarball can know which kernel version the tarball belongs to. Signed-off-by: Tianyi Cui <1997cui(a)gmail.com> --- tools/testing/selftests/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index a0a6ba47d600..246e9863b45b 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -291,6 +291,12 @@ ifdef INSTALL_PATH $(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET COLLECTION=$$TARGET \ -C $$TARGET emit_tests >> $(TEST_LIST); \ done; + @if git describe HEAD > /dev/null 2>&1; then \ + git describe HEAD > $(INSTALL_PATH)/VERSION; \ + printf "Version saved to $(INSTALL_PATH)/VERSION\n"; \ + else \ + printf "Unable to get version from git describe\n"; \ + fi else $(error Error: set INSTALL_PATH to use install) endif -- 2.47.1

7 months

2
5
0 0

[PATCH net-next v3 0/4] netconsole: Optimize console registration and improve testing

by Breno Leitao

During performance analysis of console subsystem latency, I discovered that netconsole registers console handlers even when no active targets exist. These orphaned console handlers are invoked on every printk() call, get the lock, iterate through empty target lists, and consume CPU cycles without performing any useful work. This patch series addresses the inefficiency by: 1. Implementing dynamic console registration/unregistration based on target availability, ensuring console handlers are only active when needed 2. Adding automatic cleanup of unused console registrations when targets are disabled or removed 3. Extending the selftest suite to cover non-extended console format, which was previously untested The optimization reduces printk() overhead by eliminating unnecessary function calls and list traversals when netconsole targets are not configured, improving overall system performance during heavy logging scenarios. --- Changes in v3: - Set CON_ENABLED before re-enabling the console, to fix a selftest that was failing, as reported by Jakub on v2. 
- Link to v2: https://lore.kernel.org/r/20250602-netcons_ext-v2-0-ef88d999326d@debian.org Changes in v2: - Added selftests to test the new mechanism - Unregister the console if the last target got disabled - Sending to net-next instead of net (Jakub) - Link to v1: https://lore.kernel.org/r/20250528-netcons_ext-v1-1-69f71e404e00@debian.org --- Breno Leitao (4): netconsole: Only register console drivers when targets are configured netconsole: Add automatic console unregistration on target removal selftests: netconsole: Do not exit from inside the validation function selftests: netconsole: Add support for basic netconsole target format drivers/net/netconsole.c | 67 +++++++++++++++++++--- .../selftests/drivers/net/lib/sh/lib_netcons.sh | 27 +++++++-- .../testing/selftests/drivers/net/netcons_basic.sh | 50 ++++++++++------ 3 files changed, 112 insertions(+), 32 deletions(-) --- base-commit: 2c7e4a2663a1ab5a740c59c31991579b6b865a26 change-id: 20250528-netcons_ext-572982619bea Best regards, -- Breno Leitao <leitao(a)debian.org>

7 months

2
5
0 0

[PATCH RFC net-next v2] page_pool: import Jesper's page_pool benchmark

by Mina Almasry

From: Jesper Dangaard Brouer <hawk(a)kernel.org> We frequently consult with Jesper's out-of-tree page_pool benchmark to evaluate page_pool changes. Import the benchmark into the upstream linux kernel tree so that (a) we're all running the same version, (b) pave the way for shared improvements, and (c) maybe one day integrate it with nipa, if possible. Import bench_page_pool_simple from commit 35b1716d0c30 ("Add page_bench06_walk_all"), from this repository: https://github.com/netoptimizer/prototype-kernel.git Changes done during upstreaming: - Fix checkpatch issues. - Remove the tasklet logic not needed. - Move under tools/testing - Create ksft for the benchmark. - Changed slightly how the benchmark gets built. Out of tree, time_bench is built as an independent .ko. Here it is included in bench_page_pool.ko Steps to run: ``` mkdir -p /tmp/run-pp-bench make -C ./tools/testing/selftests/net/bench make -C ./tools/testing/selftests/net/bench install INSTALL_PATH=/tmp/run-pp-bench rsync --delete -avz --progress /tmp/run-pp-bench mina@$SERVER:~/ ssh mina@$SERVER << EOF cd ~/run-pp-bench && sudo ./test_bench_page_pool.sh EOF ``` Output: ``` (benchmark dmesg logs) Fast path results: no-softirq-page_pool01 Per elem: 11 cycles(tsc) 4.368 ns ptr_ring results: no-softirq-page_pool02 Per elem: 527 cycles(tsc) 195.187 ns slow path results: no-softirq-page_pool03 Per elem: 549 cycles(tsc) 203.466 ns ``` Cc: Jesper Dangaard Brouer <hawk(a)kernel.org> Cc: Ilias Apalodimas <ilias.apalodimas(a)linaro.org> Cc: Jakub Kicinski <kuba(a)kernel.org> Cc: Toke Høiland-Jørgensen <toke(a)toke.dk> Signed-off-by: Mina Almasry <almasrymina(a)google.com> --- v2: - Move under tools/selftests (Jakub) - Create ksft for it. - Remove the tasklet logic no longer needed (Jesper + Toke) RFC discussion points: - Desirable to import it? - Can the benchmark be imported as-is for an initial version? Or needs lots of modifications? - Code location. 
I retained the location in Jesper's tree, but a path like net/core/bench/ may make more sense. --- tools/testing/selftests/net/bench/Makefile | 7 + .../selftests/net/bench/page_pool/Makefile | 17 + .../bench/page_pool/bench_page_pool_simple.c | 275 ++++++++++++ .../bench/page_pool/test_bench_page_pool.sh | 32 ++ .../net/bench/page_pool/time_bench.c | 406 ++++++++++++++++++ .../net/bench/page_pool/time_bench.h | 259 +++++++++++ 6 files changed, 996 insertions(+) create mode 100644 tools/testing/selftests/net/bench/Makefile create mode 100644 tools/testing/selftests/net/bench/page_pool/Makefile create mode 100644 tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c create mode 100755 tools/testing/selftests/net/bench/page_pool/test_bench_page_pool.sh create mode 100644 tools/testing/selftests/net/bench/page_pool/time_bench.c create mode 100644 tools/testing/selftests/net/bench/page_pool/time_bench.h diff --git a/tools/testing/selftests/net/bench/Makefile b/tools/testing/selftests/net/bench/Makefile new file mode 100644 index 000000000000..4ebce5d71b18 --- /dev/null +++ b/tools/testing/selftests/net/bench/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_GEN_MODS_DIR := page_pool + +TEST_PROGS += page_pool/test_bench_page_pool.sh + +include ../../lib.mk diff --git a/tools/testing/selftests/net/bench/page_pool/Makefile b/tools/testing/selftests/net/bench/page_pool/Makefile new file mode 100644 index 000000000000..0549a16ba275 --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/Makefile @@ -0,0 +1,17 @@ +BENCH_PAGE_POOL_SIMPLE_TEST_DIR := $(realpath $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) +KDIR ?= /lib/modules/$(shell uname -r)/build + +ifeq ($(V),1) +Q = +else +Q = @ +endif + +obj-m += bench_page_pool.o +bench_page_pool-y += bench_page_pool_simple.o time_bench.o + +all: + +$(Q)make -C $(KDIR) M=$(BENCH_PAGE_POOL_SIMPLE_TEST_DIR) modules + +clean: + +$(Q)make -C $(KDIR) M=$(BENCH_PAGE_POOL_SIMPLE_TEST_DIR) clean 
diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c new file mode 100644 index 000000000000..53d168cce27d --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c @@ -0,0 +1,275 @@ +/* + * Benchmark module for page_pool. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/mutex.h> + +#include <linux/version.h> +#include <net/page_pool/helpers.h> + +#include <linux/interrupt.h> +#include <linux/limits.h> + +#include "time_bench.h" + +static int verbose = 1; +#define MY_POOL_SIZE 1024 + +static inline void _page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + page_pool_put_page(pool, page, -1, allow_direct); +} + +/* Makes tests selectable. Useful for perf-record to analyze a single test. + * Hint: Bash shells support writing binary number like: $((2#101010) + * + * # modprobe bench_page_pool_simple run_flags=$((2#100)) + */ +static unsigned long run_flags = 0xFFFFFFFF; +module_param(run_flags, ulong, 0); +MODULE_PARM_DESC(run_flags, "Limit which bench test that runs"); +/* Count the bit number from the enum */ +enum benchmark_bit { + bit_run_bench_baseline, + bit_run_bench_no_softirq01, + bit_run_bench_no_softirq02, + bit_run_bench_no_softirq03, +}; +#define bit(b) (1 << (b)) +#define enabled(b) ((run_flags & (bit(b)))) + +/* notice time_bench is limited to U32_MAX nr loops */ +static unsigned long loops = 10000000; +module_param(loops, ulong, 0); +MODULE_PARM_DESC(loops, "Specify loops bench will run"); + +/* Timing at the nanosec level, we need to know the overhead + * introduced by the for loop itself */ +static int time_bench_for_loop(struct time_bench_record *rec, void *data) +{ + uint64_t loops_cnt = 0; + int i; + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + loops_cnt++; + barrier(); /* avoid compiler to 
optimize this loop */ + } + time_bench_stop(rec, loops_cnt); + return loops_cnt; +} + +static int time_bench_atomic_inc(struct time_bench_record *rec, void *data) +{ + uint64_t loops_cnt = 0; + atomic_t cnt; + int i; + + atomic_set(&cnt, 0); + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + atomic_inc(&cnt); + barrier(); /* avoid compiler to optimize this loop */ + } + loops_cnt = atomic_read(&cnt); + time_bench_stop(rec, loops_cnt); + return loops_cnt; +} + +/* The ptr_ping in page_pool uses a spinlock. We need to know the minimum + * overhead of taking+releasing a spinlock, to know the cycles that can be saved + * by e.g. amortizing this via bulking. + */ +static int time_bench_lock(struct time_bench_record *rec, void *data) +{ + uint64_t loops_cnt = 0; + spinlock_t lock; + int i; + + spin_lock_init(&lock); + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + spin_lock(&lock); + loops_cnt++; + barrier(); /* avoid compiler to optimize this loop */ + spin_unlock(&lock); + } + time_bench_stop(rec, loops_cnt); + return loops_cnt; +} + +/* Helper for filling some page's into ptr_ring */ +static void pp_fill_ptr_ring(struct page_pool *pp, int elems) +{ + gfp_t gfp_mask = GFP_ATOMIC; /* GFP_ATOMIC needed when under run softirq */ + struct page **array; + int i; + + array = kzalloc(sizeof(struct page *) * elems, gfp_mask); + + for (i = 0; i < elems; i++) { + array[i] = page_pool_alloc_pages(pp, gfp_mask); + } + for (i = 0; i < elems; i++) { + _page_pool_put_page(pp, array[i], false); + } + + kfree(array); +} + +enum test_type { type_fast_path, type_ptr_ring, type_page_allocator }; + +/* Depends on compile optimizing this function */ +static __always_inline int time_bench_page_pool(struct time_bench_record *rec, + void *data, enum test_type type, + const char *func) +{ + uint64_t loops_cnt = 0; + gfp_t gfp_mask = GFP_ATOMIC; /* GFP_ATOMIC is not really needed */ + int i, err; + + struct 
page_pool *pp; + struct page *page; + + struct page_pool_params pp_params = { + .order = 0, + .flags = 0, + .pool_size = MY_POOL_SIZE, + .nid = NUMA_NO_NODE, + .dev = NULL, /* Only use for DMA mapping */ + .dma_dir = DMA_BIDIRECTIONAL, + }; + + pp = page_pool_create(&pp_params); + if (IS_ERR(pp)) { + err = PTR_ERR(pp); + pr_warn("%s: Error(%d) creating page_pool\n", func, err); + goto out; + } + pp_fill_ptr_ring(pp, 64); + + if (in_serving_softirq()) + pr_warn("%s(): in_serving_softirq fast-path\n", func); + else + pr_warn("%s(): Cannot use page_pool fast-path\n", func); + + time_bench_start(rec); + /** Loop to measure **/ + for (i = 0; i < rec->loops; i++) { + /* Common fast-path alloc, that depend on in_serving_softirq() */ + page = page_pool_alloc_pages(pp, gfp_mask); + if (!page) + break; + loops_cnt++; + barrier(); /* avoid compiler to optimize this loop */ + + /* The benchmarks purpose it to test different return paths. + * Compiler should inline optimize other function calls out + */ + if (type == type_fast_path) { + /* Fast-path recycling e.g. 
XDP_DROP use-case */ + page_pool_recycle_direct(pp, page); + + } else if (type == type_ptr_ring) { + /* Normal return path */ + _page_pool_put_page(pp, page, false); + + } else if (type == type_page_allocator) { + /* Test if not pages are recycled, but instead + * returned back into systems page allocator + */ + get_page(page); /* cause no-recycling */ + _page_pool_put_page(pp, page, false); + put_page(page); + } else { + BUILD_BUG(); + } + } + time_bench_stop(rec, loops_cnt); +out: + page_pool_destroy(pp); + return loops_cnt; +} + +static int time_bench_page_pool01_fast_path(struct time_bench_record *rec, + void *data) +{ + return time_bench_page_pool(rec, data, type_fast_path, __func__); +} + +static int time_bench_page_pool02_ptr_ring(struct time_bench_record *rec, + void *data) +{ + return time_bench_page_pool(rec, data, type_ptr_ring, __func__); +} + +static int time_bench_page_pool03_slow(struct time_bench_record *rec, + void *data) +{ + return time_bench_page_pool(rec, data, type_page_allocator, __func__); +} + +static int run_benchmark_tests(void) +{ + uint32_t nr_loops = loops; + int passed_count = 0; + + /* Baseline tests */ + if (enabled(bit_run_bench_baseline)) { + time_bench_loop(nr_loops * 10, 0, "for_loop", NULL, + time_bench_for_loop); + time_bench_loop(nr_loops * 10, 0, "atomic_inc", NULL, + time_bench_atomic_inc); + time_bench_loop(nr_loops, 0, "lock", NULL, time_bench_lock); + } + + /* This test cannot activate correct code path, due to no-softirq ctx */ + if (enabled(bit_run_bench_no_softirq01)) + time_bench_loop(nr_loops, 0, "no-softirq-page_pool01", NULL, + time_bench_page_pool01_fast_path); + if (enabled(bit_run_bench_no_softirq02)) + time_bench_loop(nr_loops, 0, "no-softirq-page_pool02", NULL, + time_bench_page_pool02_ptr_ring); + if (enabled(bit_run_bench_no_softirq03)) + time_bench_loop(nr_loops, 0, "no-softirq-page_pool03", NULL, + time_bench_page_pool03_slow); + + return passed_count; +} + +static int __init 
bench_page_pool_simple_module_init(void) +{ + if (verbose) + pr_info("Loaded\n"); + + if (loops > U32_MAX) { + pr_err("Module param loops(%lu) exceeded U32_MAX(%u)\n", loops, + U32_MAX); + return -ECHRNG; + } + + run_benchmark_tests(); + + return 0; +} +module_init(bench_page_pool_simple_module_init); + +static void __exit bench_page_pool_simple_module_exit(void) +{ + if (verbose) + pr_info("Unloaded\n"); +} +module_exit(bench_page_pool_simple_module_exit); + +MODULE_DESCRIPTION("Benchmark of page_pool simple cases"); +MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer(a)brouer.com>"); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/selftests/net/bench/page_pool/test_bench_page_pool.sh b/tools/testing/selftests/net/bench/page_pool/test_bench_page_pool.sh new file mode 100755 index 000000000000..5eb48f28b659 --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/test_bench_page_pool.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# + +set -e + +DRIVER="./page_pool/bench_page_pool.ko" +result="" + +function run_test() +{ + rmmod "bench_page_pool.ko" || true + insmod $DRIVER > /dev/null 2>&1 + result=$(dmesg | tail -10) + echo "$result" + + echo + echo "Fast path results:" + echo ${result} | grep -o -E "no-softirq-page_pool01 Per elem: ([0-9]+) cycles\(tsc\) ([0-9]+\.[0-9]+) ns" + + echo + echo "ptr_ring results:" + echo ${result} | grep -o -E "no-softirq-page_pool02 Per elem: ([0-9]+) cycles\(tsc\) ([0-9]+\.[0-9]+) ns" + + echo + echo "slow path results:" + echo ${result} | grep -o -E "no-softirq-page_pool03 Per elem: ([0-9]+) cycles\(tsc\) ([0-9]+\.[0-9]+) ns" +} + +run_test + +exit 0 diff --git a/tools/testing/selftests/net/bench/page_pool/time_bench.c b/tools/testing/selftests/net/bench/page_pool/time_bench.c new file mode 100644 index 000000000000..257b1515c64e --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/time_bench.c @@ -0,0 +1,406 @@ +/* + * Benchmarking code execution time inside the kernel + * + * Copyright (C) 
2014, Red Hat, Inc., Jesper Dangaard Brouer + * for licensing details see kernel-base/COPYING + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/time.h> + +#include <linux/perf_event.h> /* perf_event_create_kernel_counter() */ + +/* For concurrency testing */ +#include <linux/completion.h> +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> + +#include "time_bench.h" + +static int verbose = 1; + +/** TSC (Time-Stamp Counter) based ** + * See: linux/time_bench.h + * tsc_start_clock() and tsc_stop_clock() + */ + +/** Wall-clock based ** + */ + +/** PMU (Performance Monitor Unit) based ** + */ +#define PERF_FORMAT \ + (PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | \ + PERF_FORMAT_TOTAL_TIME_RUNNING) + +struct raw_perf_event { + uint64_t config; /* event */ + uint64_t config1; /* umask */ + struct perf_event *save; + char *desc; +}; + +/* if HT is enable a maximum of 4 events (5 if one is instructions + * retired can be specified, if HT is disabled a maximum of 8 (9 if + * one is instructions retired) can be specified. + * + * From Table 19-1. Architectural Performance Events + * Architectures Software Developer’s Manual Volume 3: System Programming Guide + */ +struct raw_perf_event perf_events[] = { + { 0x3c, 0x00, NULL, "Unhalted CPU Cycles" }, + { 0xc0, 0x00, NULL, "Instruction Retired" } +}; + +#define NUM_EVTS (sizeof(perf_events) / sizeof(struct raw_perf_event)) + +/* WARNING: PMU config is currently broken! 
+ */ +bool time_bench_PMU_config(bool enable) +{ + int i; + struct perf_event_attr perf_conf; + struct perf_event *perf_event; + int cpu; + + preempt_disable(); + cpu = smp_processor_id(); + pr_info("DEBUG: cpu:%d\n", cpu); + preempt_enable(); + + memset(&perf_conf, 0, sizeof(struct perf_event_attr)); + perf_conf.type = PERF_TYPE_RAW; + perf_conf.size = sizeof(struct perf_event_attr); + perf_conf.read_format = PERF_FORMAT; + perf_conf.pinned = 1; + perf_conf.exclude_user = 1; /* No userspace events */ + perf_conf.exclude_kernel = 0; /* Only kernel events */ + + for (i = 0; i < NUM_EVTS; i++) { + perf_conf.disabled = enable; + //perf_conf.disabled = (i == 0) ? 1 : 0; + perf_conf.config = perf_events[i].config; + perf_conf.config1 = perf_events[i].config1; + if (verbose) + pr_info("%s() enable PMU counter: %s\n", + __func__, perf_events[i].desc); + perf_event = perf_event_create_kernel_counter(&perf_conf, cpu, + NULL /* task */, + NULL /* overflow_handler*/, + NULL /* context */); + if (perf_event) { + perf_events[i].save = perf_event; + pr_info("%s():DEBUG perf_event success\n", __func__); + + perf_event_enable(perf_event); + } else { + pr_info("%s():DEBUG perf_event is NULL\n", __func__); + } + } + + return true; +} + +/** Generic functions ** + */ + +/* Calculate stats, store results in record */ +bool time_bench_calc_stats(struct time_bench_record *rec) +{ +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ + uint64_t ns_per_call_tmp_rem = 0; + uint32_t ns_per_call_remainder = 0; + uint64_t pmc_ipc_tmp_rem = 0; + uint32_t pmc_ipc_remainder = 0; + uint32_t pmc_ipc_div = 0; + uint32_t invoked_cnt_precision = 0; + uint32_t invoked_cnt = 0; /* 32-bit due to div_u64_rem() */ + + if (rec->flags & TIME_BENCH_LOOP) { + if (rec->invoked_cnt < 1000) { + pr_err("ERR: need more(>1000) loops(%llu) for timing\n", + rec->invoked_cnt); + return false; + } + if (rec->invoked_cnt > ((1ULL << 32) - 1)) { + /* div_u64_rem() can only support div with 32bit*/ + pr_err("ERR: Invoke 
cnt(%llu) too big overflow 32bit\n", + rec->invoked_cnt); + return false; + } + invoked_cnt = (uint32_t)rec->invoked_cnt; + } + + /* TSC (Time-Stamp Counter) records */ + if (rec->flags & TIME_BENCH_TSC) { + rec->tsc_interval = rec->tsc_stop - rec->tsc_start; + if (rec->tsc_interval == 0) { + pr_err("ABORT: timing took ZERO TSC time\n"); + return false; + } + /* Calculate stats */ + if (rec->flags & TIME_BENCH_LOOP) + rec->tsc_cycles = rec->tsc_interval / invoked_cnt; + else + rec->tsc_cycles = rec->tsc_interval; + } + + /* Wall-clock time calc */ + if (rec->flags & TIME_BENCH_WALLCLOCK) { + rec->time_start = rec->ts_start.tv_nsec + + (NANOSEC_PER_SEC * rec->ts_start.tv_sec); + rec->time_stop = rec->ts_stop.tv_nsec + + (NANOSEC_PER_SEC * rec->ts_stop.tv_sec); + rec->time_interval = rec->time_stop - rec->time_start; + if (rec->time_interval == 0) { + pr_err("ABORT: timing took ZERO wallclock time\n"); + return false; + } + /* Calculate stats */ + /*** Division in kernel it tricky ***/ + /* Orig: time_sec = (time_interval / NANOSEC_PER_SEC); */ + /* remainder only correct because NANOSEC_PER_SEC is 10^9 */ + rec->time_sec = div_u64_rem(rec->time_interval, NANOSEC_PER_SEC, + &rec->time_sec_remainder); + //TODO: use existing struct timespec records instead of div? + + if (rec->flags & TIME_BENCH_LOOP) { + /*** Division in kernel it tricky ***/ + /* Orig: ns = ((double)time_interval / invoked_cnt); */ + /* First get quotient */ + rec->ns_per_call_quotient = + div_u64_rem(rec->time_interval, invoked_cnt, + &ns_per_call_remainder); + /* Now get decimals .xxx precision (incorrect roundup)*/ + ns_per_call_tmp_rem = ns_per_call_remainder; + invoked_cnt_precision = invoked_cnt / 1000; + if (invoked_cnt_precision > 0) { + rec->ns_per_call_decimal = + div_u64_rem(ns_per_call_tmp_rem, + invoked_cnt_precision, + &ns_per_call_remainder); + } + } + } + + /* Performance Monitor Unit (PMU) counters */ + if (rec->flags & TIME_BENCH_PMU) { + //FIXME: Overflow handling??? 
+ rec->pmc_inst = rec->pmc_inst_stop - rec->pmc_inst_start; + rec->pmc_clk = rec->pmc_clk_stop - rec->pmc_clk_start; + + /* Calc Instruction Per Cycle (IPC) */ + /* First get quotient */ + rec->pmc_ipc_quotient = div_u64_rem(rec->pmc_inst, rec->pmc_clk, + &pmc_ipc_remainder); + /* Now get decimals .xxx precision (incorrect roundup)*/ + pmc_ipc_tmp_rem = pmc_ipc_remainder; + pmc_ipc_div = rec->pmc_clk / 1000; + if (pmc_ipc_div > 0) { + rec->pmc_ipc_decimal = div_u64_rem(pmc_ipc_tmp_rem, + pmc_ipc_div, + &pmc_ipc_remainder); + } + } + + return true; +} + +/* Generic function for invoking a loop function and calculating + * execution time stats. The function being called/timed is assumed + * to perform a tight loop, and update the timing record struct. + */ +bool time_bench_loop(uint32_t loops, int step, char *txt, void *data, + int (*func)(struct time_bench_record *record, void *data)) +{ + struct time_bench_record rec; + + /* Setup record */ + memset(&rec, 0, sizeof(rec)); /* zero func might not update all */ + rec.version_abi = 1; + rec.loops = loops; + rec.step = step; + rec.flags = (TIME_BENCH_LOOP|TIME_BENCH_TSC|TIME_BENCH_WALLCLOCK); +// rec.flags = (TIME_BENCH_LOOP|TIME_BENCH_TSC| +// TIME_BENCH_WALLCLOCK|TIME_BENCH_PMU); + //TODO: Add/copy txt to rec + + /*** Loop function being timed ***/ + if (!func(&rec, data)) { + pr_err("ABORT: function being timed failed\n"); + return false; + } + + if (rec.invoked_cnt < loops) + pr_warn("WARNING: Invoke count(%llu) smaller than loops(%d)\n", + rec.invoked_cnt, loops); + + /* Calculate stats */ + time_bench_calc_stats(&rec); + + pr_info("Type:%s Per elem: %llu cycles(tsc) %llu.%03llu ns (step:%d)" + " - (measurement period time:%llu.%09u sec time_interval:%llu)" + " - (invoke count:%llu tsc_interval:%llu)\n", + txt, rec.tsc_cycles, + rec.ns_per_call_quotient, rec.ns_per_call_decimal, rec.step, + rec.time_sec, rec.time_sec_remainder, rec.time_interval, + rec.invoked_cnt, rec.tsc_interval); +/* pr_info("DEBUG check is 
%llu/%llu == %llu.%03llu ?\n", + rec.time_interval, rec.invoked_cnt, + rec.ns_per_call_quotient, rec.ns_per_call_decimal); +*/ + if (rec.flags & TIME_BENCH_PMU) { + pr_info("Type:%s PMU inst/clock" + "%llu/%llu = %llu.%03llu IPC (inst per cycle)\n", + txt, rec.pmc_inst, rec.pmc_clk, + rec.pmc_ipc_quotient, rec.pmc_ipc_decimal); + } + return true; +} + +/* Function getting invoked by kthread */ +static int invoke_test_on_cpu_func(void *private) +{ + struct time_bench_cpu *cpu = private; + struct time_bench_sync *sync = cpu->sync; + cpumask_t newmask = CPU_MASK_NONE; + void *data = cpu->data; + + /* Restrict CPU */ + cpumask_set_cpu(cpu->rec.cpu, &newmask); + set_cpus_allowed_ptr(current, &newmask); + + /* Synchronize start of concurrency test */ + atomic_inc(&sync->nr_tests_running); + wait_for_completion(&sync->start_event); + + /* Start benchmark function */ + if (!cpu->bench_func(&cpu->rec, data)) { + pr_err("ERROR: function being timed failed on CPU:%d(%d)\n", + cpu->rec.cpu, smp_processor_id()); + } else { + if (verbose) + pr_info("SUCCESS: ran on CPU:%d(%d)\n", cpu->rec.cpu, + smp_processor_id()); + } + cpu->did_bench_run = true; + + /* End test */ + atomic_dec(&sync->nr_tests_running); + /* Wait for kthread_stop() telling us to stop */ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void time_bench_print_stats_cpumask(const char *desc, + struct time_bench_cpu *cpu_tasks, + const struct cpumask *mask) +{ + uint64_t average = 0; + int cpu; + int step = 0; + struct sum { + uint64_t tsc_cycles; + int records; + } sum = { 0 }; + + /* Get stats */ + for_each_cpu(cpu, mask) { + struct time_bench_cpu *c = &cpu_tasks[cpu]; + struct time_bench_record *rec = &c->rec; + + /* Calculate stats */ + time_bench_calc_stats(rec); + + pr_info("Type:%s CPU(%d) %llu cycles(tsc) %llu.%03llu ns" + " (step:%d)" + " - (measurement period time:%llu.%09u sec time_interval:%llu)" + " - 
(invoke count:%llu tsc_interval:%llu)\n", + desc, cpu, rec->tsc_cycles, rec->ns_per_call_quotient, + rec->ns_per_call_decimal, rec->step, rec->time_sec, + rec->time_sec_remainder, rec->time_interval, + rec->invoked_cnt, rec->tsc_interval); + + /* Collect average */ + sum.records++; + sum.tsc_cycles += rec->tsc_cycles; + step = rec->step; + } + + if (sum.records) /* avoid div-by-zero */ + average = sum.tsc_cycles / sum.records; + pr_info("Sum Type:%s Average: %llu cycles(tsc) CPUs:%d step:%d\n", desc, + average, sum.records, step); +} + +void time_bench_run_concurrent( + uint32_t loops, int step, void *data, + const struct cpumask *mask, /* Support masking outsome CPUs*/ + struct time_bench_sync *sync, struct time_bench_cpu *cpu_tasks, + int (*func)(struct time_bench_record *record, void *data)) +{ + int cpu, running = 0; + + if (verbose) // DEBUG + pr_warn("%s() Started on CPU:%d\n", __func__, + smp_processor_id()); + + /* Reset sync conditions */ + atomic_set(&sync->nr_tests_running, 0); + init_completion(&sync->start_event); + + /* Spawn off jobs on all CPUs */ + for_each_cpu(cpu, mask) { + struct time_bench_cpu *c = &cpu_tasks[cpu]; + + running++; + c->sync = sync; /* Send sync variable along */ + c->data = data; /* Send opaque along */ + + /* Init benchmark record */ + memset(&c->rec, 0, sizeof(struct time_bench_record)); + c->rec.version_abi = 1; + c->rec.loops = loops; + c->rec.step = step; + c->rec.flags = (TIME_BENCH_LOOP|TIME_BENCH_TSC| + TIME_BENCH_WALLCLOCK); + c->rec.cpu = cpu; + c->bench_func = func; + c->task = kthread_run(invoke_test_on_cpu_func, c, + "time_bench%d", cpu); + if (IS_ERR(c->task)) { + pr_err("%s(): Failed to start test func\n", __func__); + return; /* Argh, what about cleanup?! 
*/ + } + } + + /* Wait until all processes are running */ + while (atomic_read(&sync->nr_tests_running) < running) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(10); + } + /* Kick off all CPU concurrently on completion event */ + complete_all(&sync->start_event); + + /* Wait for CPUs to finish */ + while (atomic_read(&sync->nr_tests_running)) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(10); + } + + /* Stop the kthreads */ + for_each_cpu(cpu, mask) { + struct time_bench_cpu *c = &cpu_tasks[cpu]; + kthread_stop(c->task); + } + + if (verbose) // DEBUG - happens often, finish on another CPU + pr_warn("%s() Finished on CPU:%d\n", __func__, + smp_processor_id()); +} diff --git a/tools/testing/selftests/net/bench/page_pool/time_bench.h b/tools/testing/selftests/net/bench/page_pool/time_bench.h new file mode 100644 index 000000000000..7331b5789490 --- /dev/null +++ b/tools/testing/selftests/net/bench/page_pool/time_bench.h @@ -0,0 +1,259 @@ +/* + * Benchmarking code execution time inside the kernel + * + * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer + * for licensing details see kernel-base/COPYING + */ +#ifndef _LINUX_TIME_BENCH_H +#define _LINUX_TIME_BENCH_H + +/* Main structure used for recording a benchmark run */ +struct time_bench_record { + uint32_t version_abi; + uint32_t loops; /* Requested loop invocations */ + uint32_t step; /* option for e.g. 
bulk invocations */ + + uint32_t flags; /* Measurements types enabled */ +#define TIME_BENCH_LOOP (1<<0) +#define TIME_BENCH_TSC (1<<1) +#define TIME_BENCH_WALLCLOCK (1<<2) +#define TIME_BENCH_PMU (1<<3) + + uint32_t cpu; /* Used when embedded in time_bench_cpu */ + + /* Records */ + uint64_t invoked_cnt; /* Returned actual invocations */ + uint64_t tsc_start; + uint64_t tsc_stop; + struct timespec64 ts_start; + struct timespec64 ts_stop; + /** PMU counters for instruction and cycles + * instructions counter including pipelined instructions */ + uint64_t pmc_inst_start; + uint64_t pmc_inst_stop; + /* CPU unhalted clock counter */ + uint64_t pmc_clk_start; + uint64_t pmc_clk_stop; + + /* Result records */ + uint64_t tsc_interval; + uint64_t time_start, time_stop, time_interval; /* in nanosec */ + uint64_t pmc_inst, pmc_clk; + + /* Derived result records */ + uint64_t tsc_cycles; // +decimal? + uint64_t ns_per_call_quotient, ns_per_call_decimal; + uint64_t time_sec; + uint32_t time_sec_remainder; + uint64_t pmc_ipc_quotient, pmc_ipc_decimal; /* inst per cycle */ +}; + +/* For synchronizing parallel CPUs to run concurrently */ +struct time_bench_sync { + atomic_t nr_tests_running; + struct completion start_event; +}; + +/* Keep track of CPUs executing our bench function. + * + * Embed a time_bench_record for storing info per cpu + */ +struct time_bench_cpu { + struct time_bench_record rec; + struct time_bench_sync *sync; /* back ptr */ + struct task_struct *task; + /* "data" opaque could have been placed in time_bench_sync, + * but to avoid any false sharing, place it per CPU + */ + void *data; + /* Support masking outsome CPUs, mark if it ran */ + bool did_bench_run; + /* int cpu; // note CPU stored in time_bench_record */ + int (*bench_func)(struct time_bench_record *record, void *data); +}; + +/* + * Below TSC assembler code is not compatible with other archs, and + * can also fail on guests if cpu-flags are not correct. 
+ * + * The way TSC reading is used, many iterations, does not require as + * high accuracy as described below (in Intel Doc #324264). + * + * Considering changing to use get_cycles() (#include <asm/timex.h>). + */ + +/** TSC (Time-Stamp Counter) based ** + * Recommend reading, to understand details of reading TSC accurately: + * Intel Doc #324264, "How to Benchmark Code Execution Times on Intel" + * + * Consider getting exclusive ownership of CPU by using: + * unsigned long flags; + * preempt_disable(); + * raw_local_irq_save(flags); + * _your_code_ + * raw_local_irq_restore(flags); + * preempt_enable(); + * + * Clobbered registers: "%rax", "%rbx", "%rcx", "%rdx" + * RDTSC only change "%rax" and "%rdx" but + * CPUID clears the high 32-bits of all (rax/rbx/rcx/rdx) + */ +static __always_inline uint64_t tsc_start_clock(void) +{ + /* See: Intel Doc #324264 */ + unsigned hi, lo; + asm volatile("CPUID\n\t" + "RDTSC\n\t" + "mov %%edx, %0\n\t" + "mov %%eax, %1\n\t" + : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); + //FIXME: on 32bit use clobbered %eax + %edx + return ((uint64_t)lo) | (((uint64_t)hi) << 32); +} + +static __always_inline uint64_t tsc_stop_clock(void) +{ + /* See: Intel Doc #324264 */ + unsigned hi, lo; + asm volatile("RDTSCP\n\t" + "mov %%edx, %0\n\t" + "mov %%eax, %1\n\t" + "CPUID\n\t" + : "=r"(hi), "=r"(lo)::"%rax", "%rbx", "%rcx", "%rdx"); + return ((uint64_t)lo) | (((uint64_t)hi) << 32); +} + +/* Notes for RDTSC and RDTSCP + * + * Hannes found out that __builtin_ia32_rdtsc and + * __builtin_ia32_rdtscp are undocumented available in gcc, so there + * is no need to write inline assembler functions for them any more. + * + * unsigned long long __builtin_ia32_rdtscp(unsigned int *foo); + * (where foo is set to: numa_node << 12 | cpu) + * and + * unsigned long long __builtin_ia32_rdtsc(void); + * + * Above we combine the calls with CPUID, thus I don't see how this is + * directly appreciable. 
+ */ + +/* +inline uint64_t rdtsc(void) +{ + uint32_t low, high; + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + return low | (((uint64_t )high ) << 32); +} +*/ + +/** Wall-clock based ** + * + * use: getnstimeofday() + * getnstimeofday(&rec->ts_start); + * getnstimeofday(&rec->ts_stop); + * + * API changed see: Documentation/core-api/timekeeping.rst + * https://www.kernel.org/doc/html/latest/core-api/timekeeping.html#c.getnstim… + * + * We should instead use: ktime_get_real_ts64() is a direct + * replacement, but consider using monotonic time (ktime_get_ts64()) + * and/or a ktime_t based interface (ktime_get()/ktime_get_real()). + */ + +/** PMU (Performance Monitor Unit) based ** + * + * Needed for calculating: Instructions Per Cycle (IPC) + * - The IPC number tell how efficient the CPU pipelining were + */ +//lookup: perf_event_create_kernel_counter() + +bool time_bench_PMU_config(bool enable); + +/* Raw reading via rdpmc() using fixed counters + * + * From: https://github.com/andikleen/simple-pmu + */ +enum { + FIXED_SELECT = (1U << 30), /* == 0x40000000 */ + FIXED_INST_RETIRED_ANY = 0, + FIXED_CPU_CLK_UNHALTED_CORE = 1, + FIXED_CPU_CLK_UNHALTED_REF = 2, +}; + +static __always_inline unsigned long long p_rdpmc(unsigned in) +{ + unsigned d, a; + + asm volatile("rdpmc" : "=d"(d), "=a"(a) : "c"(in) : "memory"); + return ((unsigned long long)d << 32) | a; +} + +/* These PMU counter needs to be enabled, but I don't have the + * configure code implemented. 
My current hack is running: + * sudo perf stat -e cycles:k -e instructions:k insmod lib/ring_queue_test.ko + */ +/* Reading all pipelined instruction */ +static __always_inline unsigned long long pmc_inst(void) +{ + return p_rdpmc(FIXED_SELECT | FIXED_INST_RETIRED_ANY); +} + +/* Reading CPU clock cycles */ +static __always_inline unsigned long long pmc_clk(void) +{ + return p_rdpmc(FIXED_SELECT | FIXED_CPU_CLK_UNHALTED_CORE); +} + +/* Raw reading via MSR rdmsr() is likely wrong + * FIXME: How can I know which raw MSR registers are conf for what? + */ +#define MSR_IA32_PCM0 0x400000C1 /* PERFCTR0 */ +#define MSR_IA32_PCM1 0x400000C2 /* PERFCTR1 */ +#define MSR_IA32_PCM2 0x400000C3 +static inline uint64_t msr_inst(unsigned long long *msr_result) +{ + return rdmsrl_safe(MSR_IA32_PCM0, msr_result); +} + +/** Generic functions ** + */ +bool time_bench_loop(uint32_t loops, int step, char *txt, void *data, + int (*func)(struct time_bench_record *rec, void *data)); +bool time_bench_calc_stats(struct time_bench_record *rec); + +void time_bench_run_concurrent( + uint32_t loops, int step, void *data, + const struct cpumask *mask, /* Support masking outsome CPUs*/ + struct time_bench_sync *sync, struct time_bench_cpu *cpu_tasks, + int (*func)(struct time_bench_record *record, void *data)); +void time_bench_print_stats_cpumask(const char *desc, + struct time_bench_cpu *cpu_tasks, + const struct cpumask *mask); + +//FIXME: use rec->flags to select measurement, should be MACRO +static __always_inline void time_bench_start(struct time_bench_record *rec) +{ + //getnstimeofday(&rec->ts_start); + ktime_get_real_ts64(&rec->ts_start); + if (rec->flags & TIME_BENCH_PMU) { + rec->pmc_inst_start = pmc_inst(); + rec->pmc_clk_start = pmc_clk(); + } + rec->tsc_start = tsc_start_clock(); +} + +static __always_inline void time_bench_stop(struct time_bench_record *rec, + uint64_t invoked_cnt) +{ + rec->tsc_stop = tsc_stop_clock(); + if (rec->flags & TIME_BENCH_PMU) { + rec->pmc_inst_stop = 
pmc_inst(); + rec->pmc_clk_stop = pmc_clk(); + } + //getnstimeofday(&rec->ts_stop); + ktime_get_real_ts64(&rec->ts_stop); + rec->invoked_cnt = invoked_cnt; +} + +#endif /* _LINUX_TIME_BENCH_H */ base-commit: ea15e046263b19e91ffd827645ae5dfa44ebd044 -- 2.49.0.1151.ga128411c76-goog

7 months

5
10
0 0

[PATCH] kunit: qemu_configs: Add riscv32 config

by Thomas Weißschuh

Add a basic config to run kunit tests on riscv32. Signed-off-by: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de> --- tools/testing/kunit/qemu_configs/riscv32.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/testing/kunit/qemu_configs/riscv32.py b/tools/testing/kunit/qemu_configs/riscv32.py new file mode 100644 index 0000000000000000000000000000000000000000..b79ba0ae30f8573035b3401be337b379eba97e26 --- /dev/null +++ b/tools/testing/kunit/qemu_configs/riscv32.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 + +from ..qemu_config import QemuArchParams + +QEMU_ARCH = QemuArchParams(linux_arch='riscv', + kconfig=''' +CONFIG_NONPORTABLE=y +CONFIG_ARCH_RV32I=y +CONFIG_ARCH_VIRT=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +''', + qemu_arch='riscv32', + kernel_path='arch/riscv/boot/Image', + kernel_command_line='console=ttyS0', + extra_qemu_params=['-machine', 'virt']) --- base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8 change-id: 20250214-kunit-qemu-riscv32-fb38d659c373 Best regards, -- Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>

7 months

3
2
0 0

[PATCH net-next 14/14] selftests: forwarding: Add a test for verifying VXLAN MC underlay

by Petr Machata

Add tests for MC-routing underlay VXLAN traffic. Signed-off-by: Petr Machata <petrm(a)nvidia.com> --- Notes: CC: Shuah Khan <shuah(a)kernel.org> CC: linux-kselftest(a)vger.kernel.org .../testing/selftests/net/forwarding/Makefile | 1 + .../net/forwarding/vxlan_bridge_1q_mc_ul.sh | 757 ++++++++++++++++++ 2 files changed, 758 insertions(+) create mode 100755 tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile index 00bde7b6f39e..d7bb2e80e88c 100644 --- a/tools/testing/selftests/net/forwarding/Makefile +++ b/tools/testing/selftests/net/forwarding/Makefile @@ -102,6 +102,7 @@ TEST_PROGS = bridge_fdb_learning_limit.sh \ vxlan_bridge_1d_port_8472.sh \ vxlan_bridge_1d.sh \ vxlan_bridge_1q_ipv6.sh \ + vxlan_bridge_1q_mc_ul.sh \ vxlan_bridge_1q_port_8472_ipv6.sh \ vxlan_bridge_1q_port_8472.sh \ vxlan_bridge_1q.sh \ diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh new file mode 100755 index 000000000000..e01e7ccf2c8d --- /dev/null +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_mc_ul.sh @@ -0,0 +1,757 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +-----------------------------------------+ +# | + $h1.10 + $h1.20 | +# | | 192.0.2.1/28 | 2001:db8:1::1/64 | +# | \________ ________/ | +# | \ / | +# | + $h1 H1 (vrf) | +# +-----------|-----------------------------+ +# | +# +-----------|----------------------------------------------------------------+ +# | +---------|--------------------------------------+ SWITCH (main vrf) | +# | | + $swp1 BR1 (802.1q) | | +# | | vid 10 20 | | +# | | | | +# | | + vx10 (vxlan) + vx20 (vxlan) | + lo10 (dummy) | +# | | local 192.0.2.100 local 2001:db8:4::1 | 192.0.2.100/28 | +# | | group 233.252.0.1 group ff0e::1:2:3 | 2001:db8:4::1/64 | +# | | id 1000 id 2000 | | +# | | vid 10 pvid untagged vid 20 pvid untagged | 
| +# | +------------------------------------------------+ | +# | | +# | + $swp2 $swp3 + | +# | | 192.0.2.33/28 192.0.2.65/28 | | +# | | 2001:db8:2::1/64 2001:db8:3::1/64 | | +# | | | | +# +---|--------------------------------------------------------------------|---+ +# | | +# +---|--------------------------------+ +--------------------------------|---+ +# | | H2 (vrf) | | H3 (vrf) | | +# | +-|----------------------------+ | | +-----------------------------|-+ | +# | | + $h2 BR2 (802.1d) | | | | BR3 (802.1d) $h3 + | | +# | | | | | | | | +# | | + v1$h2 (veth) | | | | v1$h3 (veth) + | | +# | +-|----------------------------+ | | +-----------------------------|-+ | +# | | | | | | +# +---|--------------------------------+ +--------------------------------|---+ +# | | +# +---|--------------------------------+ +--------------------------------|---+ +# | + v2$h2 (veth) NS2 (netns) | | NS3 (netns) v2$h3 (veth) + | +# | 192.0.2.34/28 | | 192.0.2.66/28 | +# | 2001:db8:2::2/64 | | 2001:db8:3::2/64 | +# | | | | +# | +--------------------------------+ | | +--------------------------------+ | +# | | BR1 (802.1q) | | | | BR1 (802.1q) | | +# | | + vx10 (vxlan) | | | | + vx10 (vxlan) | | +# | | local 192.0.2.34 | | | | local 192.0.2.50 | | +# | | group 233.252.0.1 dev v2$h2 | | | | group 233.252.0.1 dev v2$h3 | | +# | | id 1000 dstport $VXPORT | | | | id 1000 dstport $VXPORT | | +# | | vid 10 pvid untagged | | | | vid 10 pvid untagged | | +# | | | | | | | | +# | | + vx20 (vxlan) | | | | + vx20 (vxlan) | | +# | | local 2001:db8:2::2 | | | | local 2001:db8:3::2 | | +# | | group ff0e::1:2:3 dev v2$h2 | | | | group ff0e::1:2:3 dev v2$h3 | | +# | | id 2000 dstport $VXPORT | | | | id 2000 dstport $VXPORT | | +# | | vid 20 pvid untagged | | | | vid 20 pvid untagged | | +# | | | | | | | | +# | | + w1 (veth) | | | | + w1 (veth) | | +# | | | vid 10 20 | | | | | vid 10 20 | | +# | +--|-----------------------------+ | | +--|-----------------------------+ | +# | | | | | | +# | 
+--|-----------------------------+ | | +--|-----------------------------+ | +# | | + w2 (veth) VW2 (vrf) | | | | + w2 (veth) VW2 (vrf) | | +# | | |\ | | | | |\ | | +# | | | + w2.10 | | | | | + w2.10 | | +# | | | 192.0.2.3/28 | | | | | 192.0.2.4/28 | | +# | | | | | | | | | | +# | | + w2.20 | | | | + w2.20 | | +# | | 2001:db8:1::3/64 | | | | 2001:db8:1::4/64 | | +# | +--------------------------------+ | | +--------------------------------+ | +# +------------------------------------+ +------------------------------------+ + +: "${VXPORT:=4789}" +export VXPORT + +: "${GROUP4:=233.252.0.1}" +export GROUP4 + +: "${GROUP6:=ff0e::1:2:3}" +export GROUP6 + +: "${IPMR:=lo10}" + +ALL_TESTS=" + ipv4_nomcroute + ipv4_mcroute + ipv4_mcroute_changelink + ipv4_mcroute_starg + ipv4_mcroute_noroute + ipv4_mcroute_fdb + ipv4_mcroute_fdb_oif0 + ipv4_mcroute_fdb_oif0_sep + + ipv6_nomcroute + ipv6_mcroute + ipv6_mcroute_changelink + ipv6_mcroute_starg + ipv6_mcroute_noroute + ipv6_mcroute_fdb + ipv6_mcroute_fdb_oif0 + + ipv4_nomcroute_rx + ipv4_mcroute_rx + ipv4_mcroute_starg_rx + ipv4_mcroute_fdb_oif0_sep_rx + ipv4_mcroute_fdb_sep_rx + + ipv6_nomcroute_rx + ipv6_mcroute_rx + ipv6_mcroute_starg_rx + ipv6_mcroute_fdb_sep_rx +" + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 + defer simple_if_fini $h1 + + ip_link_add $h1.10 master v$h1 link $h1 type vlan id 10 + ip_link_set_up $h1.10 + ip_addr_add $h1.10 192.0.2.1/28 + + ip_link_add $h1.20 master v$h1 link $h1 type vlan id 20 + ip_link_set_up $h1.20 + ip_addr_add $h1.20 2001:db8:1::1/64 +} + +install_capture() +{ + local dev=$1; shift + + tc qdisc add dev $dev clsact + defer tc qdisc del dev $dev clsact + + tc filter add dev $dev ingress proto ip pref 104 \ + flower skip_hw ip_proto udp dst_port $VXPORT \ + action pass + defer tc filter del dev $dev ingress proto ip pref 104 + + tc filter add dev $dev ingress proto ipv6 pref 106 \ + flower skip_hw ip_proto udp dst_port $VXPORT \ + action pass + defer tc filter del dev 
$dev ingress proto ipv6 pref 106 +} + +h2_create() +{ + # $h2 + ip_link_set_up $h2 + + # H2 + vrf_create v$h2 + defer vrf_destroy v$h2 + + ip_link_set_up v$h2 + + # br2 + ip_link_add br2 type bridge vlan_filtering 0 mcast_snooping 0 + ip_link_set_master br2 v$h2 + ip_link_set_up br2 + + # $h2 + ip_link_set_master $h2 br2 + install_capture $h2 + + # v1$h2 + ip_link_set_up v1$h2 + ip_link_set_master v1$h2 br2 +} + +h3_create() +{ + # $h3 + ip_link_set_up $h3 + + # H3 + vrf_create v$h3 + defer vrf_destroy v$h3 + + ip_link_set_up v$h3 + + # br3 + ip_link_add br3 type bridge vlan_filtering 0 mcast_snooping 0 + ip_link_set_master br3 v$h3 + ip_link_set_up br3 + + # $h3 + ip_link_set_master $h3 br3 + install_capture $h3 + + # v1$h3 + ip_link_set_up v1$h3 + ip_link_set_master v1$h3 br3 +} + +switch_create() +{ + # br1 + ip_link_add br1 type bridge vlan_filtering 1 \ + vlan_default_pvid 0 mcast_snooping 0 + ip_link_set_addr br1 $(mac_get $swp1) + ip_link_set_up br1 + + # A dummy to force the IPv6 OIF=0 test to install a suitable MC route on + # $IPMR to be deterministic. Also used for the IPv6 RX!=TX ping test. + ip_link_add "X$IPMR" up type dummy + + # IPMR + ip_link_add "$IPMR" up type dummy + ip_addr_add "$IPMR" 192.0.2.100/28 + ip_addr_add "$IPMR" 2001:db8:4::1/64 + + # $swp1 + ip_link_set_up $swp1 + ip_link_set_master $swp1 br1 + bridge_vlan_add vid 10 dev $swp1 + bridge_vlan_add vid 20 dev $swp1 + + # $swp2 + ip_link_set_up $swp2 + ip_addr_add $swp2 192.0.2.33/28 + ip_addr_add $swp2 2001:db8:2::1/64 + + # $swp3 + ip_link_set_up $swp3 + ip_addr_add $swp3 192.0.2.65/28 + ip_addr_add $swp3 2001:db8:3::1/64 +} + +vx_create() +{ + local name=$1; shift + local vid=$1; shift + + ip_link_add "$name" up type vxlan dstport "$VXPORT" \ + nolearning noudpcsum tos inherit ttl 16 \ + "$@" + ip_link_set_master "$name" br1 + bridge_vlan_add vid $vid dev "$name" pvid untagged +} +export -f vx_create + +vx_wait() +{ + # Wait for all the ARP, IGMP etc. 
noise to settle down so that the + # tunnel is clear for measurements. + sleep 10 +} + +vx10_create() +{ + vx_create vx10 10 id 1000 "$@" +} +export -f vx10_create + +vx20_create() +{ + vx_create vx20 20 id 2000 "$@" +} +export -f vx20_create + +vx10_create_wait() +{ + vx10_create "$@" + vx_wait +} + +vx20_create_wait() +{ + vx20_create "$@" + vx_wait +} + +ns_init_common() +{ + local ns=$1; shift + local if_in=$1; shift + local ipv4_in=$1; shift + local ipv6_in=$1; shift + local ipv4_host=$1; shift + local ipv6_host=$1; shift + + # v2$h2 / v2$h3 + ip_link_set_up $if_in + ip_addr_add $if_in $ipv4_in + ip_addr_add $if_in $ipv6_in + + # br1 + ip_link_add br1 type bridge vlan_filtering 1 \ + vlan_default_pvid 0 mcast_snooping 0 + ip_link_set_up br1 + + # vx10, vx20 + vx10_create local ${ipv4_in%/*} group $GROUP4 dev $if_in + vx20_create local ${ipv6_in%/*} group $GROUP6 dev $if_in + + # w1 + ip_link_add w1 type veth peer name w2 + ip_link_set_master w1 br1 + ip_link_set_up w1 + bridge_vlan_add vid 10 dev w1 + bridge_vlan_add vid 20 dev w1 + + # w2 + simple_if_init w2 + defer simple_if_fini w2 + + # w2.10 + ip_link_add w2.10 master vw2 link w2 type vlan id 10 + ip_link_set_up w2.10 + ip_addr_add w2.10 $ipv4_host + + # w2.20 + ip_link_add w2.20 master vw2 link w2 type vlan id 20 + ip_link_set_up w2.20 + ip_addr_add w2.20 $ipv6_host +} +export -f ns_init_common + +ns2_create() +{ + # NS2 + ip netns add ns2 + defer ip netns del ns2 + + # v2$h2 + ip link set dev v2$h2 netns ns2 + defer ip -n ns2 link set dev v2$h2 netns 1 + + in_ns ns2 \ + ns_init_common ns2 v2$h2 \ + 192.0.2.34/28 2001:db8:2::2/64 \ + 192.0.2.3/28 2001:db8:1::3/64 +} + +ns3_create() +{ + # NS3 + ip netns add ns3 + defer ip netns del ns3 + + # v2$h3 + ip link set dev v2$h3 netns ns3 + defer ip -n ns3 link set dev v2$h3 netns 1 + + ip -n ns3 link set dev v2$h3 up + + in_ns ns3 \ + ns_init_common ns3 v2$h3 \ + 192.0.2.66/28 2001:db8:3::2/64 \ + 192.0.2.4/28 2001:db8:1::4/64 +} + +setup_prepare() +{ + 
h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + swp3=${NETIFS[p5]} + h3=${NETIFS[p6]} + + vrf_prepare + defer vrf_cleanup + + forwarding_enable + defer forwarding_restore + + ip_link_add v1$h2 type veth peer name v2$h2 + ip_link_add v1$h3 type veth peer name v2$h3 + + h1_create + h2_create + h3_create + switch_create + ns2_create + ns3_create +} + +adf_install_broken_sg() +{ + adf_mcd_start "$IPMR" || exit $EXIT_STATUS + + mc_cli add $swp2 192.0.2.100 $GROUP4 $swp1 $swp3 + defer mc_cli remove $swp2 192.0.2.100 $GROUP4 $swp1 $swp3 + + mc_cli add $swp2 2001:db8:4::1 $GROUP6 $swp1 $swp3 + defer mc_cli remove $swp2 2001:db8:4::1 $GROUP6 $swp1 $swp3 +} + +adf_install_rx() +{ + mc_cli add $swp2 0.0.0.0 $GROUP4 "$IPMR" + defer mc_cli remove $swp2 0.0.0.0 $GROUP4 lo10 + + mc_cli add $swp3 0.0.0.0 $GROUP4 "$IPMR" + defer mc_cli remove $swp3 0.0.0.0 $GROUP4 lo10 + + mc_cli add $swp2 :: $GROUP6 "$IPMR" + defer mc_cli remove $swp2 :: $GROUP6 lo10 + + mc_cli add $swp3 :: $GROUP6 "$IPMR" + defer mc_cli remove $swp3 :: $GROUP6 lo10 +} + +adf_install_sg() +{ + adf_mcd_start "$IPMR" || exit $EXIT_STATUS + + mc_cli add "$IPMR" 192.0.2.100 $GROUP4 $swp2 $swp3 + defer mc_cli remove "$IPMR" 192.0.2.33 $GROUP4 $swp2 $swp3 + + mc_cli add "$IPMR" 2001:db8:4::1 $GROUP6 $swp2 $swp3 + defer mc_cli remove "$IPMR" 2001:db8:4::1 $GROUP6 $swp2 $swp3 + + adf_install_rx +} + +adf_install_sg_sep() +{ + adf_mcd_start lo || exit $EXIT_STATUS + + mc_cli add lo 192.0.2.120 $GROUP4 $swp2 $swp3 + defer mc_cli remove lo 192.0.2.120 $GROUP4 $swp2 $swp3 + + mc_cli add lo 2001:db8:5::1 $GROUP6 $swp2 $swp3 + defer mc_cli remove lo 2001:db8:5::1 $GROUP6 $swp2 $swp3 +} + +adf_install_sg_sep_rx() +{ + local lo=$1; shift + + adf_mcd_start "$IPMR" "$lo" || exit $EXIT_STATUS + + mc_cli add "$lo" 192.0.2.120 $GROUP4 $swp2 $swp3 + defer mc_cli remove "$lo" 192.0.2.120 $GROUP4 $swp2 $swp3 + + mc_cli add "$lo" 2001:db8:5::1 $GROUP6 $swp2 $swp3 + defer mc_cli remove "$lo" 
2001:db8:5::1 $GROUP6 $swp2 $swp3 + + adf_install_rx +} + +adf_install_starg() +{ + adf_mcd_start "$IPMR" || exit $EXIT_STATUS + + mc_cli add "$IPMR" 0.0.0.0 $GROUP4 $swp2 $swp3 + defer mc_cli remove "$IPMR" 0.0.0.0 $GROUP4 $swp2 $swp3 + + mc_cli add "$IPMR" :: $GROUP6 $swp2 $swp3 + defer mc_cli remove "$IPMR" :: $GROUP6 $swp2 $swp3 + + adf_install_rx +} + +do_packets_v4() +{ + local mac=$(mac_get $h2) + + $MZ $h1 -Q 10 -c 10 -d 100msec -p 64 -a own -b $mac \ + -A 192.0.2.1 -B 192.0.2.2 -t udp sp=1234,dp=2345 -q +} + +do_packets_v6() +{ + local mac=$(mac_get $h2) + + $MZ -6 $h1 -Q 20 -c 10 -d 100msec -p 64 -a own -b $mac \ + -A 2001:db8:1::1 -B 2001:db8:1::2 -t udp sp=1234,dp=2345 -q +} + +do_test() +{ + local ipv=$1; shift + local expect_h2=$1; shift + local expect_h3=$1; shift + local what=$1; shift + + local pref=$((100 + ipv)) + + RET=0 + + local t0_h2=$(tc_rule_stats_get $h2 $pref ingress) + local t0_h3=$(tc_rule_stats_get $h3 $pref ingress) + + do_packets_v$ipv + sleep 1 + + local t1_h2=$(tc_rule_stats_get $h2 $pref ingress) + local t1_h3=$(tc_rule_stats_get $h3 $pref ingress) + + local d_h2=$((t1_h2 - t0_h2)) + local d_h3=$((t1_h3 - t0_h3)) + + ((d_h2 == expect_h2)) + check_err $? "Expected $expect_h2 packets on H2, got $d_h2" + + ((d_h3 == expect_h3)) + check_err $? "Expected $expect_h3 packets on H3, got $d_h3" + + log_test "VXLAN MC flood $what" +} + +ipv4_do_test_rx() +{ + local h3_should_fail=$1; shift + local what=$1; shift + + RET=0 + + ping_do $h1.10 192.0.2.3 + check_err $? "H2 should respond" + + ping_do $h1.10 192.0.2.4 + check_err_fail $h3_should_fail $? "H3 responds" + + log_test "VXLAN MC flood $what" +} + +ipv6_do_test_rx() +{ + local h3_should_fail=$1; shift + local what=$1; shift + + RET=0 + + ping6_do $h1.20 2001:db8:1::3 + check_err $? "H2 should respond" + + ping6_do $h1.20 2001:db8:1::4 + check_err_fail $h3_should_fail $? 
"H3 responds" + + log_test "VXLAN MC flood $what" +} + +ipv4_nomcroute() +{ + # Install a misleading (S,G) rule to attempt to trick the system into + # pushing the packets elsewhere. + adf_install_broken_sg + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$swp2" + do_test 4 10 0 "IPv4 nomcroute" +} + +ipv6_nomcroute() +{ + # Like for IPv4, install a misleading (S,G). + adf_install_broken_sg + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$swp2" + do_test 6 10 0 "IPv6 nomcroute" +} + +ipv4_nomcroute_rx() +{ + vx10_create local 192.0.2.100 group $GROUP4 dev "$swp2" + ipv4_do_test_rx 1 "IPv4 nomcroute ping" +} + +ipv6_nomcroute_rx() +{ + vx20_create local 2001:db8:4::1 group $GROUP6 dev "$swp2" + ipv6_do_test_rx 1 "IPv6 nomcroute ping" +} + +ipv4_mcroute() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$IPMR" mcroute + do_test 4 10 10 "IPv4 mcroute" +} + +ipv6_mcroute() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$IPMR" mcroute + do_test 6 10 10 "IPv6 mcroute" +} + +ipv4_mcroute_rx() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$IPMR" mcroute + ipv4_do_test_rx 0 "IPv4 mcroute ping" +} + +ipv6_mcroute_rx() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$IPMR" mcroute + ipv6_do_test_rx 0 "IPv6 mcroute ping" +} + +ipv4_mcroute_changelink() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$IPMR" + ip link set dev vx10 type vxlan mcroute + sleep 1 + do_test 4 10 10 "IPv4 mcroute changelink" +} + +ipv6_mcroute_changelink() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$IPMR" mcroute + ip link set dev vx20 type vxlan mcroute + sleep 1 + do_test 6 10 10 "IPv6 mcroute changelink" +} + +ipv4_mcroute_starg() +{ + adf_install_starg + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$IPMR" mcroute + do_test 4 10 10 "IPv4 mcroute (*,G)" +} + +ipv6_mcroute_starg() +{ + 
adf_install_starg + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$IPMR" mcroute + do_test 6 10 10 "IPv6 mcroute (*,G)" +} + +ipv4_mcroute_starg_rx() +{ + adf_install_starg + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$IPMR" mcroute + ipv4_do_test_rx 0 "IPv4 mcroute (*,G) ping" +} + +ipv6_mcroute_starg_rx() +{ + adf_install_starg + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$IPMR" mcroute + ipv6_do_test_rx 0 "IPv6 mcroute (*,G) ping" +} + +ipv4_mcroute_noroute() +{ + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$IPMR" mcroute + do_test 4 0 0 "IPv4 mcroute, no route" +} + +ipv6_mcroute_noroute() +{ + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$IPMR" mcroute + do_test 6 0 0 "IPv6 mcroute, no route" +} + +ipv4_mcroute_fdb() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 dev "$IPMR" mcroute + bridge fdb add dev vx10 \ + 00:00:00:00:00:00 self static dst $GROUP4 via "$IPMR" + do_test 4 10 10 "IPv4 mcroute FDB" +} + +ipv6_mcroute_fdb() +{ + adf_install_sg + vx20_create_wait local 2001:db8:4::1 dev "$IPMR" mcroute + bridge -6 fdb add dev vx20 \ + 00:00:00:00:00:00 self static dst $GROUP6 via "$IPMR" + do_test 6 10 10 "IPv6 mcroute FDB" +} + +# Use FDB to configure VXLAN in a way where oif=0 for purposes of FIB lookup. +ipv4_mcroute_fdb_oif0() +{ + adf_install_sg + vx10_create_wait local 192.0.2.100 group $GROUP4 dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add dev vx10 00:00:00:00:00:00 self static dst $GROUP4 + do_test 4 10 10 "IPv4 mcroute oif=0" +} + +ipv6_mcroute_fdb_oif0() +{ + # The IPv6 tunnel lookup does not fall back to selection by source + # address. Instead it just does a FIB match, and that would find one of + # the several ff00::/8 multicast routes -- each device has one. In order + # to reliably force the $IPMR device, add a /128 route for the + # destination group address. 
+ ip -6 route add table local multicast $GROUP6/128 dev "$IPMR" + defer ip -6 route del table local multicast $GROUP6/128 dev "$IPMR" + + adf_install_sg + vx20_create_wait local 2001:db8:4::1 group $GROUP6 dev "$IPMR" mcroute + bridge -6 fdb del dev vx20 00:00:00:00:00:00 + bridge -6 fdb add dev vx20 00:00:00:00:00:00 self static dst $GROUP6 + do_test 6 10 10 "IPv6 mcroute oif=0" +} + +# In oif=0 test as above, have FIB lookup resolve to loopback instead of IPMR. +# This doesn't work with IPv6 -- a MC route on lo would be marked as RTF_REJECT. +ipv4_mcroute_fdb_oif0_sep() +{ + adf_install_sg_sep + + ip_addr_add lo 192.0.2.120/28 + vx10_create_wait local 192.0.2.120 group $GROUP4 dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add dev vx10 00:00:00:00:00:00 self static dst $GROUP4 + do_test 4 10 10 "IPv4 mcroute TX!=RX oif=0" +} + +ipv4_mcroute_fdb_oif0_sep_rx() +{ + adf_install_sg_sep_rx lo + + ip_addr_add lo 192.0.2.120/28 + vx10_create_wait local 192.0.2.120 group $GROUP4 dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add dev vx10 00:00:00:00:00:00 self static dst $GROUP4 + ipv4_do_test_rx 0 "IPv4 mcroute TX!=RX oif=0 ping" +} + +ipv4_mcroute_fdb_sep_rx() +{ + adf_install_sg_sep_rx lo + + ip_addr_add lo 192.0.2.120/28 + vx10_create_wait local 192.0.2.120 group $GROUP4 dev "$IPMR" mcroute + bridge fdb del dev vx10 00:00:00:00:00:00 + bridge fdb add dev vx10 00:00:00:00:00:00 self static dst $GROUP4 via lo + ipv4_do_test_rx 0 "IPv4 mcroute TX!=RX ping" +} + +ipv6_mcroute_fdb_sep_rx() +{ + adf_install_sg_sep_rx "X$IPMR" + + ip_addr_add "X$IPMR" 2001:db8:5::1/64 + vx20_create_wait local 2001:db8:5::1 group $GROUP6 dev "$IPMR" mcroute + bridge -6 fdb del dev vx20 00:00:00:00:00:00 + bridge -6 fdb add dev vx20 00:00:00:00:00:00 \ + self static dst $GROUP6 via "X$IPMR" + ipv6_do_test_rx 0 "IPv6 mcroute TX!=RX ping" +} + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS -- 2.49.0

7 months

2
2
0 0

[PATCH v10 0/6] rust: reduce `as` casts, enable related lints

by Tamir Duberstein

This started with a patch that enabled `clippy::ptr_as_ptr`. Benno Lossin suggested I also look into `clippy::ptr_cast_constness` and I discovered `clippy::as_ptr_cast_mut`. This series now enables all 3 lints. It also enables `clippy::as_underscore` which ensures other pointer casts weren't missed. As a later addition, `clippy::cast_lossless` and `clippy::ref_as_ptr` are also enabled. This series depends on "rust: retain pointer mut-ness in `container_of!`"[1]. Link: https://lore.kernel.org/all/20250409-container-of-mutness-v1-1-64f472b94534… [1] Signed-off-by: Tamir Duberstein <tamird(a)gmail.com> --- Changes in v10: - Move fragment from "rust: enable `clippy::ptr_cast_constness` lint" to "rust: enable `clippy::ptr_as_ptr` lint". (Boqun Feng) - Replace `(...).into()` with `T::from(...)` where the destination type isn't obvious in "rust: enable `clippy::cast_lossless` lint". (Boqun Feng) - Link to v9: https://lore.kernel.org/r/20250416-ptr-as-ptr-v9-0-18ec29b1b1f3@gmail.com Changes in v9: - Replace ref-to-ptr coercion using `let` bindings with `core::ptr::from_{ref,mut}`. (Boqun Feng). - Link to v8: https://lore.kernel.org/r/20250409-ptr-as-ptr-v8-0-3738061534ef@gmail.com Changes in v8: - Use coercion to go ref -> ptr. - rustfmt. - Rebase on v6.15-rc1. - Extract first commit to its own series as it is shared with other series. - Link to v7: https://lore.kernel.org/r/20250325-ptr-as-ptr-v7-0-87ab452147b9@gmail.com Changes in v7: - Add patch to enable `clippy::ref_as_ptr`. - Link to v6: https://lore.kernel.org/r/20250324-ptr-as-ptr-v6-0-49d1b7fd4290@gmail.com Changes in v6: - Drop strict provenance patch. - Fix URLs in doc comments. - Add patch to enable `clippy::cast_lossless`. - Rebase on rust-next. - Link to v5: https://lore.kernel.org/r/20250317-ptr-as-ptr-v5-0-5b5f21fa230a@gmail.com Changes in v5: - Use `pointer::addr` in OF. (Boqun Feng) - Add documentation on stubs. (Benno Lossin) - Mark stubs `#[inline]`. 
- Pick up Alice's RB on a shared commit from https://lore.kernel.org/all/Z9f-3Aj3_FWBZRrm@google.com/. - Link to v4: https://lore.kernel.org/r/20250315-ptr-as-ptr-v4-0-b2d72c14dc26@gmail.com Changes in v4: - Add missing SoB. (Benno Lossin) - Use `without_provenance_mut` in alloc. (Boqun Feng) - Limit strict provenance lints to the `kernel` crate to avoid complex logic in the build system. This can be revisited on MSRV >= 1.84.0. - Rebase on rust-next. - Link to v3: https://lore.kernel.org/r/20250314-ptr-as-ptr-v3-0-e7ba61048f4a@gmail.com Changes in v3: - Fixed clippy warning in rust/kernel/firmware.rs. (kernel test robot) Link: https://lore.kernel.org/all/202503120332.YTCpFEvv-lkp@intel.com/ - s/as u64/as bindings::phys_addr_t/g. (Benno Lossin) - Use strict provenance APIs and enable lints. (Benno Lossin) - Link to v2: https://lore.kernel.org/r/20250309-ptr-as-ptr-v2-0-25d60ad922b7@gmail.com Changes in v2: - Fixed typo in first commit message. - Added additional patches, converted to series. 
- Link to v1: https://lore.kernel.org/r/20250307-ptr-as-ptr-v1-1-582d06514c98@gmail.com --- Tamir Duberstein (6): rust: enable `clippy::ptr_as_ptr` lint rust: enable `clippy::ptr_cast_constness` lint rust: enable `clippy::as_ptr_cast_mut` lint rust: enable `clippy::as_underscore` lint rust: enable `clippy::cast_lossless` lint rust: enable `clippy::ref_as_ptr` lint Makefile | 6 ++++++ drivers/gpu/drm/drm_panic_qr.rs | 2 +- rust/bindings/lib.rs | 3 +++ rust/kernel/alloc/allocator_test.rs | 2 +- rust/kernel/alloc/kvec.rs | 4 ++-- rust/kernel/block/mq/operations.rs | 2 +- rust/kernel/block/mq/request.rs | 6 +++--- rust/kernel/device.rs | 4 ++-- rust/kernel/device_id.rs | 4 ++-- rust/kernel/devres.rs | 19 ++++++++++--------- rust/kernel/dma.rs | 6 +++--- rust/kernel/error.rs | 2 +- rust/kernel/firmware.rs | 3 ++- rust/kernel/fs/file.rs | 2 +- rust/kernel/io.rs | 18 +++++++++--------- rust/kernel/kunit.rs | 11 +++++++---- rust/kernel/list/impl_list_item_mod.rs | 2 +- rust/kernel/miscdevice.rs | 2 +- rust/kernel/net/phy.rs | 4 ++-- rust/kernel/of.rs | 6 +++--- rust/kernel/pci.rs | 11 +++++++---- rust/kernel/platform.rs | 4 +++- rust/kernel/print.rs | 6 +++--- rust/kernel/seq_file.rs | 2 +- rust/kernel/str.rs | 14 +++++++------- rust/kernel/sync/poll.rs | 2 +- rust/kernel/time/hrtimer/pin.rs | 2 +- rust/kernel/time/hrtimer/pin_mut.rs | 2 +- rust/kernel/uaccess.rs | 4 ++-- rust/kernel/workqueue.rs | 12 ++++++------ rust/uapi/lib.rs | 3 +++ 31 files changed, 96 insertions(+), 74 deletions(-) --- base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8 change-id: 20250307-ptr-as-ptr-21b1867fc4d4 prerequisite-change-id: 20250409-container-of-mutness-b153dab4388d:v1 prerequisite-patch-id: 53d5889db599267f87642bb0ae3063c29bc24863 Best regards, -- Tamir Duberstein <tamird(a)gmail.com>

7 months

3
13
0 0

[PATCH net-next v2 0/4] udp_tunnel: remove rtnl_lock dependency

by Stanislav Fomichev

Recently bnxt had to grow back a bunch of rtnl dependencies because of udp_tunnel's infra. Add a separate (global) mutex to protect udp_tunnel state. v2: - move the lock into udp_tunnel_nic (Jakub) - reorder the lock ordering (Jakub) - move udp_ports_sleep removal into separate patch and update the test (Jakub) Cc: Michael Chan <michael.chan(a)broadcom.com> Stanislav Fomichev (4): udp_tunnel: remove rtnl_lock dependency net: remove redundant ASSERT_RTNL() in queue setup functions netdevsim: remove udp_ports_sleep Revert "bnxt_en: bring back rtnl_lock() in the bnxt_open() path" .../net/ethernet/broadcom/bnx2x/bnx2x_main.c | 3 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 42 ++++--------------- drivers/net/ethernet/emulex/benet/be_main.c | 3 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 1 - drivers/net/ethernet/intel/ice/ice_main.c | 1 - .../net/ethernet/mellanox/mlx4/en_netdev.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 3 +- .../ethernet/netronome/nfp/nfp_net_common.c | 3 +- .../net/ethernet/qlogic/qede/qede_filter.c | 3 -- .../net/ethernet/qlogic/qlcnic/qlcnic_main.c | 1 - drivers/net/ethernet/sfc/ef10.c | 1 - drivers/net/netdevsim/netdevsim.h | 2 - drivers/net/netdevsim/udp_tunnels.c | 12 ------ include/net/udp_tunnel.h | 8 ++-- net/core/dev.c | 2 - net/ipv4/udp_tunnel_nic.c | 30 +++++++------ .../drivers/net/netdevsim/udp_tunnel_nic.sh | 10 ----- 17 files changed, 31 insertions(+), 97 deletions(-) -- 2.49.0

7 months

4
9
0 0

[PATCH v3 0/4] KVM: arm64: selftests: arch_timer_edge_cases fixes

by Sebastian Ott

Some small fixes for arch_timer_edge_cases that I stumbled upon while debugging failures for this selftest on ampere-one. Changes since v1: * determine effective counter width based on suggestions from Marc Changes since v2: * new patch to fix xval initialization I've done tests with this on various machines - no issues during several hundreds of test runs. v1: https://lore.kernel.org/kvmarm/20250509143312.34224-1-sebott@redhat.com/ v2: https://lore.kernel.org/kvmarm/20250527142434.25209-1-sebott@redhat.com/ Sebastian Ott (4): KVM: arm64: selftests: fix help text for arch_timer_edge_cases KVM: arm64: selftests: fix thread migration in arch_timer_edge_cases KVM: arm64: selftests: arch_timer_edge_cases - fix xval init KVM: arm64: selftests: arch_timer_edge_cases - determine effective counter width .../kvm/arm64/arch_timer_edge_cases.c | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) base-commit: 0ff41df1cb268fc69e703a08a57ee14ae967d0ca -- 2.49.0

7 months

3
7
0 0

[PATCH v17 net-next 0/5] DUALPI2 patch

by chia-yu.chang＠nokia-bell-labs.com

From: Chia-Yu Chang <chia-yu.chang(a)nokia-bell-labs.com> Hello, Please find the DualPI2 patch v17. This patch series adds DualPI Improved with a Square (DualPI2) with following features: * Supports congestion controls that comply with the Prague requirements in RFC9331 (e.g. TCP-Prague) * Coupled dual-queue that separates the L4S traffic in a low latency queue (L-queue), without harming remaining traffic that is scheduled in classic queue (C-queue) due to congestion-coupling using PI2 as defined in RFC9332 * Configurable overload strategies * Use of sojourn time to reliably estimate queue delay * Supports ECN L4S-identifier (IP.ECN==0b*1) to classify traffic into respective queues For more details of DualPI2, please refer to IETF RFC9332 (https://datatracker.ietf.org/doc/html/rfc9332). Best regards, Chia-Yu --- v17 (25-May-2025, Resent at 10-Jun-2025) - Replace 0xffffffff with U32_MAX (Paolo Abeni <pabeni(a)redhat.com>) - Use helper function qdisc_dequeue_internal() and add new helper function skb_apply_step() (Paolo Abeni <pabeni(a)redhat.com>) - Add s64 casting when calculating the delta of the PI controller (Paolo Abeni <pabeni(a)redhat.com>) - Change the drop reason into SKB_DROP_REASON_QDISC_CONGESTED for drop_early (Paolo Abeni <pabeni(a)redhat.com>) - Modify the condition to remove the original skb when enqueuing multiple GSO segments (Paolo Abeni <pabeni(a)redhat.com>) - Add READ_ONCE() in dualpi2_dump_stat() (Paolo Abeni <pabeni(a)redhat.com>) - Add comments, brackets, and brackets for readability (Paolo Abeni <pabeni(a)redhat.com>) v16 (16-May-2025) - Add qdisc_lock() to dualpi2_timer() in dualpi2_timer (Paolo Abeni <pabeni(a)redhat.com>) - Introduce convert_ns_to_usec() to convert usec to nsec without overflow in #1 (Paolo Abeni <pabeni(a)redhat.com>) - Update convert_us_tonsec() to convert nsec to usec without overflow in #2 (Paolo Abeni <pabeni(a)redhat.com>) - Add more descriptions with respect to DualPI2 in the cover letter and add changelog in each 
patch (Paolo Abeni <pabeni(a)redhat.com>) v15 (09-May-2025) - Add enum of TCA_DUALPI2_ECN_MASK_CLA_ECT to remove potential leakage in #1 (Simon Horman <horms(a)kernel.org>) - Fix one typo in comment of #2 - Update tc.yaml in #5 to align with the updated enum of pkt_sched.h v14 (05-May-2025) - Modify tc.yaml: (1) Replace flags with enum and remove enum-as-flags, (2) Remove credit-queue in xstats, and (3) Change attribute types (Donald Hunter <donald.hunter(a)gmail.com>) - Add enum and fix the ordering of variables in pkt_sched.h to align with the modified tc.yaml (Donald Hunter <donald.hunter(a)gmail.com>) - Add validators for DROP_OVERLOAD, DROP_EARLY, ECN_MASK, and SPLIT_GSO in sch_dualpi2.c (Donald Hunter <donald.hunter(a)gmail.com>) - Update dualpi2.json to align with the updated variable order in pkt_sched.h - Reorder patches (Donald Hunter <donald.hunter(a)gmail.com>) v13 (26-Apr-2025) - Use dashes in member names to follow YNL conventions in tc.yaml (Donald Hunter <donald.hunter(a)gmail.com>) - Define enumerations separately for flags of drop-early, drop-overload, ecn-mask, credit-queue in tc.yaml (Donald Hunter <donald.hunter(a)gmail.com>) - Change the types of split-gso and step-packets into flag in tc.yaml (Donald Hunter <donald.hunter(a)gmail.com>) - Revert to u32/u8 types for tc-dualpi2-xstats members in tc.yaml (Donald Hunter <donald.hunter(a)gmail.com>) - Add new test cases in tc-tests/qdiscs/dualpi2.json to cover all dualpi2 parameters (Donald Hunter <donald.hunter(a)gmail.com>) - Change the type of TCA_DUALPI2_STEP_PACKETS into NLA_FLAG (Donald Hunter <donald.hunter(a)gmail.com>) v12 (22-Apr-2025) - Remove anonymous struct in sch_dualpi2.c (Paolo Abeni <pabeni(a)redhat.com>) - Replace u32/u8 with uint and s32 with int in tc spec document (Paolo Abeni <pabeni(a)redhat.com>) - Introduce get_memory_limit function to handle potential overflow when multiplying limit with MTU (Paolo Abeni <pabeni(a)redhat.com>) - Double the packet length to further include packet overhead 
in memory_limit (Paolo Abeni <pabeni(a)redhat.com>) - Remove the check of qdisc_qlen(sch) when calling qdisc_tree_reduce_backlog (Paolo Abeni <pabeni(a)redhat.com>) v11 (15-Apr-2025) - Replace hstimer_init with hstimer_setup in sch_dualpi2.c v10 (25-Mar-2025) - Remove leftover include in include/linux/netdevice.h and anonymous struct in sch_dualpi2.c (Paolo Abeni <pabeni(a)redhat.com>) - Use kfree_skb_reason() and add SKB_DROP_REASON_DUALPI2_STEP_DROP drop reason (Paolo Abeni <pabeni(a)redhat.com>) - Split sch_dualpi2.c into 3 patches (and overall 5 patches): Struct definition & parsing, Dump stats & configuration, Enqueue/Dequeue (Paolo Abeni <pabeni(a)redhat.com>) v9 (16-Mar-2025) - Fix mem_usage error in previous version - Add min_qlen_step to the dualpi2 attribute as the minimum queue length in number of packets in the L-queue to start step threshold marking. In previous versions, this value was fixed to 2, so the step threshold was applied to mark packets in the L queue only when the queue length of the L queue was greater than or equal to 2 packets. This will cause larger queuing delays for L4S traffic at low rates (<20Mbps). So we parameterize it and change the default value to 0. 
Comparison of tcp_1down run 'HTB 20Mbit + DUALPI2 + 10ms base delay' Old versions: avg median # data pts Ping (ms) ICMP : 11.55 11.70 ms 350 TCP upload avg : 18.96 N/A Mbits/s 350 TCP upload sum : 18.96 N/A Mbits/s 350 New version (v9): avg median # data pts Ping (ms) ICMP : 10.81 10.70 ms 350 TCP upload avg : 18.91 N/A Mbits/s 350 TCP upload sum : 18.91 N/A Mbits/s 350 Comparison of tcp_1down run 'HTB 10Mbit + DUALPI2 + 10ms base delay' Old versions: avg median # data pts Ping (ms) ICMP : 12.61 12.80 ms 350 TCP upload avg : 9.48 N/A Mbits/s 350 TCP upload sum : 9.48 N/A Mbits/s 350 New version (v9): avg median # data pts Ping (ms) ICMP : 11.06 10.80 ms 350 TCP upload avg : 9.43 N/A Mbits/s 350 TCP upload sum : 9.43 N/A Mbits/s 350 Comparison of tcp_1down run 'HTB 10Mbit + DUALPI2 + 10ms base delay' Old versions: avg median # data pts Ping (ms) ICMP : 40.86 37.45 ms 350 TCP upload avg : 0.88 N/A Mbits/s 350 TCP upload sum : 0.88 N/A Mbits/s 350 TCP upload::1 : 0.88 0.97 Mbits/s 350 New version (v9): avg median # data pts Ping (ms) ICMP : 11.07 10.40 ms 350 TCP upload avg : 0.55 N/A Mbits/s 350 TCP upload sum : 0.55 N/A Mbits/s 350 TCP upload::1 : 0.55 0.59 Mbits/s 350 v8 (11-Mar-2025) - Fix warning messages in v7 v7 (07-Mar-2025) - Separate into 3 patches to avoid mixing changes of documentation, selftest, and code. 
(Cong Wang <xiyou.wangcong(a)gmail.com>) v6 (04-Mar-2025) - Add modprobe for dulapi2 in tc-testing script tc-testing/tdc.sh (Jakub Kicinski <kuba(a)kernel.org>) - Update test cases in dualpi2.json - Update commit message v5 (22-Feb-2025) - A comparison was done between MQ + DUALPI2, MQ + FQ_PIE, MQ + FQ_CODEL: Unshaped 1gigE with 4 download streams test: - Summary of tcp_4down run 'MQ + FQ_CODEL': avg median # data pts Ping (ms) ICMP : 1.19 1.34 ms 349 TCP download avg : 235.42 N/A Mbits/s 349 TCP download sum : 941.68 N/A Mbits/s 349 TCP download::1 : 235.19 235.39 Mbits/s 349 TCP download::2 : 235.03 235.35 Mbits/s 349 TCP download::3 : 236.89 235.44 Mbits/s 349 TCP download::4 : 234.57 235.19 Mbits/s 349 - Summary of tcp_4down run 'MQ + FQ_PIE' avg median # data pts Ping (ms) ICMP : 1.21 1.37 ms 350 TCP download avg : 235.42 N/A Mbits/s 350 TCP download sum : 941.61 N/A Mbits/s 350 TCP download::1 : 232.54 233.13 Mbits/s 350 TCP download::2 : 232.52 232.80 Mbits/s 350 TCP download::3 : 233.14 233.78 Mbits/s 350 TCP download::4 : 243.41 241.48 Mbits/s 350 - Summary of tcp_4down run 'MQ + DUALPI2' avg median # data pts Ping (ms) ICMP : 1.19 1.34 ms 349 TCP download avg : 235.42 N/A Mbits/s 349 TCP download sum : 941.68 N/A Mbits/s 349 TCP download::1 : 235.19 235.39 Mbits/s 349 TCP download::2 : 235.03 235.35 Mbits/s 349 TCP download::3 : 236.89 235.44 Mbits/s 349 TCP download::4 : 234.57 235.19 Mbits/s 349 Unshaped 1gigE with 128 download streams test: - Summary of tcp_128down run 'MQ + FQ_CODEL': avg median # data pts Ping (ms) ICMP : 1.88 1.86 ms 350 TCP download avg : 7.39 N/A Mbits/s 350 TCP download sum : 946.47 N/A Mbits/s 350 - Summary of tcp_128down run 'MQ + FQ_PIE': avg median # data pts Ping (ms) ICMP : 1.88 1.86 ms 350 TCP download avg : 7.39 N/A Mbits/s 350 TCP download sum : 946.47 N/A Mbits/s 350 - Summary of tcp_128down run 'MQ + DUALPI2': avg median # data pts Ping (ms) ICMP : 1.88 1.86 ms 350 TCP download avg : 7.39 N/A Mbits/s 350 TCP download 
sum : 946.47 N/A Mbits/s 350 Unshaped 10gigE with 4 download streams test: - Summary of tcp_4down run 'MQ + FQ_CODEL': avg median # data pts Ping (ms) ICMP : 0.22 0.23 ms 350 TCP download avg : 2354.08 N/A Mbits/s 350 TCP download sum : 9416.31 N/A Mbits/s 350 TCP download::1 : 2353.65 2352.81 Mbits/s 350 TCP download::2 : 2354.54 2354.21 Mbits/s 350 TCP download::3 : 2353.56 2353.78 Mbits/s 350 TCP download::4 : 2354.56 2354.45 Mbits/s 350 - Summary of tcp_4down run 'MQ + FQ_PIE': avg median # data pts Ping (ms) ICMP : 0.20 0.19 ms 350 TCP download avg : 2354.76 N/A Mbits/s 350 TCP download sum : 9419.04 N/A Mbits/s 350 TCP download::1 : 2354.77 2353.89 Mbits/s 350 TCP download::2 : 2353.41 2354.29 Mbits/s 350 TCP download::3 : 2356.18 2354.19 Mbits/s 350 TCP download::4 : 2354.68 2353.15 Mbits/s 350 - Summary of tcp_4down run 'MQ + DUALPI2': avg median # data pts Ping (ms) ICMP : 0.24 0.24 ms 350 TCP download avg : 2354.11 N/A Mbits/s 350 TCP download sum : 9416.43 N/A Mbits/s 350 TCP download::1 : 2354.75 2353.93 Mbits/s 350 TCP download::2 : 2353.15 2353.75 Mbits/s 350 TCP download::3 : 2353.49 2353.72 Mbits/s 350 TCP download::4 : 2355.04 2353.73 Mbits/s 350 Unshaped 10gigE with 128 download streams test: - Summary of tcp_128down run 'MQ + FQ_CODEL': avg median # data pts Ping (ms) ICMP : 7.57 8.69 ms 350 TCP download avg : 73.97 N/A Mbits/s 350 TCP download sum : 9467.82 N/A Mbits/s 350 - Summary of tcp_128down run 'MQ + FQ_PIE': avg median # data pts Ping (ms) ICMP : 7.82 8.91 ms 350 TCP download avg : 73.97 N/A Mbits/s 350 TCP download sum : 9468.42 N/A Mbits/s 350 - Summary of tcp_128down run 'MQ + DUALPI2': avg median # data pts Ping (ms) ICMP : 6.87 7.93 ms 350 TCP download avg : 73.95 N/A Mbits/s 350 TCP download sum : 9465.87 N/A Mbits/s 350 From the results shown above, we see small differences between combinations. 
- Update commit message to include results of no_split_gso and split_gso (Dave Taht <dave.taht(a)gmail.com> and Paolo Abeni <pabeni(a)redhat.com>) - Add memlimit in the dualpi2 attribute, and add memory_used, max_memory_used, memory_limit in dualpi2 stats (Dave Taht <dave.taht(a)gmail.com>) - Update note in sch_dualpi2.c related to BBRv3 status (Dave Taht <dave.taht(a)gmail.com>) - Update license identifier (Dave Taht <dave.taht(a)gmail.com>) - Add selftest in tools/testing/selftests/tc-testing (Cong Wang <xiyou.wangcong(a)gmail.com>) - Use netlink policies for parameter checks (Jamal Hadi Salim <jhs(a)mojatatu.com>) - Modify texts & fix typos in Documentation/netlink/specs/tc.yaml (Dave Taht <dave.taht(a)gmail.com>) - Add descriptions of packet counter statistics and the reset function of sch_dualpi2.c - Fix step_thresh in packets - Update code comments in sch_dualpi2.c v4 (22-Oct-2024) - Update statement in Kconfig for DualPI2 (Stephen Hemminger <stephen(a)networkplumber.org>) - Put a blank line after #define in sch_dualpi2.c (Stephen Hemminger <stephen(a)networkplumber.org>) - Fix line length warning. 
v3 (19-Oct-2024) - Fix compilation error - Update Documentation/netlink/specs/tc.yaml (Jakub Kicinski <kuba(a)kernel.org>) v2 (18-Oct-2024) - Add Documentation/netlink/specs/tc.yaml (Jakub Kicinski <kuba(a)kernel.org>) - Use dualpi2 instead of skb prefix (Jamal Hadi Salim <jhs(a)mojatatu.com>) - Replace nla_parse_nested_deprecated with nla_parse_nested (Jamal Hadi Salim <jhs(a)mojatatu.com>) - Fix line length warning --- Chia-Yu Chang (4): sched: Struct definition and parsing of dualpi2 qdisc sched: Dump configuration and statistics of dualpi2 qdisc selftests/tc-testing: Add selftests for qdisc DualPI2 Documentation: netlink: specs: tc: Add DualPI2 specification Koen De Schepper (1): sched: Add enqueue/dequeue of dualpi2 qdisc Documentation/netlink/specs/tc.yaml | 156 +++ include/net/dropreason-core.h | 6 + include/uapi/linux/pkt_sched.h | 68 + net/sched/Kconfig | 12 + net/sched/Makefile | 1 + net/sched/sch_dualpi2.c | 1146 +++++++++++++++++ tools/testing/selftests/tc-testing/config | 1 + .../tc-testing/tc-tests/qdiscs/dualpi2.json | 254 ++++ tools/testing/selftests/tc-testing/tdc.sh | 1 + 9 files changed, 1645 insertions(+) create mode 100644 net/sched/sch_dualpi2.c create mode 100644 tools/testing/selftests/tc-testing/tc-tests/qdiscs/dualpi2.json -- 2.34.1

7 months

1
5
0 0

[PATCH bpf-next v4 0/9] bpf: Mitigate Spectre v1 using barriers

by Luis Gerhorst

This improves the expressiveness of unprivileged BPF by inserting speculation barriers instead of rejecting the programs. The approach was previously presented at LPC'24 [1] and RAID'24 [2]. To mitigate the Spectre v1 (PHT) vulnerability, the kernel rejects potentially-dangerous unprivileged BPF programs as of commit 9183671af6db ("bpf: Fix leakage under speculation on mispredicted branches"). In [2], we have analyzed 364 object files from open source projects (Linux Samples and Selftests, BCC, Loxilb, Cilium, libbpf Examples, Parca, and Prevail) and found that this affects 31% to 54% of programs. To resolve this in the majority of cases this patchset adds a fall-back for mitigating Spectre v1 using speculation barriers. The kernel still optimistically attempts to verify all speculative paths but uses speculation barriers against v1 when unsafe behavior is detected. This allows for more programs to be accepted without disabling the BPF Spectre mitigations (e.g., by setting cpu_mitigations_off()). For this, it relies on the fact that speculation barriers generally prevent all later instructions from executing if the speculation was not correct (not only loads). See patch 7 ("bpf: Fall back to nospec for Spectre v1") for a detailed description and references to the relevant vendor documentation (AMD and Intel x86-64, ARM64, and PowerPC). In [1] we have measured the overhead of this approach relative to having mitigations off and including the upstream Spectre v4 mitigations. For event tracing and stack-sampling profilers, we found that mitigations increase BPF program execution time by 0% to 62%. For the Loxilb network load balancer, we have measured a 14% slowdown in SCTP performance but no significant slowdown for TCP. This overhead only applies to programs that were previously rejected. I reran the expressiveness-evaluation with v6.14 and made sure the main results still match those from [1] and [2] (which used v6.5). 
Main design decisions are: * Do not use separate bytecode insns for v1 and v4 barriers (inspired by Daniel Borkmann's question at LPC). This simplifies the verifier significantly and has the only downside that performance on PowerPC is not as high as it could be. * Allow archs to still disable v1/v4 mitigations separately by setting bpf_jit_bypass_spec_v1/v4(). This has the benefit that archs can benefit from improved BPF expressiveness / performance if they are not vulnerable (e.g., ARM64 for v4 in the kernel). * Do not remove the empty BPF_NOSPEC implementation for backends for which it is unknown whether they are vulnerable to Spectre v1. [1] https://lpc.events/event/18/contributions/1954/ ("Mitigating Spectre-PHT using Speculation Barriers in Linux eBPF") [2] https://arxiv.org/pdf/2405.00078 ("VeriFence: Lightweight and Precise Spectre Defenses for Untrusted Linux Kernel Extensions") Changes: * v3 -> v4: - Remove insn parameter from do_check_insn() and extract process_bpf_exit_full as a function as requested by Eduard - Investigate apparent sanitize_check_bounds() bug reported by Kartikeya (does appear to not be a bug but only confusing code), sent separate patch to document it and add an assert - Remove already-merged commit 1 ("selftests/bpf: Fix caps for __xlated/jited_unpriv") - Drop former commit 10 ("bpf: Allow nospec-protected var-offset stack access") as it did not include a test and there are other places where var-off is rejected. Also, none of the tested real-world programs used var-off in the paper. Therefore keep the old behavior for now and potentially prepare a patch that converts all cases later if required. 
- Add link to AMD lfence and PowerPC speculation barrier (ori 31,31,0) documentation - Move detailed barrier documentation to commit 7 ("bpf: Fall back to nospec for Spectre v1") - Link to v3: https://lore.kernel.org/all/20250501073603.1402960-1-luis.gerhorst@fau.de/ * v2 -> v3: - Fix https://lore.kernel.org/oe-kbuild-all/202504212030.IF1SLhz6-lkp@intel.com/ and similar by moving the bpf_jit_bypass_spec_v1/v4() prototypes out of the #ifdef CONFIG_BPF_SYSCALL. Decided not to move them to filter.h (where similar bpf_jit_*() prototypes live) as they would still have to be duplicated in bpf.h to be usable to bpf_bypass_spec_v1/v4() (unless including filter.h in bpf.h is an option). - Fix https://lore.kernel.org/oe-kbuild-all/202504220035.SoGveGpj-lkp@intel.com/ by moving the variable declarations out of the switch-case. - Build touched C files with W=2 and bpf config on x86 to check that there are no other warnings introduced. - Found 3 more checkpatch warnings that can be fixed without degrading readability. - Rebase to bpf-next 2025-05-01 - Link to v2: https://lore.kernel.org/bpf/20250421091802.3234859-1-luis.gerhorst@fau.de/ * v1 -> v2: - Drop former commits 9 ("bpf: Return PTR_ERR from push_stack()") and 11 ("bpf: Fall back to nospec for spec path verification") as suggested by Alexei. This series therefore no longer changes push_stack() to return PTR_ERR. - Add detailed explanation of how lfence works internally and how it affects the algorithm. - Add tests checking that nospec instructions are inserted in expected locations using __xlated_unpriv as suggested by Eduard (also, include a fix for __xlated_unpriv) - Add a test for the mitigations from the description of commit 9183671af6db ("bpf: Fix leakage under speculation on mispredicted branches") - Remove unused variables from do_check[_insn]() as suggested by Eduard. - Remove INSN_IDX_MODIFIED to improve readability as suggested by Eduard. 
This also causes the nospec_result-check to run (and fail) for jumping-ops. Add a warning to assert that this check must never succeed in that case. - Add details on the safety of patch 10 ("bpf: Allow nospec-protected var-offset stack access") based on the feedback on v1. - Rebase to bpf-next-250420 - Link to v1: https://lore.kernel.org/all/20250313172127.1098195-1-luis.gerhorst@fau.de/ * RFC -> v1: - rebase to bpf-next-250313 - tests: mark expected successes/new errors - add bpf_jit_bypass_spec_v1/v4() to avoid #ifdef in bpf_bypass_spec_v1/v4() - ensure that nospec with v1-support is implemented for archs for which GCC supports speculation barriers, except for MIPS - arm64: emit speculation barrier - powerpc: change nospec to include v1 barrier - discuss potential security (archs that do not impl. BPF nospec) and performance (only PowerPC) regressions - Link to RFC: https://lore.kernel.org/bpf/20250224203619.594724-1-luis.gerhorst@fau.de/ Luis Gerhorst (9): bpf: Move insn if/else into do_check_insn() bpf: Return -EFAULT on misconfigurations bpf: Return -EFAULT on internal errors bpf, arm64, powerpc: Add bpf_jit_bypass_spec_v1/v4() bpf, arm64, powerpc: Change nospec to include v1 barrier bpf: Rename sanitize_stack_spill to nospec_result bpf: Fall back to nospec for Spectre v1 selftests/bpf: Add test for Spectre v1 mitigation bpf: Fall back to nospec for sanitization-failures arch/arm64/net/bpf_jit.h | 5 + arch/arm64/net/bpf_jit_comp.c | 28 +- arch/powerpc/net/bpf_jit_comp64.c | 80 ++- include/linux/bpf.h | 11 +- include/linux/bpf_verifier.h | 3 +- include/linux/filter.h | 2 +- kernel/bpf/core.c | 32 +- kernel/bpf/verifier.c | 633 ++++++++++-------- tools/testing/selftests/bpf/progs/bpf_misc.h | 4 + .../selftests/bpf/progs/verifier_and.c | 8 +- .../selftests/bpf/progs/verifier_bounds.c | 66 +- .../bpf/progs/verifier_bounds_deduction.c | 45 +- .../selftests/bpf/progs/verifier_map_ptr.c | 20 +- .../selftests/bpf/progs/verifier_movsx.c | 16 +- 
.../selftests/bpf/progs/verifier_unpriv.c | 65 +- .../bpf/progs/verifier_value_ptr_arith.c | 101 ++- .../selftests/bpf/verifier/dead_code.c | 3 +- tools/testing/selftests/bpf/verifier/jmp32.c | 33 +- tools/testing/selftests/bpf/verifier/jset.c | 10 +- 19 files changed, 755 insertions(+), 410 deletions(-) base-commit: cd2e103d57e5615f9bb027d772f93b9efd567224 -- 2.49.0

7 months

4
12
0 0

[PATCH] selftests/mm: Increase timeout from 180 to 900 seconds

by Shivank Garg

The mm selftests are timing out with the current 180-second limit. Testing shows that run_vmtests.sh takes approximately 11 minutes (664 seconds) to complete. Increase the timeout to 900 seconds (15 minutes) to provide sufficient buffer for the tests to complete successfully. Signed-off-by: Shivank Garg <shivankg(a)amd.com> --- tools/testing/selftests/mm/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings index a953c96aa16e..e2206265f67c 100644 --- a/tools/testing/selftests/mm/settings +++ b/tools/testing/selftests/mm/settings @@ -1 +1 @@ -timeout=180 +timeout=900 -- 2.43.0

7 months

2
2
0 0

[PATCH net v2] net: clear the dst when changing skb protocol

by Jakub Kicinski

A not-so-careful NAT46 BPF program can crash the kernel if it indiscriminately flips ingress packets from v4 to v6: BUG: kernel NULL pointer dereference, address: 0000000000000000 ip6_rcv_core (net/ipv6/ip6_input.c:190:20) ipv6_rcv (net/ipv6/ip6_input.c:306:8) process_backlog (net/core/dev.c:6186:4) napi_poll (net/core/dev.c:6906:9) net_rx_action (net/core/dev.c:7028:13) do_softirq (kernel/softirq.c:462:3) netif_rx (net/core/dev.c:5326:3) dev_loopback_xmit (net/core/dev.c:4015:2) ip_mc_finish_output (net/ipv4/ip_output.c:363:8) NF_HOOK (./include/linux/netfilter.h:314:9) ip_mc_output (net/ipv4/ip_output.c:400:5) dst_output (./include/net/dst.h:459:9) ip_local_out (net/ipv4/ip_output.c:130:9) ip_send_skb (net/ipv4/ip_output.c:1496:8) udp_send_skb (net/ipv4/udp.c:1040:8) udp_sendmsg (net/ipv4/udp.c:1328:10) The output interface has a 4->6 program attached at ingress. We try to loop the multicast skb back to the sending socket. Ingress BPF runs as part of netif_rx(), pushes a valid v6 hdr and changes skb->protocol to v6. We enter ip6_rcv_core which tries to use skb_dst(). But the dst is still an IPv4 one left after IPv4 mcast output. Clear the dst in all BPF helpers which change the protocol. Also clear the dst if we did an encap or decap as those will most likely make the dst stale. Try to preserve metadata dsts, those may carry non-routing metadata. Reviewed-by: Maciej Żenczykowski <maze(a)google.com> Acked-by: Daniel Borkmann <daniel(a)iogearbox.net> Fixes: d219df60a70e ("bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()") Fixes: 1b00e0dfe7d0 ("bpf: update skb->protocol in bpf_skb_net_grow") Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper") Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- v2: - drop on encap/decap - fix typo (protcol) - add the test to the Makefile v1: https://lore.kernel.org/20250604210604.257036-1-kuba@kernel.org I wonder if we should not skip ingress (tc_skip_classify?) for looped back packets in the first place. 
But that doesn't seem robust enough vs multiple redirections to solve the crash. Ignoring LOOPBACK packets (like the NAT46 prog should) doesn't work either, since BPF can change pkt_type arbitrarily. CC: martin.lau(a)linux.dev CC: daniel(a)iogearbox.net CC: john.fastabend(a)gmail.com CC: eddyz87(a)gmail.com CC: sdf(a)fomichev.me CC: haoluo(a)google.com CC: willemb(a)google.com CC: william.xuanziyang(a)huawei.com CC: alan.maguire(a)oracle.com CC: bpf(a)vger.kernel.org CC: edumazet(a)google.com CC: maze(a)google.com CC: shuah(a)kernel.org CC: linux-kselftest(a)vger.kernel.org CC: yonghong.song(a)linux.dev --- tools/testing/selftests/net/Makefile | 1 + net/core/filter.c | 31 +++++++++++++++++++------- tools/testing/selftests/net/nat6to4.sh | 15 +++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) create mode 100755 tools/testing/selftests/net/nat6to4.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ea84b88bcb30..ab996bd22a5f 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -27,6 +27,7 @@ TEST_PROGS += amt.sh TEST_PROGS += unicast_extensions.sh TEST_PROGS += udpgro_fwd.sh TEST_PROGS += udpgro_frglist.sh +TEST_PROGS += nat6to4.sh TEST_PROGS += veth.sh TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh diff --git a/net/core/filter.c b/net/core/filter.c index 327ca73f9cd7..d5917d6446f2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3406,8 +3406,14 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. 
*/ ret = bpf_skb_proto_xlat(skb, proto); + if (ret) + return ret; + bpf_compute_data_pointers(skb); - return ret; + if (skb_valid_dst(skb)) + skb_dst_drop(skb); + + return 0; } static const struct bpf_func_proto bpf_skb_change_proto_proto = { @@ -3554,6 +3560,9 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); + + if (skb_valid_dst(skb)) + skb_dst_drop(skb); } if (skb_is_gso(skb)) { @@ -3581,6 +3590,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { + bool decap = flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | @@ -3603,13 +3613,18 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; - /* Match skb->protocol to new outer l3 protocol */ - if (skb->protocol == htons(ETH_P_IP) && - flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) - skb->protocol = htons(ETH_P_IPV6); - else if (skb->protocol == htons(ETH_P_IPV6) && - flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) - skb->protocol = htons(ETH_P_IP); + if (decap) { + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); + + if (skb_valid_dst(skb)) + skb_dst_drop(skb); + } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); diff --git a/tools/testing/selftests/net/nat6to4.sh b/tools/testing/selftests/net/nat6to4.sh new file mode 100755 index 000000000000..0ee859b622a4 --- /dev/null +++ b/tools/testing/selftests/net/nat6to4.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +NS="ns-peer-$(mktemp -u XXXXXX)" + 
+ip netns add "${NS}" +ip -netns "${NS}" link set lo up +ip -netns "${NS}" route add default via 127.0.0.2 dev lo + +tc -n "${NS}" qdisc add dev lo ingress +tc -n "${NS}" filter add dev lo ingress prio 4 protocol ip \ + bpf object-file nat6to4.bpf.o section schedcls/egress4/snat4 direct-action + +ip netns exec "${NS}" \ + bash -c 'echo 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789abc | socat - UDP4-DATAGRAM:224.1.0.1:6666,ip-multicast-loop=1' -- 2.49.0

7 months, 1 week

3
5
0 0

ATS 2025 Schedule Now Live – Register Today!

by Gustavo Padovan

Hello everyone, The schedule for the Automated Testing Summit (ATS) 2025 is now live! You can now explore the full program and speaker list at: 🔗 https://ats25.sched.com/ This year’s ATS will be packed with talks and discussions focused on scaling test infrastructure, improving collaboration across projects, and pushing the boundaries of automation in the Linux ecosystem. 📍 ATS 2025 will take place as a co-located event at the Open Source Summit North America, on June 26th in Denver, CO. If you haven’t yet registered, you can do so here: 🔗 https://events.linuxfoundation.org/open-source-summit-north-america/feature… You can attend in person or virtually. We look forward to seeing you there! Best regards, The KernelCI Team -- Gustavo Padovan Collabora Ltd.

7 months, 1 week

1
0
0 0

[RFC net-next 6/6] selftests: drv-net: add test for RSS on flow label

by Jakub Kicinski

Add a simple test for checking that RSS on flow label works, and that it's rejected for IPv4 flows. # ./tools/testing/selftests/drivers/net/hw/rss_flow_label.py TAP version 13 1..2 ok 1 rss_flow_label.test_rss_flow_label ok 2 rss_flow_label.test_rss_flow_label_6only # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: shuah(a)kernel.org CC: sdf(a)fomichev.me CC: linux-kselftest(a)vger.kernel.org --- .../testing/selftests/drivers/net/hw/Makefile | 1 + .../drivers/net/hw/rss_flow_label.py | 151 ++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/rss_flow_label.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index df2c047ffa90..56bf1f1b8377 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -17,6 +17,7 @@ TEST_PROGS = \ loopback.sh \ pp_alloc_fail.py \ rss_ctx.py \ + rss_flow_label.py \ rss_input_xfrm.py \ tso.py \ xsk_reconfig.py \ diff --git a/tools/testing/selftests/drivers/net/hw/rss_flow_label.py b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py new file mode 100755 index 000000000000..e471e13160ae --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/rss_flow_label.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Tests for RSS hashing on IPv6 Flow Label. 
+""" + +import glob +import socket +from lib.py import CmdExitFailure +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, ksft_in, \ + ksft_not_in, ksft_raises, KsftSkipEx +from lib.py import bkg, cmd, defer, fd_read_timeout, rand_port +from lib.py import NetDrvEpEnv + + +def _ethtool_get_cfg(cfg, fl_type): + descr = cmd(f"ethtool -n {cfg.ifname} rx-flow-hash {fl_type}").stdout + + converter = { + "IP SA": "s", + "IP DA": "d", + "L3 proto": "t", + "L4 bytes 0 & 1 [TCP/UDP src port]": "f", + "L4 bytes 2 & 3 [TCP/UDP dst port]": "n", + "IPv6 Flow Label": "l", + } + + ret = "" + for line in descr.split("\n")[1:-2]: + # if this raises we probably need to add more keys to converter above + ret += converter[line] + return ret + + +def _traffic(cfg, one_sock, one_cpu): + local_port = rand_port(socket.SOCK_DGRAM) + remote_port = rand_port(socket.SOCK_DGRAM) + + sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + sock.bind(("", local_port)) + sock.connect((cfg.remote_addr_v["6"], 0)) + if one_sock: + send = f"exec 5<>/dev/udp/{cfg.addr_v['6']}/{local_port}; " \ + "for i in `seq 20`; do echo a >&5; sleep 0.02; done; exec 5>&-" + else: + send = "for i in `seq 20`; do echo a | socat -t0.02 - UDP6:" \ + f"[{cfg.addr_v['6']}]:{local_port},sourceport={remote_port}; done" + + cpus = set() + with bkg(send, shell=True, host=cfg.remote, exit_wait=True): + for _ in range(20): + fd_read_timeout(sock.fileno(), 1) + cpu = sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) + cpus.add(cpu) + + if one_cpu: + ksft_eq(len(cpus), 1, + f"{one_sock=} - expected one CPU, got traffic on: {cpus=}") + else: + ksft_ge(len(cpus), 2, + f"{one_sock=} - expected many CPUs, got traffic on: {cpus=}") + + +def test_rss_flow_label(cfg): + """ + Test hashing on IPv6 flow label. Send traffic over a single socket + and over multiple sockets. Depend on the remote having auto-label + enabled so that it randomizes the label per socket. 
+ """ + + cfg.require_ipver("6") + cfg.require_cmd("socat", remote=True) + if not hasattr(socket, "SO_INCOMING_CPU"): + raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") + + # 1 is the default, if someone changed it we probably shouldn"t mess with it + af = cmd("cat /proc/sys/net/ipv6/auto_flowlabels", host=cfg.remote).stdout + if af.strip() != "1": + raise KsftSkipEx("Remote does not have auto_flowlabels enabled") + + qcnt = len(glob.glob(f"/sys/class/net/{cfg.ifname}/queues/rx-*")) + if qcnt < 2: + raise KsftSkipEx(f"Local has only {qcnt} queues") + + # Enable flow label hashing for UDP6 + initial = _ethtool_get_cfg(cfg, "udp6") + no_lbl = initial.replace("l", "") + if "l" not in initial: + try: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 l{no_lbl}") + except CmdExitFailure as exc: + raise KsftSkipEx("Device doesn't support Flow Label for UDP6") from exc + + defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") + + _traffic(cfg, one_sock=True, one_cpu=True) + _traffic(cfg, one_sock=False, one_cpu=False) + + # Disable it, we should see no hashing (reset was already defer()ed) + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {no_lbl}") + + _traffic(cfg, one_sock=False, one_cpu=True) + + +def _check_v4_flow_types(cfg): + for fl_type in ["tcp4", "udp4", "ah4", "esp4", "sctp4"]: + try: + cur = cmd(f"ethtool -n {cfg.ifname} rx-flow-hash {fl_type}").stdout + ksft_not_in("Flow Label", cur, + comment=f"{fl_type=} has Flow Label:" + cur) + except CmdExitFailure: + # Probably does not support this flow type + pass + + +def test_rss_flow_label_6only(cfg): + """ + Test interactions with IPv4 flow types. It should not be possible to set + IPv6 Flow Label hashing for an IPv4 flow type. The Flow Label should also + not appear in the IPv4 "current config". 
+ """ + + with ksft_raises(CmdExitFailure) as cm: + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash tcp4 sdfnl") + ksft_in("Invalid argument", cm.exception.cmd.stderr) + + _check_v4_flow_types(cfg) + + # Try to enable Flow Labels and check again, in case it leaks thru + initial = _ethtool_get_cfg(cfg, "udp6") + changed = initial.replace("l", "") if "l" in initial else initial + "l" + + cmd(f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {changed}") + restore = defer(cmd, f"ethtool -N {cfg.ifname} rx-flow-hash udp6 {initial}") + + _check_v4_flow_types(cfg) + restore.exec() + _check_v4_flow_types(cfg) + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + ksft_run([test_rss_flow_label, + test_rss_flow_label_6only], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() -- 2.49.0

7 months, 1 week

1
0
0 0

[RFC net-next 5/6] selftests: drv-net: import things in lib one by one

by Jakub Kicinski

pylint doesn't understand our path hacks, and it generates a lot of warnings for driver tests. Import what we use one by one, this is hopefully not too tedious and it makes pylint happy. Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: shuah(a)kernel.org CC: mohan.prasad(a)microchip.com CC: sdf(a)fomichev.me CC: dw(a)davidwei.uk CC: linux-kselftest(a)vger.kernel.org --- .../selftests/drivers/net/hw/lib/py/__init__.py | 17 +++++++++++++++++ .../selftests/drivers/net/lib/py/__init__.py | 14 ++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py index b582885786f5..56ff11074b55 100644 --- a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py @@ -7,8 +7,25 @@ KSFT_DIR = (Path(__file__).parent / "../../../../..").resolve() try: sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * from drivers.net.lib.py import * + + # Import one by one to avoid pylint false positives + from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ + NlError, RtnlFamily + from net.lib.py import CmdExitFailure + from net.lib.py import bkg, cmd, defer, ethtool, fd_read_timeout, ip, \ + rand_port, tool, wait_port_listen + from net.lib.py import fd_read_timeout + from net.lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx + from net.lib.py import ksft_disruptive, ksft_exit, ksft_pr, ksft_run, \ + ksft_setup + from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ + ksft_ne, ksft_not_in, ksft_raises, ksft_true + from net.lib.py import NetNSEnter + from drivers.net.lib.py import GenerateTraffic + from drivers.net.lib.py import NetDrvEnv, NetDrvEpEnv except ModuleNotFoundError as e: ksft_pr("Failed importing `net` library from kernel sources") ksft_pr(str(e)) diff --git a/tools/testing/selftests/drivers/net/lib/py/__init__.py 
b/tools/testing/selftests/drivers/net/lib/py/__init__.py index 401e70f7f136..9ed1d8f70524 100644 --- a/tools/testing/selftests/drivers/net/lib/py/__init__.py +++ b/tools/testing/selftests/drivers/net/lib/py/__init__.py @@ -7,7 +7,21 @@ KSFT_DIR = (Path(__file__).parent / "../../../..").resolve() try: sys.path.append(KSFT_DIR.as_posix()) + from net.lib.py import * + + # Import one by one to avoid pylint false positives + from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ + NlError, RtnlFamily + from net.lib.py import CmdExitFailure + from net.lib.py import bkg, cmd, defer, ethtool, fd_read_timeout, ip, \ + rand_port, tool, wait_port_listen + from net.lib.py import fd_read_timeout + from net.lib.py import KsftSkipEx, KsftFailEx, KsftXfailEx + from net.lib.py import ksft_disruptive, ksft_exit, ksft_pr, ksft_run, \ + ksft_setup + from net.lib.py import ksft_eq, ksft_ge, ksft_in, ksft_is, ksft_lt, \ + ksft_ne, ksft_not_in, ksft_raises, ksft_true except ModuleNotFoundError as e: ksft_pr("Failed importing `net` library from kernel sources") ksft_pr(str(e)) -- 2.49.0

7 months, 1 week

1
0
0 0

[PATCH] selftests: ir_decoder: Convert header comment to proper multi-line block

by Abdelrahman Fekry

The test file for the IR decoder used single-line comments at the top to document its purpose and licensing, which is inconsistent with the style used throughout the Linux kernel. In this patch, I converted the file header to a proper multi-line comment block (/*) that aligns with standard kernel practices. This improves readability and consistency across selftests, and ensures the license and documentation are clearly visible in a familiar format. No functional changes have been made. Signed-off-by: Abdelrahman Fekry <Abdelrahmanfekry375(a)gmail.com> --- tools/testing/selftests/ir/ir_loopback.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c index f4a15cbdd5ea..2de4a6296f35 100644 --- a/tools/testing/selftests/ir/ir_loopback.c +++ b/tools/testing/selftests/ir/ir_loopback.c @@ -1,14 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 -// test ir decoder -// -// Copyright (C) 2018 Sean Young <sean(a)mess.org> - -// When sending LIRC_MODE_SCANCODE, the IR will be encoded. rc-loopback -// will send this IR to the receiver side, where we try to read the decoded -// IR. Decoding happens in a separate kernel thread, so we will need to -// wait until that is scheduled, hence we use poll to check for read -// readiness. - +/* Copyright (C) 2018 Sean Young <sean(a)mess.org> + * + * Selftest for IR decoder + * + * + * When sending LIRC_MODE_SCANCODE, the IR will be encoded. rc-loopback + * will send this IR to the receiver side, where we try to read the decoded + * IR. Decoding happens in a separate kernel thread, so we will need to + * wait until that is scheduled, hence we use poll to check for read + * readiness. +*/ #include <linux/lirc.h> #include <errno.h> #include <stdio.h> -- 2.25.1

7 months, 1 week

3
3
0 0

[PATCH v6 0/2] x86/fred: Prevent immediate repeat of single step trap on return from SIGTRAP handler

by Xin Li (Intel)

IDT event delivery has a debug hole in which it does not generate #DB upon returning to userspace before the first userspace instruction is executed if the Trap Flag (TF) is set. FRED closes this hole by introducing a software event flag, i.e., bit 17 of the augmented SS: if the bit is set and ERETU would result in RFLAGS.TF = 1, a single-step trap will be pending upon completion of ERETU. However I overlooked properly setting and clearing the bit in different situations. Thus when FRED is enabled, if the Trap Flag (TF) is set without an external debugger attached, it can lead to an infinite loop in the SIGTRAP handler. To avoid this, the software event flag in the augmented SS must be cleared, ensuring that no single-step trap remains pending when ERETU completes. This patch set combines the fix [1] and its corresponding selftest [2] (requested by Dave Hansen) into one patch set. [1] https://lore.kernel.org/lkml/20250523050153.3308237-1-xin@zytor.com/ [2] https://lore.kernel.org/lkml/20250530230707.2528916-1-xin@zytor.com/ This patch set is based on tip/x86/urgent branch. Link to v5 of this patch set: https://lore.kernel.org/lkml/20250606174528.1004756-1-xin@zytor.com/ Changes in v6: *) Replace a "sub $128, %rsp" with "add $-128, %rsp" (hpa). *) Declared loop_count_on_same_ip inside sigtrap() (Sohil). *) s/sigtrap/SIGTRAP (Sohil). *) Add TB from Sohil to the first patch. Xin Li (Intel) (2): x86/fred/signal: Prevent immediate repeat of single step trap on return from SIGTRAP handler selftests/x86: Add a test to detect infinite SIGTRAP handler loop arch/x86/include/asm/sighandling.h | 22 +++++ arch/x86/kernel/signal_32.c | 4 + arch/x86/kernel/signal_64.c | 4 + tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/sigtrap_loop.c | 101 +++++++++++++++++++++ 5 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/sigtrap_loop.c base-commit: dd2922dcfaa3296846265e113309e5f7f138839f -- 2.49.0

7 months, 1 week

2
3
0 0

[PATCH 0/2] kselftest/arm64: Add coverage for the interaction of vfork() and GCS

by Mark Brown

I had cause to look at the vfork() support for GCS and realised that we don't have any direct test coverage. This series fixes that by adding vfork() to nolibc and then using it in basic-gcs to provide some simple vfork() coverage. Signed-off-by: Mark Brown <broonie(a)kernel.org> --- Mark Brown (2): tools/nolibc: Provide vfork() kselftest/arm64: Add a test for vfork() with GCS tools/include/nolibc/sys.h | 29 ++++++++++++ tools/testing/selftests/arm64/gcs/basic-gcs.c | 63 +++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) --- base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494 change-id: 20250528-arm64-gcs-vfork-exit-4a7daf7652ee Best regards, -- Mark Brown <broonie(a)kernel.org>

7 months, 1 week

2
7
0 0

[PATCH v2] selftests/bpf: Validate UDP length in cls_redirect test

by Suchit Karunakaran

From: Suchit <suchitkarunakaran(a)gmail.com> Add a validation step to ensure that the UDP payload is long enough to contain the expected GUE and UNIGUE encapsulation headers. Signed-off-by: Suchit <suchitkarunakaran(a)gmail.com> --- Changes since v1: - Rebase tools/testing/selftests/bpf/progs/test_cls_redirect.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c index f344c6835e84..c1d2eaee2e77 100644 --- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c @@ -978,7 +978,14 @@ int cls_redirect(struct __sk_buff *skb) return TC_ACT_OK; } - /* TODO Check UDP length? */ + uint16_t udp_len = bpf_ntohs(encap->udp.len); + uint16_t min_encap_len = sizeof(encap->udp) + sizeof(encap->gue) + sizeof(encap->unigue); + + if (udp_len < min_encap_len) { + metrics->errors_total_malformed_encapsulation++; + return TC_ACT_SHOT; + } + if (encap->udp.dest != ENCAPSULATION_PORT) { return TC_ACT_OK; } -- 2.49.0

7 months, 1 week

3
2
1 0

[PATCH v5 0/2] x86/fred: Prevent immediate repeat of single step trap on return from SIGTRAP handler

by Xin Li (Intel)

IDT event delivery has a debug hole in which it does not generate #DB upon returning to userspace before the first userspace instruction is executed if the Trap Flag (TF) is set. FRED closes this hole by introducing a software event flag, i.e., bit 17 of the augmented SS: if the bit is set and ERETU would result in RFLAGS.TF = 1, a single-step trap will be pending upon completion of ERETU. However I overlooked properly setting and clearing the bit in different situations. Thus when FRED is enabled, if the Trap Flag (TF) is set without an external debugger attached, it can lead to an infinite loop in the SIGTRAP handler. To avoid this, the software event flag in the augmented SS must be cleared, ensuring that no single-step trap remains pending when ERETU completes. This patch set combines the fix [1] and its corresponding selftest [2] (requested by Dave Hansen) into one patch set. [1] https://lore.kernel.org/lkml/20250523050153.3308237-1-xin@zytor.com/ [2] https://lore.kernel.org/lkml/20250530230707.2528916-1-xin@zytor.com/ This patch set is based on tip/x86/urgent branch as of today. Link to v4 of this patch set: https://lore.kernel.org/lkml/20250605181020.590459-1-xin@zytor.com/ Changes in v5: *) Accurately rephrase the shortlog (hpa). *) Do "sub $-128, %rsp" rather than "add $128, %rsp", which is more efficient in code size (hpa). *) Add TB from Sohil. *) Add Cc: stable(a)vger.kernel.org to all patches. 
Xin Li (Intel) (2): x86/fred/signal: Prevent immediate repeat of single step trap on return from SIGTRAP handler selftests/x86: Add a test to detect infinite sigtrap handler loop arch/x86/include/asm/sighandling.h | 22 +++++ arch/x86/kernel/signal_32.c | 4 + arch/x86/kernel/signal_64.c | 4 + tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/sigtrap_loop.c | 98 ++++++++++++++++++++++ 5 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/sigtrap_loop.c base-commit: dd2922dcfaa3296846265e113309e5f7f138839f -- 2.49.0

7 months, 1 week

3
4
0 0

[PATCH RFC bpf-next 0/4] bpf, arm64: support up to 12 arguments

by Alexis Lothoré (eBPF Foundation)

Hello, this series is a revival of Xu Kuohai's work to enable larger argument counts for BPF programs on ARM64 ([1]). His initial series received some positive feedback, but lacked some specific case handling around arguments alignment (see AAPCS64 C.14 rule in section 6.8.2, [2]). There has been another attempt from Puranjay Mohan, which was unfortunately missing the same thing ([3]). Since there has been some time between those series and this new one, I chose to send it as a new series rather than a new revision of the existing series. To support the increased argument counts and arguments larger than registers size (eg: structures), the trampoline does the following: - for bpf programs: arguments are retrieved from both registers and the function stack, and pushed in the trampoline stack as an array of u64 to generate the programs context. It is then passed by pointer to the bpf programs - when the trampoline is in charge of calling the original function: it restores the registers content, and generates a new stack layout for the additional arguments that do not fit in registers. This new attempt is based on Xu's series and aims to handle the missing alignment concern raised in the reviews discussions. The main novelties are then around arguments alignments: - the first commit is exposing some new info in the BTF function model passed to the JIT compiler to allow it to deduce the needed alignment when configuring the trampoline stack - the second commit is taken from Xu's series, and received the following modifications: - the calc_aux_args computes an expected alignment for each argument - the calc_aux_args computes two different stack space sizes: the one needed to store the bpf programs context, and the original function stacked arguments (which needs alignment). 
Those stack sizes are in bytes instead of "slots" - when saving/restoring arguments for bpf program or for the original function, make sure to align the load/store accordingly, when relevant - a few typos fixes and some rewording, raised by the review on the original series - the last commit introduces some explicit tests that ensure that the needed alignment is enforced by the trampoline I marked the series as RFC because it appears that the new tests trigger some failures in CI on x86 and s390, despite the series not touching any code related to those architectures. Some very early investigation/gdb debugging on the x86 side seems to hint that it could be related to the same missing alignment too (based on section 3.2.3 in [4], and so the x86 trampoline would need the same alignment handling ?). For s390 it looks less clear, as all values captured from the bpf test program are set to 0 in the CI output, and I don't have the proper setup yet to check the low level details. I am tempted to isolate those new tests (which were actually useful to spot real issues while tuning the ARM64 trampoline) and add them to the relevant DENYLIST files for x86/s390, but I guess this is not the right direction, so I would gladly take a second opinion on this. 
[1] https://lore.kernel.org/all/20230917150752.69612-1-xukuohai@huaweicloud.com… [2] https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#id82 [3] https://lore.kernel.org/bpf/20240705125336.46820-1-puranjay@kernel.org/ [4] https://refspecs.linuxbase.org/elf/x86_64-abi-0.99.pdf Signed-off-by: Alexis Lothoré (eBPF Foundation) <alexis.lothore(a)bootlin.com> --- Alexis Lothoré (eBPF Foundation) (3): bpf: add struct largest member size in func model bpf/selftests: add tests to validate proper arguments alignment on ARM64 bpf/selftests: enable tracing tests for ARM64 Xu Kuohai (1): bpf, arm64: Support up to 12 function arguments arch/arm64/net/bpf_jit_comp.c | 235 ++++++++++++++++----- include/linux/bpf.h | 1 + kernel/bpf/btf.c | 25 +++ tools/testing/selftests/bpf/DENYLIST.aarch64 | 3 - .../selftests/bpf/prog_tests/tracing_struct.c | 23 ++ tools/testing/selftests/bpf/progs/tracing_struct.c | 10 +- .../selftests/bpf/progs/tracing_struct_many_args.c | 67 ++++++ .../testing/selftests/bpf/test_kmods/bpf_testmod.c | 50 +++++ 8 files changed, 357 insertions(+), 57 deletions(-) --- base-commit: 91e7eb701b4bc389e7ddfd80ef6e82d1a6d2d368 change-id: 20250220-many_args_arm64-8bd3747e6948 Best regards, -- Alexis Lothoré, Bootlin Embedded Linux and Kernel engineering https://bootlin.com

7 months, 1 week

8
31
0 0

[PATCH v2 00/15] Consolidate iommu page table implementations (AMD)

by Jason Gunthorpe

Currently each of the iommu page table formats duplicates all of the logic to maintain the page table and perform map/unmap/etc operations. There are several different versions of the algorithms between all the different formats. The io-pgtable system provides an interface to help isolate the page table code from the iommu driver, but doesn't provide tools to implement the common algorithms. This makes it very hard to improve the state of the pagetable code under the iommu domains as any proposed improvement needs to alter a large number of different driver code paths. Combined with a lack of software based testing this makes improvement in this area very hard. iommufd wants several new page table operations: - More efficient map/unmap operations, using iommufd's batching logic - unmap that returns the physical addresses into a batch as it progresses - cut that allows splitting areas so large pages can have holes poked in them dynamically (ie guestmemfd hitless shared/private transitions) - More aggressive freeing of table memory to avoid waste - Fragmenting large pages so that dirty tracking can be more granular - Reassembling large pages so that VMs can run at full IO performance in migration/dirty tracking error flows - KHO integration for kernel live upgrade Together these are algorithmically complex enough to be a very significant task to go and implement in all the page table formats we support. Just the "server" focused drivers use almost all the formats (ARMv8 S1&S2 / x86 PAE / AMDv1 / VT-D SS / RISCV) Instead of doing the duplicated work, this series takes the first step to consolidate the algorithms into one place. In spirit it is similar to the work Christoph did a few years back to pull the redundant get_user_pages() implementations out of the arch code into core MM. This unlocked a great deal of improvement in that space in the following years. I would like to see the same benefit in iommu as well. 
My first RFC showed a bigger picture with almost all formats and more algorithms. This series reorganizes that to be narrowly focused on just enough to convert the AMD driver to use the new mechanism. kunit tests are provided that allow good testing of the algorithms and all formats on x86, nothing is arch specific. AMD is one of the simpler options as the HW is quite uniform with few different options/bugs while still requiring the complicated contiguous pages support. The HW also has a very simple range based invalidation approach that is easy to implement. The AMD v1 and AMD v2 page table formats are implemented bit for bit identical to the current code, tested using a compare kunit test that checks against the io-pgtable version (on github, see below). Updating the AMD driver to replace the io-pgtable layer with the new stuff is fairly straightforward now. The layering is fixed up in the new version so that all the invalidation goes through function pointers. Several small fixing patches have come out of this as I've been fixing the problems that the test suite uncovers in the current code, and implementing the fixed version in iommupt. On performance, there is a quite wide variety of implementation designs across all the drivers. 
Looking at some key performance across the main formats: iommu_map(): pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 53,66 , 51,63 , 19.19 (AMDv1) 256*2^12, 386,1909 , 367,1795 , 79.79 256*2^21, 362,1633 , 355,1556 , 77.77 2^12, 56,62 , 52,59 , 11.11 (AMDv2) 256*2^12, 405,1355 , 357,1292 , 72.72 256*2^21, 393,1160 , 358,1114 , 67.67 2^12, 55,65 , 53,62 , 14.14 (VTD second stage) 256*2^12, 391,518 , 332,512 , 35.35 256*2^21, 383,635 , 336,624 , 46.46 2^12, 57,65 , 55,63 , 12.12 (ARM 64 bit) 256*2^12, 380,389 , 361,369 , 2.02 256*2^21, 358,419 , 345,400 , 13.13 iommu_unmap(): pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 69,88 , 65,85 , 23.23 (AMDv1) 256*2^12, 353,6498 , 331,6029 , 94.94 256*2^21, 373,6014 , 360,5706 , 93.93 2^12, 71,72 , 66,69 , 4.04 (AMDv2) 256*2^12, 228,891 , 206,871 , 76.76 256*2^21, 254,721 , 245,711 , 65.65 2^12, 69,87 , 65,82 , 20.20 (VTD second stage) 256*2^12, 210,321 , 200,315 , 36.36 256*2^21, 255,349 , 238,342 , 30.30 2^12, 72,77 , 68,74 , 8.08 (ARM 64 bit) 256*2^12, 521,357 , 447,346 , -29.29 256*2^21, 489,358 , 433,345 , -25.25 * Above numbers include additional patches to remove the iommu_pgsize() overheads. gcc 13.3.0, i7-12700 This version provides fairly consistent performance across formats. ARM unmap performance is quite different because this version supports contiguous pages and uses a very different algorithm for unmapping. Though why it is so much worse compared to AMDv1 I haven't figured out yet. The per-format commits include a more detailed chart. 
There is a second branch: https://github.com/jgunthorpe/linux/commits/iommu_pt_all Containing supporting work and future steps: - ARM short descriptor (32 bit), ARM long descriptor (64 bit) formats - VT-D second stage format - DART v1 & v2 format - Draft of an iommufd 'cut' operation to break down huge pages - Draft of support for a DMA incoherent HW page table walker - A compare test that checks the iommupt formats against the iopgtable interface, including updating AMD to have a working iopgtable and patches to make VT-D have an iopgtable for testing. - A performance test to micro-benchmark map and unmap against iopgtable My strategy is to go one by one for the drivers: - AMD driver conversion - RISCV page table and driver - Intel VT-D driver and VTDSS page table - ARM SMMUv3 And concurrently work on the algorithm side: - debugfs content dump, like VT-D has - Cut support - Increase/Decrease page size support - map/unmap batching - KHO As we make more algorithm improvements the value to convert the drivers increases. 
This is on github: https://github.com/jgunthorpe/linux/commits/iommu_pt v1: - AMD driver only, many code changes RFC: https://lore.kernel.org/all/0-v1-01fa10580981+1d-iommu_pt_jgg@nvidia.com/ Alejandro Jimenez (1): iommu/amd: Use the generic iommu page table Jason Gunthorpe (14): genpt: Generic Page Table base API genpt: Add Documentation/ files iommupt: Add the basic structure of the iommu implementation iommupt: Add the AMD IOMMU v1 page table format iommupt: Add iova_to_phys op iommupt: Add unmap_pages op iommupt: Add map_pages op iommupt: Add read_and_clear_dirty op iommupt: Add a kunit test for Generic Page Table iommupt: Add a mock pagetable format for iommufd selftest to use iommufd: Change the selftest to use iommupt instead of xarray iommupt: Add the x86 64 bit page table format iommu/amd: Remove AMD io_pgtable support iommupt: Add a kunit test for the IOMMU implementation .clang-format | 1 + Documentation/driver-api/generic_pt.rst | 105 ++ Documentation/driver-api/index.rst | 1 + drivers/iommu/Kconfig | 2 + drivers/iommu/Makefile | 1 + drivers/iommu/amd/Kconfig | 5 +- drivers/iommu/amd/Makefile | 2 +- drivers/iommu/amd/amd_iommu.h | 1 - drivers/iommu/amd/amd_iommu_types.h | 109 +- drivers/iommu/amd/io_pgtable.c | 560 -------- drivers/iommu/amd/io_pgtable_v2.c | 370 ------ drivers/iommu/amd/iommu.c | 493 ++++--- drivers/iommu/generic_pt/.kunitconfig | 13 + drivers/iommu/generic_pt/Kconfig | 72 ++ drivers/iommu/generic_pt/fmt/Makefile | 26 + drivers/iommu/generic_pt/fmt/amdv1.h | 407 ++++++ drivers/iommu/generic_pt/fmt/defs_amdv1.h | 21 + drivers/iommu/generic_pt/fmt/defs_x86_64.h | 21 + drivers/iommu/generic_pt/fmt/iommu_amdv1.c | 15 + drivers/iommu/generic_pt/fmt/iommu_mock.c | 10 + drivers/iommu/generic_pt/fmt/iommu_template.h | 48 + drivers/iommu/generic_pt/fmt/iommu_x86_64.c | 12 + drivers/iommu/generic_pt/fmt/x86_64.h | 241 ++++ drivers/iommu/generic_pt/iommu_pt.h | 1146 +++++++++++++++++ drivers/iommu/generic_pt/kunit_generic_pt.h | 721 +++++++++++ 
drivers/iommu/generic_pt/kunit_iommu.h | 183 +++ drivers/iommu/generic_pt/kunit_iommu_pt.h | 451 +++++++ drivers/iommu/generic_pt/pt_common.h | 351 +++++ drivers/iommu/generic_pt/pt_defs.h | 312 +++++ drivers/iommu/generic_pt/pt_fmt_defaults.h | 193 +++ drivers/iommu/generic_pt/pt_iter.h | 638 +++++++++ drivers/iommu/generic_pt/pt_log2.h | 130 ++ drivers/iommu/io-pgtable.c | 4 - drivers/iommu/iommufd/Kconfig | 1 + drivers/iommu/iommufd/iommufd_test.h | 11 +- drivers/iommu/iommufd/selftest.c | 439 +++---- include/linux/generic_pt/common.h | 166 +++ include/linux/generic_pt/iommu.h | 264 ++++ include/linux/io-pgtable.h | 2 - tools/testing/selftests/iommu/iommufd.c | 60 +- tools/testing/selftests/iommu/iommufd_utils.h | 12 + 41 files changed, 6046 insertions(+), 1574 deletions(-) create mode 100644 Documentation/driver-api/generic_pt.rst delete mode 100644 drivers/iommu/amd/io_pgtable.c delete mode 100644 drivers/iommu/amd/io_pgtable_v2.c create mode 100644 drivers/iommu/generic_pt/.kunitconfig create mode 100644 drivers/iommu/generic_pt/Kconfig create mode 100644 drivers/iommu/generic_pt/fmt/Makefile create mode 100644 drivers/iommu/generic_pt/fmt/amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/defs_amdv1.h create mode 100644 drivers/iommu/generic_pt/fmt/defs_x86_64.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_amdv1.c create mode 100644 drivers/iommu/generic_pt/fmt/iommu_mock.c create mode 100644 drivers/iommu/generic_pt/fmt/iommu_template.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_x86_64.c create mode 100644 drivers/iommu/generic_pt/fmt/x86_64.h create mode 100644 drivers/iommu/generic_pt/iommu_pt.h create mode 100644 drivers/iommu/generic_pt/kunit_generic_pt.h create mode 100644 drivers/iommu/generic_pt/kunit_iommu.h create mode 100644 drivers/iommu/generic_pt/kunit_iommu_pt.h create mode 100644 drivers/iommu/generic_pt/pt_common.h create mode 100644 drivers/iommu/generic_pt/pt_defs.h create mode 100644 
drivers/iommu/generic_pt/pt_fmt_defaults.h create mode 100644 drivers/iommu/generic_pt/pt_iter.h create mode 100644 drivers/iommu/generic_pt/pt_log2.h create mode 100644 include/linux/generic_pt/common.h create mode 100644 include/linux/generic_pt/iommu.h base-commit: db37090502f67e46541e53b91f00bbd565c96bd0 -- 2.43.0

7 months, 1 week

8
37
0 0

[PATCH] selftests/mm: Skip failed memfd setups in gup_longterm

by Mark Brown

Unlike the other cases gup_longterm's memfd tests previously skipped the test when failing to set up the file descriptor to test, restore this behaviour. Signed-off-by: Mark Brown <broonie(a)kernel.org> --- tools/testing/selftests/mm/gup_longterm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 8a97ac5176a4..29047d2e0c49 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -298,8 +298,11 @@ static void run_with_memfd(test_fn fn, const char *desc) log_test_start("%s ... with memfd", desc); fd = memfd_create("test", 0); - if (fd < 0) + if (fd < 0) { ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno)); + log_test_result(KSFT_SKIP); + return; + } fn(fd, pagesize); close(fd); @@ -366,6 +369,8 @@ static void run_with_memfd_hugetlb(test_fn fn, const char *desc, fd = memfd_create("test", flags); if (fd < 0) { ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno)); + log_test_result(KSFT_SKIP); + return; } fn(fd, hugetlbsize); --- base-commit: ec7714e4947909190ffb3041a03311a975350fe0 change-id: 20250603-selftest-mm-gup-longterm-tweaks-e685a8ae9751 Best regards, -- Mark Brown <broonie(a)kernel.org>

7 months, 1 week

4
3
0 0

[PATCH v5 0/5] kunit: Add support for suppressing warning backtraces

by Alessandro Carminati

Some unit tests intentionally trigger warning backtraces by passing bad parameters to kernel API functions. Such unit tests typically check the return value from such calls, not the existence of the warning backtrace. Such intentionally generated warning backtraces are neither desirable nor useful for a number of reasons: - They can result in overlooked real problems. - A warning that suddenly starts to show up in unit tests needs to be investigated and has to be marked to be ignored, for example by adjusting filter scripts. Such filters are ad hoc because there is no real standard format for warnings. On top of that, such filter scripts would require constant maintenance. One option to address the problem would be to add messages such as "expected warning backtraces start/end here" to the kernel log. However, that would again require filter scripts, might result in missing real problematic warning backtraces triggered while the test is running, and the irrelevant backtrace(s) would still clog the kernel log. Solve the problem by providing a means to identify and suppress specific warning backtraces while executing test code. Support suppressing multiple backtraces while at the same time limiting changes to generic code to the absolute minimum. Overview: Patch#1 Introduces the suppression infrastructure. Patch#2 Mitigate the impact at WARN*() sites. Patch#3 Adds selftests to validate the functionality. Patch#4 Demonstrates real-world usage in the DRM subsystem. Patch#5 Documents the new API and usage guidelines. Design Notes: The objective is to suppress unwanted WARN*() generated messages. Although most major architectures share common bug handling via `lib/bug.c` and `report_bug()`, some minor or legacy architectures still rely on their own platform-specific handling. This divergence must be considered in any such feature. 
Additionally, a key challenge in implementing this feature is the fragmentation of `WARN*()` messages emission: specific part in the macro, common with BUG*() part in the exception handler. As a result, any intervention to suppress the message must occur before the illegal instruction. Lessons from the Previous Attempt In earlier iterations, suppression logic was added inside the `__report_bug()` function to intercept WARN*() messages not producing messages in the macro. To implement the check in the bug handler code, two strategies were considered: * Strategy #1: Use `kallsyms` to infer the originating functionid, namely a pointer to the function. Since in any case, the user interface relies on function names, they must be translated into addresses at suppression-time or at check-time. Assuming translation at suppression-time, the `kallsyms` subsystem needs to be used to determine the symbol address from the name, and again to produce the functionid from `bugaddr`. This approach proved unreliable due to compiler-induced transformations such as inlining, cloning, and code fragmentation. Attempting to prevent them is also inconvenient because several `WARN()` sites are in functions intentionally declared as `__always_inline`. * Strategy #2: Store function name `__func__` in `struct bug_entry` in the `__bug_table`. This implementation was used in the previous version. However, `__func__` is a compiler-generated symbol, which complicates relocation and linking in position-independent code. Workarounds such as storing offsets from `.rodata` or embedding string literals directly into the table would have either significantly increased complexity or increased the __bug_table size. Additionally, architectures not using the unified `BUG()` path would still require ad-hoc handling. Because of the current WARN*() message production strategy, a few WARN*() macros still need a check to suppress the part of the message produced in the macro itself. 
Current Proposal: Check Directly in the `WARN()` Macros. This avoids the need for function symbol resolution or ELF section modification. Suppression is implemented directly in the `WARN*()` macros. A helper function, `__kunit_is_suppressed_warning()`, is used to determine whether suppression applies. It is marked as `noinstr`, since some `WARN*()` sites reside in non-instrumentable sections. As it uses `strcmp`, a `noinstr` version of `strcmp` was introduced. The implementation is deliberately simple and avoids architecture-specific optimizations to preserve portability. Since this mechanism compares function names and is intended for test usage only, performance is not a primary concern. This series is based on the RFC patch and subsequent discussion at https://patchwork.kernel.org/project/linux-kselftest/patch/02546e59-1afe-4b… and offers a more comprehensive solution of the problem discussed there. Changes since RFC: - Introduced CONFIG_KUNIT_SUPPRESS_BACKTRACE - Minor cleanups and bug fixes - Added support for all affected architectures - Added support for counting suppressed warnings - Added unit tests using those counters - Added patch to suppress warning backtraces in dev_addr_lists tests Changes since v1: - Rebased to v6.9-rc1 - Added Tested-by:, Acked-by:, and Reviewed-by: tags [I retained those tags since there have been no functional changes] - Introduced KUNIT_SUPPRESS_BACKTRACE configuration option, enabled by default. Changes since v2: - Rebased to v6.9-rc2 - Added comments to drm warning suppression explaining why it is needed. - Added patch to move conditional code in arch/sh/include/asm/bug.h to avoid kerneldoc warning - Added architecture maintainers to Cc: for architecture specific patches - No functional changes Changes since v3: - Rebased to v6.14-rc6 - Dropped net: "kunit: Suppress lock warning noise at end of dev_addr_lists tests" since 3db3b62955cd6d73afde05a17d7e8e106695c3b9 - Added __kunit_ and KUNIT_ prefixes. 
- Tested on affected architectures. Changes since v4: - Rebased to v6.15-rc7 - Dropped all code in __report_bug() - Moved all checks into the WARN*() macros. - Dropped all architecture specific code. - Made __kunit_is_suppressed_warning nice to noinstr functions. Alessandro Carminati (2): bug/kunit: Core support for suppressing warning backtraces bug/kunit: Suppressing warning backtraces reduced impact on WARN*() sites Guenter Roeck (3): Add unit tests to verify that warning backtrace suppression works. drm: Suppress intentional warning backtraces in scaling unit tests kunit: Add documentation for warning backtrace suppression API Documentation/dev-tools/kunit/usage.rst | 30 ++++++- drivers/gpu/drm/tests/drm_rect_test.c | 16 ++++ include/asm-generic/bug.h | 48 +++++++---- include/kunit/bug.h | 62 ++++++++++++++ include/kunit/test.h | 1 + lib/kunit/Kconfig | 9 ++ lib/kunit/Makefile | 9 +- lib/kunit/backtrace-suppression-test.c | 105 ++++++++++++++++++++++++ lib/kunit/bug.c | 54 ++++++++++++ 9 files changed, 316 insertions(+), 18 deletions(-) create mode 100644 include/kunit/bug.h create mode 100644 lib/kunit/backtrace-suppression-test.c create mode 100644 lib/kunit/bug.c -- 2.34.1

7 months, 1 week

7
29
0 0

[PATCH v5 00/29] iommufd: Add vIOMMU infrastructure (Part-4 HW QUEUE)

by Nicolin Chen

The vIOMMU object is designed to represent a slice of an IOMMU HW for its virtualization features shared with or passed to user space (a VM mostly) in a way of HW acceleration. This extended the HWPT-based design for more advanced virtualization features. HW QUEUE introduced by this series as a part of the vIOMMU infrastructure represents a HW accelerated queue/buffer for VM to use exclusively, e.g. - NVIDIA's Virtual Command Queue - AMD vIOMMU's Command Buffer, Event Log Buffer, and PPR Log Buffer each of which allows its IOMMU HW to directly access a queue memory owned by a guest VM and allows a guest OS to control the HW queue directly, to avoid VM Exit overheads to improve the performance. Introduce IOMMUFD_OBJ_HW_QUEUE and its pairing IOMMUFD_CMD_HW_QUEUE_ALLOC allowing VMM to forward the IOMMU-specific queue info, such as queue base address, size, etc. Meanwhile, a guest-owned queue needs the guest kernel to control the queue by reading/writing its consumer and producer indexes, via MMIO accesses to the hardware MMIO registers. Introduce an mmap infrastructure for iommufd to support passing through a piece of MMIO region from the host physical address space to the guest physical address space. The mmap info (offset/length) used by an mmap syscall must be pre-allocated and returned to the user space via an output driver-data during an IOMMUFD_CMD_HW_QUEUE_ALLOC call. Thus, it requires a driver-specific user data support in the vIOMMU allocation flow. As a real-world use case, this series implements a HW QUEUE support in the tegra241-cmdqv driver for VCMDQs on NVIDIA Grace CPU. In other words, it is also the Tegra CMDQV series Part-2 (user-space support), reworked from Previous RFCv1: https://lore.kernel.org/all/cover.1712978212.git.nicolinc@nvidia.com/ This enables the HW accelerated feature for NVIDIA Grace CPU. 
Compared to the standard SMMUv3 operating in the nested translation mode trapping CMDQ for TLBI and ATC_INV commands, this gives a huge performance improvement: 70% to 90% reductions of invalidation time were measured by various DMA unmap tests running in a guest OS. // Unmap latencies from "dma_map_benchmark -g @granule -t @threads", // by toggling "/sys/kernel/debug/iommu/tegra241_cmdqv/bypass_vcmdq" @granule | @threads | bypass_vcmdq=1 | bypass_vcmdq=0 4KB 1 35.7 us 5.3 us 16KB 1 41.8 us 6.8 us 64KB 1 68.9 us 9.9 us 128KB 1 109.0 us 12.6 us 256KB 1 187.1 us 18.0 us 4KB 2 96.9 us 6.8 us 16KB 2 97.8 us 7.5 us 64KB 2 151.5 us 10.7 us 128KB 2 257.8 us 12.7 us 256KB 2 443.0 us 17.9 us This is on Github: https://github.com/nicolinc/iommufd/commits/iommufd_hw_queue-v5 Paring QEMU branch for testing: https://github.com/nicolinc/qemu/commits/wip/for_iommufd_hw_queue-v5 Changelog v5 * Rebase on v6.15-rc6 * Add Reviewed-by from Jason and Kevin * Correct typos in kdoc and update commit logs * [iommufd] Add a cosmetic fix * [iommufd] Drop unused num_pfns * [iommufd] Drop unnecessary check * [iommufd] Reorder patch sequence * [iommufd] Use io_remap_pfn_range() * [iommufd] Use success oriented flow * [iommufd] Fix max_npages calculation * [iommufd] Add more selftest coverage * [iommufd] Drop redundant static_assert * [iommufd] Fix mmap pfn range validation * [iommufd] Reject unmap on pinned iovas * [iommufd] Drop redundant vm_flags_set() * [iommufd] Drop iommufd_struct_destroy() * [iommufd] Drop redundant queue iova test * [iommufd] Use "mmio_addr" and "mmio_pfn" * [iommufd] Rename to "nesting_parent_iova" * [iommufd] Make iopt_pin_pages call option * [iommufd] Add ictx comparison in depend() * [iommufd] Add iommufd_object_alloc_ucmd() * [iommufd] Move kcalloc() after validations * [iommufd] Replace ictx setting with WARN_ON * [iommufd] Make hw_info's type bidirectional * [smmu] Add supported_vsmmu_type in impl_ops * [smmu] Drop impl report in smmu vendor struct * [tegra] Add 
IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV * [tegra] Replace "number of VINTFs" with a note * [tegra] Drop the redundant lvcmdq pointer setting * [tegra] Flag IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA * [tegra] Use "vintf_alloc_vsid" for vdevice_alloc op v4 https://lore.kernel.org/all/cover.1746757630.git.nicolinc@nvidia.com/ * Rebase on v6.15-rc5 * Add Reviewed-by from Vasant * Rename "vQUEUE" to "HW QUEUE" * Use "offset" and "length" for all mmap-related variables * [iommufd] Use u64 for guest PA * [iommufd] Fix typo in uAPI doc * [iommufd] Rename immap_id to offset * [iommufd] Drop the partial-size mmap support * [iommufd] Do not replace WARN_ON with WARN_ON_ONCE * [iommufd] Use "u64 base_addr" for queue base address * [iommufd] Use u64 base_pfn/num_pfns for immap structure * [iommufd] Correct the size passed in to mtree_alloc_range() * [iommufd] Add IOMMUFD_VIOMMU_FLAG_HW_QUEUE_READS_PA to viommu_ops v3 https://lore.kernel.org/all/cover.1746139811.git.nicolinc@nvidia.com/ * Add Reviewed-by from Baolu, Pranjal, and Alok * Revise kdocs, uAPI docs, and commit logs * Rename "vCMDQ" back to "vQUEUE" for AMD cases * [tegra] Add tegra241_vcmdq_hw_flush_timeout() * [tegra] Rename vsmmu_alloc to alloc_vintf_user * [tegra] Use writel for SID replacement registers * [tegra] Move mmap removal call to vsmmu_destroy op * [tegra] Fix revert in tegra241_vintf_alloc_lvcmdq_user() * [iommufd] Replace "& ~PAGE_MASK" with PAGE_ALIGNED() * [iommufd] Add an object-type "owner" to immap structure * [iommufd] Drop the ictx input in the new for-driver APIs * [iommufd] Add iommufd_vma_ops to keep track of mmap lifecycle * [iommufd] Add viommu-based iommufd_viommu_alloc/destroy_mmap helpers * [iommufd] Rename iommufd_ctx_alloc/free_mmap to _iommufd_alloc/destroy_mmap v2 https://lore.kernel.org/all/cover.1745646960.git.nicolinc@nvidia.com/ * Add Reviewed-by from Jason * [smmu] Fix vsmmu initial value * [smmu] Support impl for hw_info * [tegra] Rename "slot" to "vsid" * [tegra] Update kdocs and commit 
logs * [tegra] Map/unmap LVCMDQ dynamically * [tegra] Refcount the previous LVCMDQ * [tegra] Return -EEXIST if LVCMDQ exists * [tegra] Simplify VINTF cleanup routine * [tegra] Use vmid and s2_domain in vsmmu * [tegra] Rename "mmap_pgoff" to "immap_id" * [tegra] Add more addr and length validation * [iommufd] Add more narrative to mmap's kdoc * [iommufd] Add iommufd_struct_depend/undepend() * [iommufd] Rename vcmdq_free op to vcmdq_destroy * [iommufd] Fix bug in iommu_copy_struct_to_user() * [iommufd] Drop is_io from iommufd_ctx_alloc_mmap() * [iommufd] Test the queue memory for its contiguity * [iommufd] Return -ENXIO if address or length fails * [iommufd] Do not change @min_last in mock_viommu_alloc() * [iommufd] Generalize TEGRA241_VCMDQ data in core structure * [iommufd] Add selftest coverage for IOMMUFD_CMD_VCMDQ_ALLOC * [iommufd] Add iopt_pin_pages() to prevent queue memory from unmapping v1 https://lore.kernel.org/all/cover.1744353300.git.nicolinc@nvidia.com/ Thanks Nicolin Nicolin Chen (29): iommufd: Apply obvious cosmetic fixes iommufd: Introduce iommufd_object_alloc_ucmd helper iommu: Apply the new iommufd_object_alloc_ucmd helper iommu: Add iommu_copy_struct_to_user helper iommu: Pass in a driver-level user data structure to viommu_alloc op iommufd/viommu: Allow driver-specific user data for a vIOMMU object iommufd/selftest: Support user_data in mock_viommu_alloc iommufd/selftest: Add coverage for viommu data iommufd: Do not unmap an owned iopt_area iommufd: Abstract iopt_pin_pages and iopt_unpin_pages helpers iommufd/driver: Let iommufd_viommu_alloc helper save ictx to viommu->ictx iommufd/viommu: Add driver-allocated vDEVICE support iommufd/viommu: Introduce IOMMUFD_OBJ_HW_QUEUE and its related struct iommufd/viommu: Add IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl iommufd/driver: Add iommufd_hw_queue_depend/undepend() helpers iommufd/selftest: Add coverage for IOMMUFD_CMD_HW_QUEUE_ALLOC iommufd: Add mmap interface iommufd/selftest: Add coverage for the new mmap 
interface Documentation: userspace-api: iommufd: Update HW QUEUE iommu: Allow an input type in hw_info op iommufd: Allow an input data_type via iommu_hw_info iommufd/selftest: Update hw_info coverage for an input data_type iommu/arm-smmu-v3-iommufd: Add vsmmu_alloc impl op iommu/arm-smmu-v3-iommufd: Add hw_info to impl_ops iommu/tegra241-cmdqv: Use request_threaded_irq iommu/tegra241-cmdqv: Simplify deinit flow in tegra241_cmdqv_remove_vintf() iommu/tegra241-cmdqv: Do not statically map LVCMDQs iommu/tegra241-cmdqv: Add user-space use support iommu/tegra241-cmdqv: Add IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV support drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 28 +- drivers/iommu/iommufd/io_pagetable.h | 15 +- drivers/iommu/iommufd/iommufd_private.h | 41 +- drivers/iommu/iommufd/iommufd_test.h | 20 + include/linux/iommu.h | 53 +- include/linux/iommufd.h | 221 +++++++- include/uapi/linux/iommufd.h | 150 +++++- tools/testing/selftests/iommu/iommufd_utils.h | 91 +++- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 33 +- .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 496 +++++++++++++++++- drivers/iommu/intel/iommu.c | 4 + drivers/iommu/iommufd/device.c | 137 +---- drivers/iommu/iommufd/driver.c | 97 ++++ drivers/iommu/iommufd/eventq.c | 14 +- drivers/iommu/iommufd/hw_pagetable.c | 6 +- drivers/iommu/iommufd/io_pagetable.c | 106 +++- drivers/iommu/iommufd/iova_bitmap.c | 1 - drivers/iommu/iommufd/main.c | 80 ++- drivers/iommu/iommufd/pages.c | 19 +- drivers/iommu/iommufd/selftest.c | 158 +++++- drivers/iommu/iommufd/viommu.c | 146 +++++- tools/testing/selftests/iommu/iommufd.c | 146 +++++- .../selftests/iommu/iommufd_fail_nth.c | 15 +- Documentation/userspace-api/iommufd.rst | 12 + 24 files changed, 1794 insertions(+), 295 deletions(-) -- 2.43.0

7 months, 1 week

3
78
0 0

[PATCH bpf-next] selftests/bpf: rbtree: Fix incorrect global variable usage

by Rong Tao

From: Rong Tao <rongtao(a)cestc.cn> Within __add_three() function, should use function parameters instead of global variables. So that the variables groot_nested.inner.root and groot_nested.inner.glock in rbtree_add_nodes_nested() are tested correctly. Signed-off-by: Rong Tao <rongtao(a)cestc.cn> --- tools/testing/selftests/bpf/progs/rbtree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/rbtree.c b/tools/testing/selftests/bpf/progs/rbtree.c index a3620c15c136..49fe93d7e059 100644 --- a/tools/testing/selftests/bpf/progs/rbtree.c +++ b/tools/testing/selftests/bpf/progs/rbtree.c @@ -61,19 +61,19 @@ static long __add_three(struct bpf_rb_root *root, struct bpf_spin_lock *lock) } m->key = 1; - bpf_spin_lock(&glock); - bpf_rbtree_add(&groot, &n->node, less); - bpf_rbtree_add(&groot, &m->node, less); - bpf_spin_unlock(&glock); + bpf_spin_lock(lock); + bpf_rbtree_add(root, &n->node, less); + bpf_rbtree_add(root, &m->node, less); + bpf_spin_unlock(lock); n = bpf_obj_new(typeof(*n)); if (!n) return 3; n->key = 3; - bpf_spin_lock(&glock); - bpf_rbtree_add(&groot, &n->node, less); - bpf_spin_unlock(&glock); + bpf_spin_lock(lock); + bpf_rbtree_add(root, &n->node, less); + bpf_spin_unlock(lock); return 0; } -- 2.49.0

7 months, 1 week

2
1
0 0

[PATCH v2 0/4] selftests/mm: cow and gup_longterm cleanups

by Mark Brown

The bulk of these changes modify the cow and gup_longterm tests to report unique and stable names for each test, bringing them into line with the expectations of tooling that works with kselftest. The string reported as a test result is used by tooling to both deduplicate tests and track tests between test runs; using the same string for multiple tests or changing the string depending on test result causes problems for user interfaces and automation such as bisection. It was suggested that converting to use kselftest_harness.h would be a good way of addressing this, however that really wants the set of tests to run to be known at compile time but both test programs dynamically enumerate the set of huge page sizes the system supports and test each. Refactoring to handle this would be even more invasive than these changes which are large but straightforward and repetitive. A version of the main gup_longterm cleanup was previously sent separately, this version factors out the helpers for logging the start of the test since the cow test looks very similar. Signed-off-by: Mark Brown <broonie(a)kernel.org> --- Changes in v2: - Typo fixes. - Link to v1: https://lore.kernel.org/r/20250522-selftests-mm-cow-dedupe-v1-0-713cee2fdd6… --- Mark Brown (4): selftests/mm: Use standard ksft_finished() in cow and gup_longterm selftests/mm: Add helper for logging test start and results selftests/mm: Report unique test names for each cow test selftests/mm: Fix test result reporting in gup_longterm tools/testing/selftests/mm/cow.c | 340 +++++++++++++++++++----------- tools/testing/selftests/mm/gup_longterm.c | 158 ++++++++------ tools/testing/selftests/mm/vm_util.h | 20 ++ 3 files changed, 334 insertions(+), 184 deletions(-) --- base-commit: a5806cd506af5a7c19bcd596e4708b5c464bfd21 change-id: 20250521-selftests-mm-cow-dedupe-33dcab034558 Best regards, -- Mark Brown <broonie(a)kernel.org>

7 months, 1 week

4
34
0 0

[PATCH bpf-next] selftests/bpf: Fix compile error of bin_attribute::read/write()

by Rong Tao

From: Rong Tao <rongtao(a)cestc.cn> Since commit 97d06802d10a ("sysfs: constify bin_attribute argument of bin_attribute::read/write()"), make bin_attribute parameter of bin_attribute::read/write() const. Signed-off-by: Rong Tao <rongtao(a)cestc.cn> --- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e6c248e3ae54..e9e918cdf31f 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -385,7 +385,7 @@ int bpf_testmod_fentry_ok; noinline ssize_t bpf_testmod_test_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { struct bpf_testmod_test_read_ctx ctx = { @@ -465,7 +465,7 @@ ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); noinline ssize_t bpf_testmod_test_write(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { struct bpf_testmod_test_write_ctx ctx = { @@ -567,7 +567,7 @@ static void testmod_unregister_uprobe(void) static ssize_t bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { unsigned long offset = 0; -- 2.49.0

7 months, 1 week

3
3
0 0

[PATCH net] selftests: drv-net: tso: make bkg() wait for socat to quit

by Jakub Kicinski

Commit 846742f7e32f ("selftests: drv-net: add a warning for bkg + shell + terminate") added a warning for bkg() used with terminate=True. The tso test was missed as we didn't have it running anywhere in NIPA. Add exit_wait=True, to avoid: # Warning: combining shell and terminate is risky! # SIGTERM may not reach the child on zsh/ksh! getting printed twice for every variant. Fixes: 0d0f4174f6c8 ("selftests: drv-net: add a simple TSO test") Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: shuah(a)kernel.org CC: willemb(a)google.com CC: linux-kselftest(a)vger.kernel.org --- tools/testing/selftests/drivers/net/hw/tso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/hw/tso.py b/tools/testing/selftests/drivers/net/hw/tso.py index e1ecb92f79d9..150d6db241a0 100755 --- a/tools/testing/selftests/drivers/net/hw/tso.py +++ b/tools/testing/selftests/drivers/net/hw/tso.py @@ -39,7 +39,7 @@ from lib.py import bkg, cmd, defer, ethtool, ip, rand_port, wait_port_listen port = rand_port() listen_cmd = f"socat -{ipver} -t 2 -u TCP-LISTEN:{port},reuseport /dev/null,ignoreeof" - with bkg(listen_cmd, host=cfg.remote) as nc: + with bkg(listen_cmd, host=cfg.remote, exit_wait=True) as nc: wait_port_listen(port, host=cfg.remote) if ipver == "4": -- 2.49.0

7 months, 1 week

3
2
0 0

[PATCH net v2] selftests: drv-net: add configs for the TSO test

by Jakub Kicinski

Add missing config options for the tso.py test, specifically to make sure the kernel is built with vxlan and gre tunnels. I noticed this while adding a TSO-capable device QEMU to the CI. Previously we only ran virtio tests and it doesn't report LSO stats on the QEMU we have. Fixes: 0d0f4174f6c8 ("selftests: drv-net: add a simple TSO test") Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- v2: - drop NET_IP_TUNNEL v1: https://lore.kernel.org/20250602231640.314556-1-kuba@kernel.org CC: shuah(a)kernel.org CC: willemb(a)google.com CC: linux-kselftest(a)vger.kernel.org --- tools/testing/selftests/drivers/net/hw/config | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/hw/config diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config new file mode 100644 index 000000000000..88ae719e6f8f --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/config @@ -0,0 +1,5 @@ +CONFIG_IPV6=y +CONFIG_IPV6_GRE=y +CONFIG_NET_IPGRE=y +CONFIG_NET_IPGRE_DEMUX=y +CONFIG_VXLAN=y -- 2.49.0

7 months, 1 week

3
3
0 0

[PATCH net] selftests: drv-net: tso: fix the GRE device name

by Jakub Kicinski

The device type for IPv4 GRE is "gre" not "ipgre", unlike for IPv6 which uses "ip6gre". Not sure how I missed this when writing the test, perhaps because all HW I have access to is on an IPv6-only network. Fixes: 0d0f4174f6c8 ("selftests: drv-net: add a simple TSO test") Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: shuah(a)kernel.org CC: willemb(a)google.com CC: linux-kselftest(a)vger.kernel.org --- tools/testing/selftests/drivers/net/hw/tso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/hw/tso.py b/tools/testing/selftests/drivers/net/hw/tso.py index 150d6db241a0..3370827409aa 100755 --- a/tools/testing/selftests/drivers/net/hw/tso.py +++ b/tools/testing/selftests/drivers/net/hw/tso.py @@ -216,7 +216,7 @@ from lib.py import bkg, cmd, defer, ethtool, ip, rand_port, wait_port_listen ("", "6", "tx-tcp6-segmentation", None), ("vxlan", "", "tx-udp_tnl-segmentation", ("vxlan", True, "id 100 dstport 4789 noudpcsum")), ("vxlan_csum", "", "tx-udp_tnl-csum-segmentation", ("vxlan", False, "id 100 dstport 4789 udpcsum")), - ("gre", "4", "tx-gre-segmentation", ("ipgre", False, "")), + ("gre", "4", "tx-gre-segmentation", ("gre", False, "")), ("gre", "6", "tx-gre-segmentation", ("ip6gre", False, "")), ) -- 2.49.0

7 months, 1 week

4
3
0 0

[PATCH bpf-next v1 0/2] bpf,ktls: Fix data corruption caused by using bpf_msg_pop_data() in ktls

by Jiayuan Chen

Cong reported an issue where running 'test_sockmap' in the current bpf-next tree results in an error [1]. The specific test case that triggered the error is a combined test involving ktls and bpf_msg_pop_data(). Root Cause: When sending plaintext data, we initially calculated the corresponding ciphertext length. However, if we later reduced the plaintext data length via socket policy, we failed to recalculate the ciphertext length. This results in transmitting buffers containing uninitialized data during ciphertext transmission. This causes uninitialized bytes to be appended after a complete "Application Data" packet, leading to errors on the receiving end when parsing TLS record. This issue has existed for a long time but was only exposed after the following test code was merged. commit 47eae080410b ("selftests/bpf: Add more tests for test_txmsg_push_pop in test_sockmap") Although we already had tests for pop data before this commit, the pop data length was insufficient (less than 5 bytes). This meant that the corrupted TLS records with data length <5 bytes were cached without being parsed, resulting in no error being triggered. After this fix, all tests pass. 
1/ 6 sockmap::txmsg test passthrough:OK 2/ 6 sockmap::txmsg test redirect:OK 3/ 2 sockmap::txmsg test redirect wait send mem:OK 4/ 6 sockmap::txmsg test drop:OK 5/ 6 sockmap::txmsg test ingress redirect:OK 6/ 7 sockmap::txmsg test skb:OK 7/12 sockmap::txmsg test apply:OK 8/12 sockmap::txmsg test cork:OK 9/ 3 sockmap::txmsg test hanging corks:OK 10/11 sockmap::txmsg test push_data:OK 11/17 sockmap::txmsg test pull-data:OK 12/ 9 sockmap::txmsg test pop-data:OK 13/ 6 sockmap::txmsg test push/pop data:OK 14/ 1 sockmap::txmsg test ingress parser:OK 15/ 1 sockmap::txmsg test ingress parser2:OK 16/ 6 sockhash::txmsg test passthrough:OK 17/ 6 sockhash::txmsg test redirect:OK 18/ 2 sockhash::txmsg test redirect wait send mem:OK 19/ 6 sockhash::txmsg test drop:OK 20/ 6 sockhash::txmsg test ingress redirect:OK 21/ 7 sockhash::txmsg test skb:OK 22/12 sockhash::txmsg test apply:OK 23/12 sockhash::txmsg test cork:OK 24/ 3 sockhash::txmsg test hanging corks:OK 25/11 sockhash::txmsg test push_data:OK 26/17 sockhash::txmsg test pull-data:OK 27/ 9 sockhash::txmsg test pop-data:OK 28/ 6 sockhash::txmsg test push/pop data:OK 29/ 1 sockhash::txmsg test ingress parser:OK 30/ 1 sockhash::txmsg test ingress parser2:OK 31/ 6 sockhash:ktls:txmsg test passthrough:OK 32/ 6 sockhash:ktls:txmsg test redirect:OK 33/ 2 sockhash:ktls:txmsg test redirect wait send mem:OK 34/ 6 sockhash:ktls:txmsg test drop:OK 35/ 6 sockhash:ktls:txmsg test ingress redirect:OK 36/ 7 sockhash:ktls:txmsg test skb:OK 37/12 sockhash:ktls:txmsg test apply:OK 38/12 sockhash:ktls:txmsg test cork:OK 39/ 3 sockhash:ktls:txmsg test hanging corks:OK 40/11 sockhash:ktls:txmsg test push_data:OK 41/17 sockhash:ktls:txmsg test pull-data:OK 42/ 9 sockhash:ktls:txmsg test pop-data:OK 43/ 6 sockhash:ktls:txmsg test push/pop data:OK 44/ 1 sockhash:ktls:txmsg test ingress parser:OK 45/ 0 sockhash:ktls:txmsg test ingress parser2:OK Pass: 45 Fail: 0 [1]: https://lore.kernel.org/bpf/CAM_iQpU7=4xjbefZoxndKoX9gFFMOe7FcWMq5tHBsymbrn… Jiayuan 
Chen (2): bpf,ktls: Fix data corruption when using bpf_msg_pop_data() in ktls selftests/bpf: Add test to cover ktls with bpf_msg_pop_data net/tls/tls_sw.c | 15 +++ .../selftests/bpf/prog_tests/sockmap_ktls.c | 91 +++++++++++++++++++ .../selftests/bpf/progs/test_sockmap_ktls.c | 4 + 3 files changed, 110 insertions(+) -- 2.47.1

7 months, 1 week

3
7
0 0

[PATCH v2 0/3] KVM: arm64: selftests: arch_timer_edge_cases fixes

by Sebastian Ott

Some small fixes for arch_timer_edge_cases that I stumbled upon while debugging failures for this selftest on ampere-one. Changes since v1: modified patch 3 based on suggestions from Marc. I've done some tests with this on various machines - seems to be all good; however, on ampere-one I now hit this in 10% of the runs: ==== Test Assertion Failure ==== arm64/arch_timer_edge_cases.c:481: timer_get_cntct(timer) >= DEF_CNT + (timer_get_cntfrq() * (uint64_t)(delta_2_ms) / 1000) pid=166657 tid=166657 errno=4 - Interrupted system call 1 0x0000000000404db3: test_run at arch_timer_edge_cases.c:933 2 0x0000000000401f9f: main at arch_timer_edge_cases.c:1062 3 0x0000ffffaedd625b: ?? ??:0 4 0x0000ffffaedd633b: ?? ??:0 5 0x00000000004020af: _start at ??:? timer_get_cntct(timer) >= DEF_CNT + msec_to_cycles(delta_2_ms) This is not new, it was just hidden behind the other failure. I'll try to figure out what this is about (seems to be independent of the wait time). Sebastian Ott (3): KVM: arm64: selftests: fix help text for arch_timer_edge_cases KVM: arm64: selftests: fix thread migration in arch_timer_edge_cases KVM: arm64: selftests: arch_timer_edge_cases - determine effective counter width .../kvm/arm64/arch_timer_edge_cases.c | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) base-commit: 0ff41df1cb268fc69e703a08a57ee14ae967d0ca -- 2.49.0

7 months, 1 week

3
7
0 0

[RFC] selftests/mm: Skip tests dependent on a binary not built

by Khaled Elnaggar

Hello. Running the mm selftests from the kernel's root directory on an x86_64 debian machine using: make defconfig sudo make kselftest TARGETS=mm the tests run normally till we reach one which stalls for 180 seconds and times out according to the following logs: ``` ----------------------------------------------- running ./charge_reserved_hugetlb.sh -cgroup-v2 ----------------------------------------------- CLEANUP DONE CLEANUP DONE Test normal case. private=, populate=, method=0, reserve= nr hugepages = 10 writing cgroup limit: 20971520 writing reseravation limit: 20971520 Starting: hugetlb_usage=0 reserved_usage=0 expect_failure is 0 Putting task in cgroup 'hugetlb_cgroup_test' Method is 0 >>> write_hugetlb_memory.sh: line 22: ./write_to_hugetlbfs: No such file or directory <<< Waiting for hugetlb memory reservation to reach size 10485760. 0 Waiting for hugetlb memory reservation to reach size 10485760. 0 ... Waiting for hugetlb memory reservation to reach size 10485760. 0 Waiting for hugetlb memory reservation to reach size 10485760. 0 not ok 1 selftests: mm: run_vmtests.sh # TIMEOUT 180 seconds make[3]: Leaving directory '/linux/tools/testing/selftests/mm' ``` Logs show that the executable "write_to_hugetlbfs" is missing, causing the test to hang waiting for hugepage reservations. The executable not found means it was not built by the Make system. It is mentioned in Makefile:136-142, and only built if ARCH is 64-bit ``` ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch ifneq ($(ARCH),riscv64) TEST_GEN_FILES += virtual_address_range endif TEST_GEN_FILES += write_to_hugetlbfs endif ``` So, for some reason, the top-level Makefile provides ARCH as x86. 
My proposed solution is similar to the existing virtual_address_range check, that is, to check for the binary and, if it is not found, skip these 2 test cases: charge_reserved_hugetlb.sh and hugetlb_reparenting_test.sh, since they directly and indirectly depend on the write_to_hugetlbfs binary. This is just a workaround; the root issue of different ARCH detection when running tests from the kernel root directory should still be addressed. I am not sure how to approach it and am open to your suggestions. Note that this issue does not happen when run from selftests/mm using something like sudo make -C tools/testing/selftests/mm because then mm/Makefile's ARCH detection runs correctly (x86_64). Kindly review and share your thoughts. Signed-off-by: Khaled Elnaggar <khaledelnaggarlinux(a)gmail.com> --- tools/testing/selftests/mm/run_vmtests.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index dddd1dd8af14..cdbcfdb62f8a 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -375,8 +375,13 @@ CATEGORY="process_mrelease" run_test ./mrelease_test CATEGORY="mremap" run_test ./mremap_test CATEGORY="hugetlb" run_test ./thuge-gen + +# the following depend on write_to_hugetlbfs binary +if [ -x ./write_to_hugetlbfs ]; then CATEGORY="hugetlb" run_test ./charge_reserved_hugetlb.sh -cgroup-v2 CATEGORY="hugetlb" run_test ./hugetlb_reparenting_test.sh -cgroup-v2 +fi + if $RUN_DESTRUCTIVE; then nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) enable_soft_offline=$(cat /proc/sys/vm/enable_soft_offline) -- 2.47.2

7 months, 1 week

3
5
0 0

[PATCH 00/17] ARM64 PMU Partitioning

by Colton Lewis

Overview: This series implements a new PMU scheme on ARM, a partitioned PMU that exists alongside the existing emulated PMU and may be enabled by the kernel command line kvm.reserved_host_counters or by the vcpu ioctl KVM_ARM_PARTITION_PMU. This is a continuation of the RFC posted earlier this year. [1] The high level overview and reason for the name is that this implementation takes advantage of recent CPU features to partition the PMU counters into a host-reserved set and a guest-reserved set. Guests are allowed untrapped hardware access to the most frequently used PMU registers and features for the guest-reserved counters only. This untrapped hardware access significantly reduces the overhead of using performance monitoring capabilities such as the `perf` tool inside a guest VM. Register accesses that aren't trapping to KVM mean less time spent in the host kernel and more time on the workloads guests care about. This optimization especially shines during high `perf` sample rates or large numbers of events that require multiplexing hardware counters. Performance: For example, the following tests were carried out on identical ARM machines with 10 general purpose counters with identical guest images run on QEMU, the only difference being my PMU implementation or the existing one. Some arguments have been simplified here to clarify the purpose of the test: 1) time perf record -e ${FIFTEEN_HW_EVENTS} -F 1000 -- \ gzip -c tmpfs/random.64M.img >/dev/null On emulated PMU this command took 4.143s real time with 0.159s system time. On partitioned PMU this command took 3.139s real time with 0.110s system time, runtime reductions of 24.23% and 30.82%. 2) time perf stat -dd -- \ automated_specint2017.sh On emulated PMU this benchmark completed in 3789.16s real time with 224.45s system time and a final benchmark score of 4.28. On partitioned PMU this benchmark completed in 3525.67s real time with 15.98s system time and a final benchmark score of 4.56. 
That is a 6.95% reduction in runtime, 92.88% reduction in system time, and 6.54% improvement in overall benchmark score. Seeing these improvements on something as lightweight as perf stat is remarkable and implies there would have been a much greater improvement with perf record. I did not test that because I was not confident it would even finish in a reasonable time on the emulated PMU. Test 3 was slightly different: I ran the workload in a VM with a single VCPU pinned to a physical CPU and analyzed from the host where the physical CPU spent its time using mpstat. 3) perf record -e ${FIFTEEN_HW_EVENTS} -F 4000 -- \ stress-ng --cpu 0 --timeout 30 Over a period of 30s the cpu running with the emulated PMU spent 34.96% of the time in the host kernel and 55.85% of the time in the guest. The cpu running the partitioned PMU spent 0.97% of its time in the host kernel and 91.06% of its time in the guest. Taken together, these tests represent a remarkable performance improvement for anything perf related using this new PMU implementation. Caveats: Because the most consistent and performant thing to do was untrap PMCR_EL0, the number of counters visible to the guest via PMCR_EL0.N is always equal to the value KVM sets for MDCR_EL2.HPMN. Previously allowed writes to PMCR_EL0.N via {GET,SET}_ONE_REG no longer affect the guest. These improvements come at a cost of 7-35 new registers that must be swapped at every vcpu_load and vcpu_put if the feature is enabled. I have been informed KVM would like to avoid paying this cost when possible. One solution is to make the trapping changes and context swapping lazy such that the trapping changes and context swapping only take place after the guest has actually accessed the PMU so guests that never access the PMU never pay the cost. This is not done here because it is not crucial to the primary functionality and I thought review would be more productive as soon as I had something complete enough for reviewers to easily play with. 
However, this or any better ideas are on the table for inclusion in future re-rolls. [1] https://lore.kernel.org/kvmarm/20250213180317.3205285-1-coltonlewis@google.… Colton Lewis (16): arm64: cpufeature: Add cpucap for HPMN0 arm64: Generate sign macro for sysreg Enums arm64: cpufeature: Add cpucap for PMICNTR KVM: arm64: Reorganize PMU functions KVM: arm64: Introduce method to partition the PMU perf: arm_pmuv3: Generalize counter bitmasks perf: arm_pmuv3: Keep out of guest counter partition KVM: arm64: Set up FGT for Partitioned PMU KVM: arm64: Writethrough trapped PMEVTYPER register KVM: arm64: Use physical PMSELR for PMXEVTYPER if partitioned KVM: arm64: Writethrough trapped PMOVS register KVM: arm64: Context switch Partitioned PMU guest registers perf: pmuv3: Handle IRQs for Partitioned PMU guest counters KVM: arm64: Inject recorded guest interrupts KVM: arm64: Add ioctl to partition the PMU when supported KVM: arm64: selftests: Add test case for partitioned PMU Marc Zyngier (1): KVM: arm64: Cleanup PMU includes Documentation/virt/kvm/api.rst | 16 + arch/arm/include/asm/arm_pmuv3.h | 24 + arch/arm64/include/asm/arm_pmuv3.h | 36 +- arch/arm64/include/asm/kvm_host.h | 208 +++++- arch/arm64/include/asm/kvm_pmu.h | 82 +++ arch/arm64/kernel/cpufeature.c | 15 + arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/arm.c | 24 +- arch/arm64/kvm/debug.c | 13 +- arch/arm64/kvm/hyp/include/hyp/switch.h | 65 +- arch/arm64/kvm/pmu-emul.c | 629 +---------------- arch/arm64/kvm/pmu-part.c | 358 ++++++++++ arch/arm64/kvm/pmu.c | 630 ++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 54 +- arch/arm64/tools/cpucaps | 2 + arch/arm64/tools/gen-sysreg.awk | 1 + arch/arm64/tools/sysreg | 6 +- drivers/perf/arm_pmuv3.c | 55 +- include/kvm/arm_pmu.h | 199 ------ include/linux/perf/arm_pmu.h | 15 +- include/linux/perf/arm_pmuv3.h | 14 +- include/uapi/linux/kvm.h | 4 + tools/include/uapi/linux/kvm.h | 2 + .../selftests/kvm/arm64/vpmu_counter_access.c | 40 +- virt/kvm/kvm_main.c | 1 + 25 files 
changed, 1616 insertions(+), 879 deletions(-) create mode 100644 arch/arm64/include/asm/kvm_pmu.h create mode 100644 arch/arm64/kvm/pmu-part.c delete mode 100644 include/kvm/arm_pmu.h base-commit: 1b85d923ba8c9e6afaf19e26708411adde94fba8 -- 2.49.0.1204.g71687c7c1d-goog

7 months, 1 week

3
33
0 0

[PATCH] selftests: cachestat: add tests for mmap and /proc/cpuinfo

by Suresh K C

From: Suresh K C <suresh.k.chandrappa(a)gmail.com> Add a test case to verify cachestat behavior with memory-mapped files using mmap(). This ensures that pages accessed via mmap are correctly accounted for in the page cache. Also add a test for /proc/cpuinfo to validate cachestat's handling of virtual files in pseudo-filesystems. This improves test coverage for edge cases involving non-regular files. Tested on x86_64 with default kernel config. Signed-off-by: Suresh K C <suresh.k.chandrappa(a)gmail.com> --- .../selftests/cachestat/test_cachestat.c | 69 ++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index 632ab44737ec..81e7f6dd2279 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -22,7 +22,7 @@ static const char * const dev_files[] = { "/dev/zero", "/dev/null", "/dev/urandom", - "/proc/version", "/proc" + "/proc/version","/proc/cpuinfo","/proc" }; void print_cachestat(struct cachestat *cs) @@ -202,6 +202,65 @@ static int test_cachestat(const char *filename, bool write_random, bool create, return ret; } +bool test_cachestat_mmap(void){ + + size_t PS = sysconf(_SC_PAGESIZE); + size_t filesize = PS * 512 * 2;; + int syscall_ret; + size_t compute_len = PS * 512; + struct cachestat_range cs_range = { PS, compute_len }; + char *filename = "tmpshmcstat"; + unsigned long num_pages = compute_len / PS; + struct cachestat cs; + bool ret = true; + int fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd < 0) { + ksft_print_msg("Unable to create mmap file.\n"); + ret = false; + goto out; + } + if (ftruncate(fd, filesize)) { + ksft_print_msg("Unable to truncate mmap file.\n"); + ret = false; + goto close_fd; + } + if (!write_exactly(fd, filesize)) { + ksft_print_msg("Unable to write to mmap file.\n"); + ret = false; + goto close_fd; + } + char *map = 
mmap(NULL, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (map == MAP_FAILED) { + ksft_print_msg("mmap failed.\n"); + ret = false; + goto close_fd; + } + + for (int i = 0; i < filesize; i++) { + map[i] = 'A'; + } + map[filesize - 1] = 'X'; + + syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); + + if (syscall_ret) { + ksft_print_msg("Cachestat returned non-zero.\n"); + ret = false; + } else { + print_cachestat(&cs); + if (cs.nr_cache + cs.nr_evicted != num_pages) { + ksft_print_msg("Total number of cached and evicted pages is off.\n"); + ret = false; + } + } + +close_fd: + close(fd); + unlink(filename); +out: + return ret; +} + bool test_cachestat_shmem(void) { size_t PS = sysconf(_SC_PAGESIZE); @@ -274,7 +333,7 @@ int main(void) ret = 1; } - for (int i = 0; i < 5; i++) { + for (int i = 0; i < 6; i++) { const char *dev_filename = dev_files[i]; if (test_cachestat(dev_filename, false, false, false, @@ -315,5 +374,11 @@ int main(void) ret = 1; } + if (test_cachestat_mmap()) + ksft_test_result_pass("cachestat works with a mmap file\n"); + else { + ksft_test_result_fail("cachestat fails with a mmap file\n"); + ret = 1; + } return ret; } -- 2.43.0

7 months, 1 week

2
1
0 0

[PATCH v2] riscv: sbi: Add SBI Debug Triggers Extension tests

by Jesse Taube

Add tests for the DBTR SBI extension. Signed-off-by: Jesse Taube <jesse(a)rivosinc.com> --- V1 -> V2: - Call report_prefix_pop before returning - Disable compressed instructions in exec_call, update related comment - Remove extra "| 1" in dbtr_test_load - Remove extra newlines - Remove extra tabs in check_exec - Remove typedefs from enums - Return when dbtr_install_trigger fails - s/avalible/available/g - s/unistall/uninstall/g --- lib/riscv/asm/sbi.h | 28 ++ lib/riscv/sbi.c | 58 ++++ riscv/Makefile | 1 + riscv/sbi-dbtr.c | 751 ++++++++++++++++++++++++++++++++++++++++++++ riscv/sbi-tests.h | 1 + riscv/sbi.c | 1 + 6 files changed, 840 insertions(+) create mode 100644 riscv/sbi-dbtr.c diff --git a/lib/riscv/asm/sbi.h b/lib/riscv/asm/sbi.h index a5738a5c..ce19ab89 100644 --- a/lib/riscv/asm/sbi.h +++ b/lib/riscv/asm/sbi.h @@ -51,6 +51,7 @@ enum sbi_ext_id { SBI_EXT_SUSP = 0x53555350, SBI_EXT_FWFT = 0x46574654, SBI_EXT_SSE = 0x535345, + SBI_EXT_DBTR = 0x44425452, }; enum sbi_ext_base_fid { @@ -125,6 +126,17 @@ enum sbi_ext_fwft_fid { #define SBI_FWFT_SET_FLAG_LOCK BIT(0) +enum sbi_ext_dbtr_fid { + SBI_EXT_DBTR_NUM_TRIGGERS = 0, + SBI_EXT_DBTR_SETUP_SHMEM, + SBI_EXT_DBTR_TRIGGER_READ, + SBI_EXT_DBTR_TRIGGER_INSTALL, + SBI_EXT_DBTR_TRIGGER_UPDATE, + SBI_EXT_DBTR_TRIGGER_UNINSTALL, + SBI_EXT_DBTR_TRIGGER_ENABLE, + SBI_EXT_DBTR_TRIGGER_DISABLE, +}; + enum sbi_ext_sse_fid { SBI_EXT_SSE_READ_ATTRS = 0, SBI_EXT_SSE_WRITE_ATTRS, @@ -282,6 +294,22 @@ static inline bool sbi_sse_event_is_global(uint32_t event_id) return !!(event_id & SBI_SSE_EVENT_GLOBAL_BIT); } +struct sbiret sbi_debug_num_triggers(unsigned long trig_tdata1); +struct sbiret sbi_debug_set_shmem(void *shmem); +struct sbiret sbi_debug_set_shmem_raw(unsigned long shmem_phys_lo, + unsigned long shmem_phys_hi, + unsigned long flags); +struct sbiret sbi_debug_read_triggers(unsigned long trig_idx_base, + unsigned long trig_count); +struct sbiret sbi_debug_install_triggers(unsigned long trig_count); +struct sbiret 
sbi_debug_update_triggers(unsigned long trig_count); +struct sbiret sbi_debug_uninstall_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask); +struct sbiret sbi_debug_enable_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask); +struct sbiret sbi_debug_disable_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask); + struct sbiret sbi_sse_read_attrs_raw(unsigned long event_id, unsigned long base_attr_id, unsigned long attr_count, unsigned long phys_lo, unsigned long phys_hi); diff --git a/lib/riscv/sbi.c b/lib/riscv/sbi.c index 2959378f..39c0d3bd 100644 --- a/lib/riscv/sbi.c +++ b/lib/riscv/sbi.c @@ -32,6 +32,64 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, return ret; } +struct sbiret sbi_debug_num_triggers(unsigned long trig_tdata1) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_NUM_TRIGGERS, trig_tdata1, 0, 0, 0, 0, 0); +} + +struct sbiret sbi_debug_set_shmem(void *shmem) +{ + phys_addr_t p = virt_to_phys(shmem); + + return sbi_debug_set_shmem_raw(lower_32_bits(p), upper_32_bits(p), 0); +} + +struct sbiret sbi_debug_set_shmem_raw(unsigned long shmem_phys_lo, + unsigned long shmem_phys_hi, + unsigned long flags) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_SETUP_SHMEM, shmem_phys_lo, + shmem_phys_hi, flags, 0, 0, 0); +} + +struct sbiret sbi_debug_read_triggers(unsigned long trig_idx_base, + unsigned long trig_count) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_READ, trig_idx_base, + trig_count, 0, 0, 0, 0); +} + +struct sbiret sbi_debug_install_triggers(unsigned long trig_count) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_INSTALL, trig_count, 0, 0, 0, 0, 0); +} + +struct sbiret sbi_debug_update_triggers(unsigned long trig_count) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_UPDATE, trig_count, 0, 0, 0, 0, 0); +} + +struct sbiret sbi_debug_uninstall_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask) +{ + return sbi_ecall(SBI_EXT_DBTR, 
SBI_EXT_DBTR_TRIGGER_UNINSTALL, trig_idx_base, + trig_idx_mask, 0, 0, 0, 0); +} + +struct sbiret sbi_debug_enable_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_ENABLE, trig_idx_base, + trig_idx_mask, 0, 0, 0, 0); +} + +struct sbiret sbi_debug_disable_triggers(unsigned long trig_idx_base, + unsigned long trig_idx_mask) +{ + return sbi_ecall(SBI_EXT_DBTR, SBI_EXT_DBTR_TRIGGER_DISABLE, trig_idx_base, + trig_idx_mask, 0, 0, 0, 0); +} + struct sbiret sbi_sse_read_attrs_raw(unsigned long event_id, unsigned long base_attr_id, unsigned long attr_count, unsigned long phys_lo, unsigned long phys_hi) diff --git a/riscv/Makefile b/riscv/Makefile index 11e68eae..55c7ac93 100644 --- a/riscv/Makefile +++ b/riscv/Makefile @@ -20,6 +20,7 @@ all: $(tests) $(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-asm.o $(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-fwft.o $(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-sse.o +$(TEST_DIR)/sbi-deps += $(TEST_DIR)/sbi-dbtr.o all_deps += $($(TEST_DIR)/sbi-deps) diff --git a/riscv/sbi-dbtr.c b/riscv/sbi-dbtr.c new file mode 100644 index 00000000..fe323f0f --- /dev/null +++ b/riscv/sbi-dbtr.c @@ -0,0 +1,751 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SBI DBTR testsuite + * + * Copyright (C) 2025, Rivos Inc., Jesse Taube <jesse(a)rivosinc.com> + */ + +#include <asm/io.h> + +#include "sbi-tests.h" + +#define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 
2 : 4) + +#if __riscv_xlen == 64 +#define SBI_DBTR_SHMEM_INVALID_ADDR 0xFFFFFFFFFFFFFFFFUL +#elif __riscv_xlen == 32 +#define SBI_DBTR_SHMEM_INVALID_ADDR 0xFFFFFFFFUL +#else +#error "Unexpected __riscv_xlen" +#endif + +#define RV_MAX_TRIGGERS 32 + +#define SBI_DBTR_TRIG_STATE_MAPPED BIT(0) +#define SBI_DBTR_TRIG_STATE_U BIT(1) +#define SBI_DBTR_TRIG_STATE_S BIT(2) +#define SBI_DBTR_TRIG_STATE_VU BIT(3) +#define SBI_DBTR_TRIG_STATE_VS BIT(4) +#define SBI_DBTR_TRIG_STATE_HAVE_HW_TRIG BIT(5) + +#define SBI_DBTR_TRIG_STATE_HW_TRIG_IDX_SHIFT 8 +#define SBI_DBTR_TRIG_STATE_HW_TRIG_IDX(trig_state) (trig_state >> SBI_DBTR_TRIG_STATE_HW_TRIG_IDX_SHIFT) + +#define SBI_DBTR_TDATA1_TYPE_SHIFT (__riscv_xlen - 4) + +#define SBI_DBTR_TDATA1_MCONTROL6_LOAD_BIT BIT(0) +#define SBI_DBTR_TDATA1_MCONTROL6_STORE_BIT BIT(1) +#define SBI_DBTR_TDATA1_MCONTROL6_EXECUTE_BIT BIT(2) +#define SBI_DBTR_TDATA1_MCONTROL6_U_BIT BIT(3) +#define SBI_DBTR_TDATA1_MCONTROL6_S_BIT BIT(4) +#define SBI_DBTR_TDATA1_MCONTROL6_SELECT_BIT BIT(21) +#define SBI_DBTR_TDATA1_MCONTROL6_VS_BIT BIT(23) +#define SBI_DBTR_TDATA1_MCONTROL6_VU_BIT BIT(24) + +#define SBI_DBTR_TDATA1_MCONTROL_LOAD_BIT BIT(0) +#define SBI_DBTR_TDATA1_MCONTROL_STORE_BIT BIT(1) +#define SBI_DBTR_TDATA1_MCONTROL_EXECUTE_BIT BIT(2) +#define SBI_DBTR_TDATA1_MCONTROL_U_BIT BIT(3) +#define SBI_DBTR_TDATA1_MCONTROL_S_BIT BIT(4) +#define SBI_DBTR_TDATA1_MCONTROL_SELECT_BIT BIT(19) + +enum McontrolType { + SBI_DBTR_TDATA1_TYPE_NONE = (0UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_LEGACY = (1UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_MCONTROL = (2UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_ICOUNT = (3UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_ITRIGGER = (4UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_ETRIGGER = (5UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_MCONTROL6 = (6UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_TMEXTTRIGGER = (7UL << 
SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED0 = (8UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED1 = (9UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED2 = (10UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_RESERVED3 = (11UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_CUSTOM0 = (12UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_CUSTOM1 = (13UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_CUSTOM2 = (14UL << SBI_DBTR_TDATA1_TYPE_SHIFT), + SBI_DBTR_TDATA1_TYPE_DISABLED = (15UL << SBI_DBTR_TDATA1_TYPE_SHIFT), +}; + +enum Tdata1Value { + VALUE_NONE = 0, + VALUE_LOAD = BIT(0), + VALUE_STORE = BIT(1), + VALUE_EXECUTE = BIT(2), +}; + +enum Tdata1Mode { + MODE_NONE = 0, + MODE_M = BIT(0), + MODE_U = BIT(1), + MODE_S = BIT(2), + MODE_VU = BIT(3), + MODE_VS = BIT(4), +}; + +struct sbi_dbtr_data_msg { + unsigned long tstate; + unsigned long tdata1; + unsigned long tdata2; + unsigned long tdata3; +}; + +struct sbi_dbtr_id_msg { + unsigned long idx; +}; + +/* SBI shared mem messages layout */ +struct sbi_dbtr_shmem_entry { + union { + struct sbi_dbtr_data_msg data; + struct sbi_dbtr_id_msg id; + }; +}; + +static bool dbtr_handled; + +// Expected to be leaf function as not to disrupt frame-pointer +static __attribute__((naked)) void exec_call(void) +{ + // skip over nop when triggered instead of ret. 
+ asm volatile (".option push\n" + ".option arch, -c\n" + "nop\n" + "ret\n" + ".option pop\n"); +} + +static void dbtr_exception_handler(struct pt_regs *regs) +{ + dbtr_handled = true; + + /* Reading *epc may cause a fault, skip over nop */ + if ((void *)regs->epc == exec_call) { + regs->epc += 4; + return; + } + + /* WARNING: Skips over the trapped intruction */ + regs->epc += INSN_LEN(readw((void *)regs->epc)); +} + +static bool do_save(void *tdata2) +{ + bool ret; + + writel(0, tdata2); + + ret = dbtr_handled; + dbtr_handled = false; + + return ret; +} + +static bool do_load(void *tdata2) +{ + bool ret; + + readl(tdata2); + + ret = dbtr_handled; + dbtr_handled = false; + + return ret; +} + +static bool do_exec(void) +{ + bool ret; + + exec_call(); + + ret = dbtr_handled; + dbtr_handled = false; + + return ret; +} + +static unsigned long gen_tdata1_mcontrol(enum Tdata1Mode mode, enum Tdata1Value value) +{ + unsigned long tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL; + + if (value & VALUE_LOAD) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_LOAD_BIT; + + if (value & VALUE_STORE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_STORE_BIT; + + if (value & VALUE_EXECUTE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_EXECUTE_BIT; + + if (mode & MODE_M) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_U_BIT; + + if (mode & MODE_U) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_U_BIT; + + if (mode & MODE_S) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL_S_BIT; + + return tdata1; +} + +static unsigned long gen_tdata1_mcontrol6(enum Tdata1Mode mode, enum Tdata1Value value) +{ + unsigned long tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL6; + + if (value & VALUE_LOAD) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_LOAD_BIT; + + if (value & VALUE_STORE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_STORE_BIT; + + if (value & VALUE_EXECUTE) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_EXECUTE_BIT; + + if (mode & MODE_M) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_U_BIT; + + if (mode & MODE_U) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_U_BIT; + + if (mode & MODE_S) + tdata1 |= 
SBI_DBTR_TDATA1_MCONTROL6_S_BIT; + + if (mode & MODE_VU) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_VU_BIT; + + if (mode & MODE_VS) + tdata1 |= SBI_DBTR_TDATA1_MCONTROL6_VS_BIT; + + return tdata1; +} + +static unsigned long gen_tdata1(enum McontrolType type, enum Tdata1Value value, enum Tdata1Mode mode) +{ + switch (type) { + case SBI_DBTR_TDATA1_TYPE_MCONTROL: + return gen_tdata1_mcontrol(mode, value); + case SBI_DBTR_TDATA1_TYPE_MCONTROL6: + return gen_tdata1_mcontrol6(mode, value); + default: + return 0; + } +} + +static bool dbtr_install_trigger(struct sbi_dbtr_shmem_entry *shmem, void *tdata2, + unsigned long tdata1) +{ + struct sbiret sbi_ret; + bool ret; + + shmem->data.tdata1 = tdata1; + shmem->data.tdata2 = (unsigned long)tdata2; + + sbi_ret = sbi_debug_install_triggers(1); + ret = sbiret_report_error(&sbi_ret, SBI_SUCCESS, "sbi_debug_install_triggers"); + if (ret) + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + return ret; +} + +static bool dbtr_uninstall_trigger(void) +{ + struct sbiret ret; + + install_exception_handler(EXC_BREAKPOINT, NULL); + + ret = sbi_debug_uninstall_triggers(0, 1); + return sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); +} + +static unsigned long dbtr_test_num_triggers(void) +{ + struct sbiret ret; + //sbi_debug_num_triggers will return trig_max in sbiret.value when trig_tdata1 == 0 + unsigned long tdata1 = 0; + + // should be atleast one trigger. 
+ ret = sbi_debug_num_triggers(tdata1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_num_triggers"); + + if (ret.value == 0) + report_fail("sbi_debug_num_triggers: Returned 0 triggers available"); + else + report_pass("sbi_debug_num_triggers: Returned %lu triggers available", ret.value); + + return ret.value; +} + +static enum McontrolType dbtr_test_type(unsigned long *num_trig) +{ + struct sbiret ret; + + //sbi_debug_num_triggers will return trig_max in sbiret.value when trig_tdata1 == 0 + unsigned long tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL6; + + ret = sbi_debug_num_triggers(tdata1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_num_triggers"); + if (ret.value > 0) { + report_pass("sbi_debug_num_triggers: Returned %lu mcontrol6 triggers available", + ret.value); + *num_trig = ret.value; + return tdata1; + } + + tdata1 = SBI_DBTR_TDATA1_TYPE_MCONTROL; + + ret = sbi_debug_num_triggers(tdata1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_num_triggers"); + *num_trig = ret.value; + if (ret.value > 0) { + report_pass("sbi_debug_num_triggers: Returned %lu mcontrol triggers available", + ret.value); + return tdata1; + } + + report_fail("sbi_debug_num_triggers: Returned 0 mcontrol(6) triggers available"); + + return SBI_DBTR_TDATA1_TYPE_NONE; +} + +static struct sbiret dbtr_test_save_install_uninstall(struct sbi_dbtr_shmem_entry *shmem, + enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("save_trigger"); + + shmem->data.tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S | MODE_S); + shmem->data.tdata2 = (unsigned long)&test; + + ret = sbi_debug_install_triggers(1); + if (!sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_install_triggers")) { + report_prefix_pop(); + return ret; + } + + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + report(do_save(&test), "triggered"); + + if (do_load(&test)) + report_fail("triggered by load"); + + ret = sbi_debug_uninstall_triggers(0, 1); + 
sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + + if (do_save(&test)) + report_fail("triggered after uninstall"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + report_prefix_pop(); + + return ret; +} + +static void dbtr_test_update(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("update_trigger"); + + if (!dbtr_install_trigger(shmem, NULL, gen_tdata1(type, VALUE_NONE, MODE_NONE))) { + report_prefix_pop(); + return; + } + + shmem->id.idx = 0; + shmem->data.tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S); + shmem->data.tdata2 = (unsigned long)&test; + + ret = sbi_debug_update_triggers(1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_update_triggers"); + + report(do_save(&test), "triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_load(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + + report_prefix_push("load_trigger"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_LOAD, MODE_S))) { + report_prefix_pop(); + return; + } + + report(do_load(&test), "triggered"); + + if (do_save(&test)) + report_fail("triggered by save"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_disable_enable(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("sbi_debug_disable_triggers"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_disable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_disable_triggers"); + + if (do_save(&test)) { + report_fail("should not trigger"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); + report_skip("sbi_debug_enable_triggers: no disable"); + + return; + } + + report_pass("should not trigger"); + + 
report_prefix_pop(); + report_prefix_push("sbi_debug_enable_triggers"); + + ret = sbi_debug_enable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_enable_triggers"); + + report(do_save(&test), "triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_exec(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + + report_prefix_push("exec_trigger"); + /* check if loads and saves trigger exec */ + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) { + report_prefix_pop(); + return; + } + + if (do_load(&test)) + report_fail("triggered by load"); + + if (do_save(&test)) + report_fail("triggered by save"); + + dbtr_uninstall_trigger(); + + /* Check if exec works */ + if (!dbtr_install_trigger(shmem, exec_call, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) { + report_prefix_pop(); + return; + } + report(do_exec(), "exec trigger"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_read(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + const unsigned long tstatus_expected = SBI_DBTR_TRIG_STATE_S | SBI_DBTR_TRIG_STATE_MAPPED; + const unsigned long tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S); + static unsigned long test; + struct sbiret ret; + + report_prefix_push("sbi_debug_read_triggers"); + if (!dbtr_install_trigger(shmem, &test, tdata1)) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_read_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_read_triggers"); + + report(shmem->data.tdata1 == tdata1, "tdata1 expected: 0x%016lx, found: 0x%016lx", + tdata1, shmem->data.tdata1); + report(shmem->data.tdata2 == ((unsigned long)&test), + "tdata2 expected: 0x%016lx, found: 0x%016lx", ((unsigned long)&test), + shmem->data.tdata2); + report(shmem->data.tstate == tstatus_expected, "tstate expected: 0x%016lx, found: 0x%016lx", + tstatus_expected, shmem->data.tstate); + + 
dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void check_exec(unsigned long base) +{ + struct sbiret ret; + + report(do_exec(), "exec triggered"); + + ret = sbi_debug_uninstall_triggers(base, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); +} + +static void dbtr_test_multiple(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type, + unsigned long num_trigs) +{ + static unsigned long test[2]; + struct sbiret ret; + bool have_three = num_trigs > 2; + + if (num_trigs < 2) + return; + + report_prefix_push("test_multiple"); + + if (!dbtr_install_trigger(shmem, &test[0], gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + if (!dbtr_install_trigger(shmem, &test[1], gen_tdata1(type, VALUE_LOAD, MODE_S))) + goto error; + if (have_three && + !dbtr_install_trigger(shmem, exec_call, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) { + ret = sbi_debug_uninstall_triggers(1, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + goto error; + } + + report(do_save(&test[0]), "save triggered"); + + if (do_load(&test[0])) + report_fail("save triggered by load"); + + report(do_load(&test[1]), "load triggered"); + + if (do_save(&test[1])) + report_fail("load triggered by save"); + + if (have_three) + check_exec(2); + + ret = sbi_debug_uninstall_triggers(1, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + + if (do_load(&test[1])) + report_fail("load triggered after uninstall"); + + report(do_save(&test[0]), "save triggered"); + + if (!have_three && + dbtr_install_trigger(shmem, exec_call, gen_tdata1(type, VALUE_EXECUTE, MODE_S))) + check_exec(1); + +error: + ret = sbi_debug_uninstall_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_uninstall_triggers"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + report_prefix_pop(); +} + +static void dbtr_test_multiple_types(struct sbi_dbtr_shmem_entry *shmem, unsigned long type) +{ + static 
unsigned long test; + + report_prefix_push("dbtr_test_multiple_types"); + + /* check if loads and saves trigger exec */ + if (!dbtr_install_trigger(shmem, &test, + gen_tdata1(type, VALUE_EXECUTE | VALUE_LOAD | VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + report(do_load(&test), "load trigger"); + + report(do_save(&test), "save trigger"); + + dbtr_uninstall_trigger(); + + /* Check if exec works */ + if (!dbtr_install_trigger(shmem, exec_call, + gen_tdata1(type, VALUE_EXECUTE | VALUE_LOAD | VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + report(do_exec(), "exec trigger"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_disable_uninstall(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("disable uninstall"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_disable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_disable_triggers"); + + dbtr_uninstall_trigger(); + + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + report(do_save(&test), "triggered"); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +static void dbtr_test_uninstall_enable(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("uninstall enable"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + dbtr_uninstall_trigger(); + + ret = sbi_debug_enable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_enable_triggers"); + + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + report(!do_save(&test), "should not trigger"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + 
report_prefix_pop(); +} + +static void dbtr_test_uninstall_update(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + static unsigned long test; + struct sbiret ret; + + report_prefix_push("uninstall update"); + if (!dbtr_install_trigger(shmem, NULL, gen_tdata1(type, VALUE_NONE, MODE_NONE))) { + report_prefix_pop(); + return; + } + + dbtr_uninstall_trigger(); + + shmem->id.idx = 0; + shmem->data.tdata1 = gen_tdata1(type, VALUE_STORE, MODE_S); + shmem->data.tdata2 = (unsigned long)&test; + + ret = sbi_debug_update_triggers(1); + sbiret_report_error(&ret, SBI_ERR_FAILURE, "sbi_debug_update_triggers"); + + install_exception_handler(EXC_BREAKPOINT, dbtr_exception_handler); + + report(!do_save(&test), "should not trigger"); + + install_exception_handler(EXC_BREAKPOINT, NULL); + report_prefix_pop(); +} + +static void dbtr_test_disable_read(struct sbi_dbtr_shmem_entry *shmem, enum McontrolType type) +{ + const unsigned long tstatus_expected = SBI_DBTR_TRIG_STATE_S | SBI_DBTR_TRIG_STATE_MAPPED; + const unsigned long tdata1 = gen_tdata1(type, VALUE_STORE, MODE_NONE); + static unsigned long test; + struct sbiret ret; + + report_prefix_push("disable_read"); + if (!dbtr_install_trigger(shmem, &test, gen_tdata1(type, VALUE_STORE, MODE_S))) { + report_prefix_pop(); + return; + } + + ret = sbi_debug_disable_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_disable_triggers"); + + ret = sbi_debug_read_triggers(0, 1); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_read_triggers"); + + report(shmem->data.tdata1 == tdata1, "tdata1 expected: 0x%016lx, found: 0x%016lx", + tdata1, shmem->data.tdata1); + report(shmem->data.tdata2 == ((unsigned long)&test), + "tdata2 expected: 0x%016lx, found: 0x%016lx", + ((unsigned long)&test), shmem->data.tdata2); + report(shmem->data.tstate == tstatus_expected, "tstate expected: 0x%016lx, found: 0x%016lx", + tstatus_expected, shmem->data.tstate); + + dbtr_uninstall_trigger(); + report_prefix_pop(); +} + +void 
check_dbtr(void) +{ + static struct sbi_dbtr_shmem_entry shmem[RV_MAX_TRIGGERS] = {}; + unsigned long num_trigs; + enum McontrolType trig_type; + struct sbiret ret; + + report_prefix_push("dbtr"); + + if (!sbi_probe(SBI_EXT_DBTR)) { + report_skip("extension not available"); + report_prefix_pop(); + return; + } + + if (__sbi_get_imp_id() == SBI_IMPL_OPENSBI && + __sbi_get_imp_version() < sbi_impl_opensbi_mk_version(1, 6)) { + report_skip("OpenSBI < v1.7 detected, skipping tests"); + report_prefix_pop(); + return; + } + + num_trigs = dbtr_test_num_triggers(); + if (!num_trigs) + goto error; + + trig_type = dbtr_test_type(&num_trigs); + if (trig_type == SBI_DBTR_TDATA1_TYPE_NONE) + goto error; + + ret = sbi_debug_set_shmem(shmem); + sbiret_report_error(&ret, SBI_SUCCESS, "sbi_debug_set_shmem"); + + ret = dbtr_test_save_install_uninstall(&shmem[0], trig_type); + /* install or uninstall failed */ + if (ret.error != SBI_SUCCESS) + goto error; + + dbtr_test_load(&shmem[0], trig_type); + dbtr_test_exec(&shmem[0], trig_type); + dbtr_test_read(&shmem[0], trig_type); + dbtr_test_disable_enable(&shmem[0], trig_type); + dbtr_test_update(&shmem[0], trig_type); + dbtr_test_multiple_types(&shmem[0], trig_type); + dbtr_test_multiple(shmem, trig_type, num_trigs); + dbtr_test_disable_uninstall(&shmem[0], trig_type); + dbtr_test_uninstall_enable(&shmem[0], trig_type); + dbtr_test_uninstall_update(&shmem[0], trig_type); + dbtr_test_disable_read(&shmem[0], trig_type); + +error: + report_prefix_pop(); +} diff --git a/riscv/sbi-tests.h b/riscv/sbi-tests.h index d5c4ae70..6a227745 100644 --- a/riscv/sbi-tests.h +++ b/riscv/sbi-tests.h @@ -99,6 +99,7 @@ static inline bool env_enabled(const char *env) void sbi_bad_fid(int ext); void check_sse(void); +void check_dbtr(void); #endif /* __ASSEMBLER__ */ #endif /* _RISCV_SBI_TESTS_H_ */ diff --git a/riscv/sbi.c b/riscv/sbi.c index edb1a6be..5bd496d0 100644 --- a/riscv/sbi.c +++ b/riscv/sbi.c @@ -1561,6 +1561,7 @@ int main(int argc, char **argv) 
check_susp(); check_sse(); check_fwft(); + check_dbtr(); return report_summary(); } -- 2.43.0

7 months, 1 week

2
1
0 0

[PATCH] selftests/timers: Fix integer overflow errors on 32 bit systems

by Terry Tritton

The use of NSEC_PER_SEC (1000000000L) as defined in include/vdso/time64.h causes several integer overflow warnings and test errors on 32 bit architectures. Use a long long instead of long to prevent integer overflow when converting seconds to nanoseconds. Signed-off-by: Terry Tritton <terry.tritton(a)linaro.org> --- tools/testing/selftests/timers/adjtick.c | 5 ++++- tools/testing/selftests/timers/alarmtimer-suspend.c | 4 +++- tools/testing/selftests/timers/inconsistency-check.c | 4 +++- tools/testing/selftests/timers/leap-a-day.c | 4 +++- tools/testing/selftests/timers/mqueue-lat.c | 3 ++- tools/testing/selftests/timers/nanosleep.c | 4 +++- tools/testing/selftests/timers/nsleep-lat.c | 4 +++- tools/testing/selftests/timers/posix_timers.c | 5 ++++- tools/testing/selftests/timers/raw_skew.c | 4 +++- tools/testing/selftests/timers/set-2038.c | 4 +++- tools/testing/selftests/timers/set-timer-lat.c | 4 +++- tools/testing/selftests/timers/valid-adjtimex.c | 5 ++++- 12 files changed, 38 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c index 777d9494b683..b5929c33b632 100644 --- a/tools/testing/selftests/timers/adjtick.c +++ b/tools/testing/selftests/timers/adjtick.c @@ -22,10 +22,13 @@ #include <sys/time.h> #include <sys/timex.h> #include <time.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL +#define USEC_PER_SEC 1000000LL + #define MILLION 1000000 long systick; diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c index a9ef76ea6051..b5799df271ae 100644 --- a/tools/testing/selftests/timers/alarmtimer-suspend.c +++ b/tools/testing/selftests/timers/alarmtimer-suspend.c @@ -28,10 +28,12 @@ #include <signal.h> #include <stdlib.h> #include <pthread.h> -#include <include/vdso/time64.h> #include <errno.h> 
#include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + #define UNREASONABLE_LAT (NSEC_PER_SEC * 5) /* hopefully we resume in 5 secs */ #define SUSPEND_SECS 15 diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c index 9d1573769d55..2b2d7293b313 100644 --- a/tools/testing/selftests/timers/inconsistency-check.c +++ b/tools/testing/selftests/timers/inconsistency-check.c @@ -28,9 +28,11 @@ #include <sys/timex.h> #include <string.h> #include <signal.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + /* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */ #define CLOCK_HWSPECIFIC 10 diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index 04004a7c0934..008c38ce4b2f 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -48,9 +48,11 @@ #include <string.h> #include <signal.h> #include <unistd.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + #define CLOCK_TAI 11 time_t next_leap; diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c index 63de2334a291..1a6d26f86137 100644 --- a/tools/testing/selftests/timers/mqueue-lat.c +++ b/tools/testing/selftests/timers/mqueue-lat.c @@ -29,9 +29,10 @@ #include <signal.h> #include <errno.h> #include <mqueue.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL #define TARGET_TIMEOUT 100000000 /* 100ms in nanoseconds */ #define UNRESONABLE_LATENCY 
40000000 /* 40ms in nanosecs */ diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c index 252c6308c569..55ea67478fdd 100644 --- a/tools/testing/selftests/timers/nanosleep.c +++ b/tools/testing/selftests/timers/nanosleep.c @@ -27,9 +27,11 @@ #include <sys/timex.h> #include <string.h> #include <signal.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + /* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */ #define CLOCK_HWSPECIFIC 10 diff --git a/tools/testing/selftests/timers/nsleep-lat.c b/tools/testing/selftests/timers/nsleep-lat.c index de23dc0c9f97..347d622987c8 100644 --- a/tools/testing/selftests/timers/nsleep-lat.c +++ b/tools/testing/selftests/timers/nsleep-lat.c @@ -24,9 +24,11 @@ #include <sys/timex.h> #include <string.h> #include <signal.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + #define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */ /* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */ diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index f0eceb0faf34..555bf161f420 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -16,11 +16,14 @@ #include <string.h> #include <unistd.h> #include <time.h> -#include <include/vdso/time64.h> #include <pthread.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL +#define USEC_PER_SEC 1000000LL + #define DELAY 2 static void __fatal_error(const char *test, const char *name, const char *what) diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index 
957f7cd29cb1..ff7675d98560 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -25,9 +25,11 @@ #include <sys/time.h> #include <sys/timex.h> #include <time.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + #define shift_right(x, s) ({ \ __typeof__(x) __x = (x); \ __typeof__(s) __s = (s); \ diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c index ed244315e11c..8130d551a11c 100644 --- a/tools/testing/selftests/timers/set-2038.c +++ b/tools/testing/selftests/timers/set-2038.c @@ -27,9 +27,11 @@ #include <unistd.h> #include <time.h> #include <sys/time.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + #define KTIME_MAX ((long long)~((unsigned long long)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c index 9d8437c13929..79a6a6cba186 100644 --- a/tools/testing/selftests/timers/set-timer-lat.c +++ b/tools/testing/selftests/timers/set-timer-lat.c @@ -28,9 +28,11 @@ #include <signal.h> #include <stdlib.h> #include <pthread.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL + /* CLOCK_HWSPECIFIC == CLOCK_SGI_CYCLE (Deprecated) */ #define CLOCK_HWSPECIFIC 10 diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c index 6b7801055ad1..e4f31e678630 100644 --- a/tools/testing/selftests/timers/valid-adjtimex.c +++ b/tools/testing/selftests/timers/valid-adjtimex.c @@ -29,9 +29,12 @@ #include <string.h> #include <signal.h> 
#include <unistd.h> -#include <include/vdso/time64.h> #include "../kselftest.h" +/* define NSEC_PER_SEC as long long to avoid overflow on 32 bit architectures*/ +#define NSEC_PER_SEC 1000000000LL +#define USEC_PER_SEC 1000000LL + #define ADJ_SETOFFSET 0x0100 #include <sys/syscall.h> -- 2.39.5

7 months, 1 week

2
2
0 0

[PATCH] selftests: ublk: kublk: improve behavior on init failure

by Uday Shankar

Some failure modes are handled poorly by kublk. For example, if ublk_drv is built as a module but not currently loaded into the kernel, ./kublk add ... just hangs forever. This happens because in this case (and a few others), the worker process does not notify its parent (via a write to the shared eventfd) that it has tried and failed to initialize, so the parent hangs forever. Fix this by ensuring that we always notify the parent process of any initialization failure, and have the parent print a (not very descriptive) log line when this happens. Signed-off-by: Uday Shankar <ushankar(a)purestorage.com> --- tools/testing/selftests/ublk/kublk.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index a98e14e4c245965d817b93843ff9a4011291223b..e2d2042810d4bb472e48a0ed91317d2bdf6e2f2a 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1112,7 +1112,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) __u64 features; const struct ublk_tgt_ops *ops; struct ublksrv_ctrl_dev_info *info; - struct ublk_dev *dev; + struct ublk_dev *dev = NULL; int dev_id = ctx->dev_id; int ret, i; @@ -1120,13 +1120,15 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) if (!ops) { ublk_err("%s: no such tgt type, type %s\n", __func__, tgt_type); - return -ENODEV; + ret = -ENODEV; + goto fail; } if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", __func__, nr_queues, depth); - return -EINVAL; + ret = -EINVAL; + goto fail; } /* default to 1:1 threads:queues if nthreads is unspecified */ @@ -1136,30 +1138,37 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) if (nthreads > UBLK_MAX_THREADS) { ublk_err("%s: %u is too many threads (max %u)\n", __func__, nthreads, UBLK_MAX_THREADS); - return -EINVAL; + ret = -EINVAL; + goto fail; } if (nthreads 
!= nr_queues && !ctx->per_io_tasks) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); - return -EINVAL; + ret = -EINVAL; + goto fail; } dev = ublk_ctrl_init(); if (!dev) { ublk_err("%s: can't alloc dev id %d, type %s\n", __func__, dev_id, tgt_type); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } /* kernel doesn't support get_features */ ret = ublk_ctrl_get_features(dev, &features); - if (ret < 0) - return -EINVAL; + if (ret < 0) { + ret = -EINVAL; + goto fail; + } - if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) - return -ENOTSUP; + if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) { + ret = -ENOTSUP; + goto fail; + } info = &dev->dev_info; info->dev_id = ctx->dev_id; @@ -1200,7 +1209,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) fail: if (ret < 0) ublk_send_dev_event(ctx, dev, -1); - ublk_ctrl_deinit(dev); + if (dev) + ublk_ctrl_deinit(dev); return ret; } @@ -1262,6 +1272,8 @@ static int cmd_dev_add(struct dev_ctx *ctx) shmctl(ctx->_shmid, IPC_RMID, NULL); /* wait for child and detach from it */ wait(NULL); + if (exit_code == EXIT_FAILURE) + ublk_err("%s: command failed\n", __func__); exit(exit_code); } else { exit(EXIT_FAILURE); --- base-commit: c09a8b00f850d3ca0af998bff1fac4a3f6d11768 change-id: 20250603-ublk_init_fail-b498905159eb Best regards, -- Uday Shankar <ushankar(a)purestorage.com>

7 months, 1 week

3
2
0 0

[PATCH v2] selftests: ir_decoder: Convert header comment to proper multi-line block

by Abdelrahman Fekry

well, I checked the script using checkpatch.pl and it shows that the patch has no warnings or errors and it's ready to be sent v2: - fixed multiple trailing whitespace errors and - the Signed-off-by mismatch The test file for the IR decoder used single-line comments at the top to document its purpose and licensing, which is inconsistent with the style used throughout the Linux kernel. In this patch I converted the file header to a proper multi-line comment block (/*) that aligns with standard kernel practices. This improves readability, consistency across selftests, and ensures the license and documentation are clearly visible in a familiar format. No functional changes have been made. Signed-off-by: Abdelrahman Fekry <abdelrahmanfekry375(a)gmail.com> --- tools/testing/selftests/ir/ir_loopback.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c index f4a15cbdd5ea..c94faa975630 100644 --- a/tools/testing/selftests/ir/ir_loopback.c +++ b/tools/testing/selftests/ir/ir_loopback.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 -// test ir decoder -// -// Copyright (C) 2018 Sean Young <sean(a)mess.org> - -// When sending LIRC_MODE_SCANCODE, the IR will be encoded. rc-loopback -// will send this IR to the receiver side, where we try to read the decoded -// IR. Decoding happens in a separate kernel thread, so we will need to -// wait until that is scheduled, hence we use poll to check for read -// readiness. - +/* Copyright (C) 2018 Sean Young <sean(a)mess.org> + * + * Selftest for IR decoder + * + * + * When sending LIRC_MODE_SCANCODE, the IR will be encoded. + * rc-loopback will send this IR to the receiver side, + * where we try to read the decoded IR. + * Decoding happens in a separate kernel thread, + * so we will need to wait until that is scheduled, + * hence we use poll to check for read + * readiness. 
+ */ #include <linux/lirc.h> #include <errno.h> #include <stdio.h> -- 2.25.1

7 months, 1 week

1
0
0 0

[PATCH bpf-next v3 00/11] bpf: Mitigate Spectre v1 using barriers

by Luis Gerhorst

This improves the expressiveness of unprivileged BPF by inserting speculation barriers instead of rejecting the programs. The approach was previously presented at LPC'24 [1] and RAID'24 [2]. To mitigate the Spectre v1 (PHT) vulnerability, the kernel rejects potentially-dangerous unprivileged BPF programs as of commit 9183671af6db ("bpf: Fix leakage under speculation on mispredicted branches"). In [2], we have analyzed 364 object files from open source projects (Linux Samples and Selftests, BCC, Loxilb, Cilium, libbpf Examples, Parca, and Prevail) and found that this affects 31% to 54% of programs. To resolve this in the majority of cases this patchset adds a fall-back for mitigating Spectre v1 using speculation barriers. The kernel still optimistically attempts to verify all speculative paths but uses speculation barriers against v1 when unsafe behavior is detected. This allows for more programs to be accepted without disabling the BPF Spectre mitigations (e.g., by setting cpu_mitigations_off()). For this, it relies on the fact that speculation barriers prevent all later instructions if the speculation was not correct: * On x86_64, lfence acts as full speculation barrier, not only as a load fence [3]: An LFENCE instruction or a serializing instruction will ensure that no later instructions execute, even speculatively, until all prior instructions complete locally. [...] Inserting an LFENCE instruction after a bounds check prevents later operations from executing before the bound check completes. This was experimentally confirmed in [4]. * ARM's SB speculation barrier instruction also affects "any instruction that appears later in the program order than the barrier" [5]. In [1] we have measured the overhead of this approach relative to having mitigations off and including the upstream Spectre v4 mitigations. For event tracing and stack-sampling profilers, we found that mitigations increase BPF program execution time by 0% to 62%. 
For the Loxilb network load balancer, we have measured a 14% slowdown in SCTP performance but no significant slowdown for TCP. This overhead only applies to programs that were previously rejected. I reran the expressiveness-evaluation with v6.14 and made sure the main results still match those from [1] and [2] (which used v6.5). Main design decisions are: * Do not use separate bytecode insns for v1 and v4 barriers (inspired by Daniel Borkmann's question at LPC). This simplifies the verifier significantly and has the only downside that performance on PowerPC is not as high as it could be. * Allow archs to still disable v1/v4 mitigations separately by setting bpf_jit_bypass_spec_v1/v4(). This has the benefit that archs can benefit from improved BPF expressiveness / performance if they are not vulnerable (e.g., ARM64 for v4 in the kernel). * Do not remove the empty BPF_NOSPEC implementation for backends for which it is unknown whether they are vulnerable to Spectre v1. [1] https://lpc.events/event/18/contributions/1954/ ("Mitigating Spectre-PHT using Speculation Barriers in Linux eBPF") [2] https://arxiv.org/pdf/2405.00078 ("VeriFence: Lightweight and Precise Spectre Defenses for Untrusted Linux Kernel Extensions") [3] https://www.intel.com/content/www/us/en/developer/articles/technical/softwa… ("Managed Runtime Speculative Execution Side Channel Mitigations") [4] https://dl.acm.org/doi/pdf/10.1145/3359789.3359837 ("Speculator: a tool to analyze speculative execution attacks and mitigations" - Section 4.6 "Stopping Speculative Execution") [5] https://developer.arm.com/documentation/ddi0597/2020-12/Base-Instructions/S… ("SB - Speculation Barrier - Arm Armv8-A A32/T32 Instruction Set Architecture (2020-12)") Changes: * v2 -> v3: - Fix https://lore.kernel.org/oe-kbuild-all/202504212030.IF1SLhz6-lkp@intel.com/ and similar by moving the bpf_jit_bypass_spec_v1/v4() prototypes out of the #ifdef CONFIG_BPF_SYSCALL. 
Decided not to move them to filter.h (where similar bpf_jit_*() prototypes live) as they would still have to be duplicated in bpf.h to be usable to bpf_bypass_spec_v1/v4() (unless including filter.h in bpf.h is an option). - Fix https://lore.kernel.org/oe-kbuild-all/202504220035.SoGveGpj-lkp@intel.com/ by moving the variable declarations out of the switch-case. - Build touched C files with W=2 and bpf config on x86 to check that there are no other warnings introduced. - Found 3 more checkpatch warnings that can be fixed without degrading readability. - Rebase to bpf-next 2025-05-01 - Link to v2: https://lore.kernel.org/bpf/20250421091802.3234859-1-luis.gerhorst@fau.de/ * v1 -> v2: - Drop former commits 9 ("bpf: Return PTR_ERR from push_stack()") and 11 ("bpf: Fall back to nospec for spec path verification") as suggested by Alexei. This series therefore no longer changes push_stack() to return PTR_ERR. - Add detailed explanation of how lfence works internally and how it affects the algorithm. - Add tests checking that nospec instructions are inserted in expected locations using __xlated_unpriv as suggested by Eduard (also, include a fix for __xlated_unpriv) - Add a test for the mitigations from the description of commit 9183671af6db ("bpf: Fix leakage under speculation on mispredicted branches") - Remove unused variables from do_check[_insn]() as suggested by Eduard. - Remove INSN_IDX_MODIFIED to improve readability as suggested by Eduard. This also causes the nospec_result-check to run (and fail) for jumping-ops. Add a warning to assert that this check must never succeed in that case. - Add details on the safety of patch 10 ("bpf: Allow nospec-protected var-offset stack access") based on the feedback on v1. 
- Rebase to bpf-next-250420 - Link to v1: https://lore.kernel.org/all/20250313172127.1098195-1-luis.gerhorst@fau.de/ * RFC -> v1: - rebase to bpf-next-250313 - tests: mark expected successes/new errors - add bpf_jit_bypass_spec_v1/v4() to avoid #ifdef in bpf_bypass_spec_v1/v4() - ensure that nospec with v1-support is implemented for archs for which GCC supports speculation barriers, except for MIPS - arm64: emit speculation barrier - powerpc: change nospec to include v1 barrier - discuss potential security (archs that do not impl. BPF nospec) and performance (only PowerPC) regressions - Link to RFC: https://lore.kernel.org/bpf/20250224203619.594724-1-luis.gerhorst@fau.de/ Luis Gerhorst (11): selftests/bpf: Fix caps for __xlated/jited_unpriv bpf: Move insn if/else into do_check_insn() bpf: Return -EFAULT on misconfigurations bpf: Return -EFAULT on internal errors bpf, arm64, powerpc: Add bpf_jit_bypass_spec_v1/v4() bpf, arm64, powerpc: Change nospec to include v1 barrier bpf: Rename sanitize_stack_spill to nospec_result bpf: Fall back to nospec for Spectre v1 selftests/bpf: Add test for Spectre v1 mitigation bpf: Allow nospec-protected var-offset stack access bpf: Fall back to nospec for sanitization-failures arch/arm64/net/bpf_jit.h | 5 + arch/arm64/net/bpf_jit_comp.c | 28 +- arch/powerpc/net/bpf_jit_comp64.c | 80 ++- include/linux/bpf.h | 11 +- include/linux/bpf_verifier.h | 3 +- include/linux/filter.h | 2 +- kernel/bpf/core.c | 32 +- kernel/bpf/verifier.c | 653 ++++++++++-------- tools/testing/selftests/bpf/progs/bpf_misc.h | 4 + .../selftests/bpf/progs/verifier_and.c | 8 +- .../selftests/bpf/progs/verifier_bounds.c | 66 +- .../bpf/progs/verifier_bounds_deduction.c | 45 +- .../selftests/bpf/progs/verifier_map_ptr.c | 20 +- .../selftests/bpf/progs/verifier_movsx.c | 16 +- .../selftests/bpf/progs/verifier_unpriv.c | 65 +- .../bpf/progs/verifier_value_ptr_arith.c | 101 ++- tools/testing/selftests/bpf/test_loader.c | 14 +- .../selftests/bpf/verifier/dead_code.c | 3 
+- tools/testing/selftests/bpf/verifier/jmp32.c | 33 +- tools/testing/selftests/bpf/verifier/jset.c | 10 +- 20 files changed, 771 insertions(+), 428 deletions(-) base-commit: 358b1c0f56ebb6996fcec7dcdcf6bae5dcbc8b6c -- 2.49.0

7 months, 1 week

6
34
0 0

[PATCH v2 1/2] libbpf: add support for printing BTF character arrays as strings

by Blake Jones

The BTF dumper code currently displays arrays of characters as just that - arrays, with each character formatted individually. Sometimes this is what makes sense, but it's nice to be able to treat that array as a string. This change adds a special case to the btf_dump functionality to allow arrays of single-byte integer values to be printed as character strings. Characters for which isprint() returns false are printed as hex-escaped values. This is enabled when the new ".emit_strings" is set to 1 in the btf_dump_type_data_opts structure. As an example, here's what it looks like to dump the string "hello" using a few different field values for btf_dump_type_data_opts (.compact = 1): - .emit_strings = 0, .skip_names = 0: (char[6])['h','e','l','l','o',] - .emit_strings = 0, .skip_names = 1: ['h','e','l','l','o',] - .emit_strings = 1, .skip_names = 0: (char[6])"hello" - .emit_strings = 1, .skip_names = 1: "hello" Here's the string "h\xff", dumped with .compact = 1 and .skip_names = 1: - .emit_strings = 0: ['h',-1,] - .emit_strings = 1: "h\xff" Signed-off-by: Blake Jones <blakejones(a)google.com> --- tools/lib/bpf/btf.h | 3 ++- tools/lib/bpf/btf_dump.c | 44 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 4392451d634b..ccfd905f03df 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -326,9 +326,10 @@ struct btf_dump_type_data_opts { bool compact; /* no newlines/indentation */ bool skip_names; /* skip member/type names */ bool emit_zeroes; /* show 0-valued fields */ + bool emit_strings; /* print char arrays as strings */ size_t :0; }; -#define btf_dump_type_data_opts__last_field emit_zeroes +#define btf_dump_type_data_opts__last_field emit_strings LIBBPF_API int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 460c3e57fadb..336a6646e0fa 100644 --- a/tools/lib/bpf/btf_dump.c +++ 
b/tools/lib/bpf/btf_dump.c @@ -68,6 +68,7 @@ struct btf_dump_data { bool compact; bool skip_names; bool emit_zeroes; + bool emit_strings; __u8 indent_lvl; /* base indent level */ char indent_str[BTF_DATA_INDENT_STR_LEN]; /* below are used during iteration */ @@ -2028,6 +2029,43 @@ static int btf_dump_var_data(struct btf_dump *d, return btf_dump_dump_type_data(d, NULL, t, type_id, data, 0, 0); } +static int btf_dump_string_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_array *array = btf_array(t); + __u32 i; + + btf_dump_data_pfx(d); + btf_dump_printf(d, "\""); + + for (i = 0; i < array->nelems; i++, data++) { + char c; + + if (data >= d->typed_dump->data_end) + return -E2BIG; + + c = *(char *)data; + if (c == '\0') { + /* + * When printing character arrays as strings, NUL bytes + * are always treated as string terminators; they are + * never printed. + */ + break; + } + if (isprint(c)) + btf_dump_printf(d, "%c", c); + else + btf_dump_printf(d, "\\x%02x", *(__u8 *)data); + } + + btf_dump_printf(d, "\""); + + return 0; +} + static int btf_dump_array_data(struct btf_dump *d, const struct btf_type *t, __u32 id, @@ -2055,8 +2093,11 @@ static int btf_dump_array_data(struct btf_dump *d, * char arrays, so if size is 1 and element is * printable as a char, we'll do that. 
*/ - if (elem_size == 1) + if (elem_size == 1) { + if (d->typed_dump->emit_strings) + return btf_dump_string_data(d, t, id, data); d->typed_dump->is_array_char = true; + } } /* note that we increment depth before calling btf_dump_print() below; @@ -2544,6 +2585,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, d->typed_dump->compact = OPTS_GET(opts, compact, false); d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false); d->typed_dump->emit_zeroes = OPTS_GET(opts, emit_zeroes, false); + d->typed_dump->emit_strings = OPTS_GET(opts, emit_strings, false); ret = btf_dump_dump_type_data(d, NULL, t, id, data, 0, 0); -- 2.49.0.1204.g71687c7c1d-goog

7 months, 1 week

3
5
0 0

[RESEND PATCH] selftests/bpf: Fix bpf selftest build error

by Saket Kumar Bhaskar

On linux-next, build for bpf selftest displays an error due to a mismatch in the expected function signature of bpf_testmod_test_read and bpf_testmod_test_write. Commit 97d06802d10a ("sysfs: constify bin_attribute argument of bin_attribute::read/write()") changed the required type for struct bin_attribute to const struct bin_attribute. To resolve the error, update corresponding signature for the callback. Fixes: 97d06802d10a ("sysfs: constify bin_attribute argument of bin_attribute::read/write()") Reported-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com> Closes: https://lore.kernel.org/all/e915da49-2b9a-4c4c-a34f-877f378129f6@linux.ibm.… Tested-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com> Signed-off-by: Saket Kumar Bhaskar <skb99(a)linux.ibm.com> --- [RESEND]: - Added Fixes and Tested-by tags. - Added Greg as recipient for driver-core tree. Original patch: https://lore.kernel.org/all/20250509122348.649064-1-skb99@linux.ibm.com/ tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 2e54b95ad898..194c442580ee 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -385,7 +385,7 @@ int bpf_testmod_fentry_ok; noinline ssize_t bpf_testmod_test_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { struct bpf_testmod_test_read_ctx ctx = { @@ -465,7 +465,7 @@ ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); noinline ssize_t bpf_testmod_test_write(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { struct bpf_testmod_test_write_ctx ctx = { -- 2.43.5

7 months, 1 week

4
4
0 0

[PATCH net] selftests: drv-net: add configs for the TSO test

by Jakub Kicinski

Add missing config options for the tso.py test, specifically to make sure the kernel is built with vxlan and gre tunnels. I noticed this while adding a TSO-capable QEMU device to the CI. Previously we only ran virtio tests and it doesn't report LSO stats on the QEMU we have. Fixes: 0d0f4174f6c8 ("selftests: drv-net: add a simple TSO test") Signed-off-by: Jakub Kicinski <kuba(a)kernel.org> --- CC: shuah(a)kernel.org CC: willemb(a)google.com CC: linux-kselftest(a)vger.kernel.org --- tools/testing/selftests/drivers/net/hw/config | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 tools/testing/selftests/drivers/net/hw/config diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config new file mode 100644 index 000000000000..ea4b70d71563 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/config @@ -0,0 +1,6 @@ +CONFIG_IPV6=y +CONFIG_IPV6_GRE=y +CONFIG_NET_IP_TUNNEL=y +CONFIG_NET_IPGRE=y +CONFIG_NET_IPGRE_DEMUX=y +CONFIG_VXLAN=y -- 2.49.0

7 months, 1 week

2
2
0 0

[PATCH net v3] selftests: net: build net/lib dependency in all target

by Bui Quang Minh

We have the logic to include net/lib automatically for net related selftests. However, currently, this logic is only in the install target, which means only `make install` will have net/lib included. This commit adds the logic to the all target so that all `make`, `make run_tests` and `make install` will have net/lib included in net related selftests. Signed-off-by: Bui Quang Minh <minhquangbui99(a)gmail.com> --- Changes in v3: - Don't remove INSTALL_DEP_TARGETS in install target so that net/lib is copied to INSTALL_PATH Changes in v2: - Make the commit message clearer. tools/testing/selftests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 6aa11cd3db42..339b31e6a6b5 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -205,7 +205,7 @@ export KHDR_INCLUDES all: @ret=1; \ - for TARGET in $(TARGETS); do \ + for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ mkdir $$BUILD_TARGET -p; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET \ -- 2.43.0

7 months, 1 week

2
1
0 0

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-kselftest-mirror