This small series proposes the removal of the BPF_RI_F_RF_NO_DIRECT XDP flag in favour of page_pool's internal page_pool_napi_local() check, which can override a non-direct recycle into a direct one if the right conditions are met.
This was discussed on the mailing list on several occasions [1][2].
The first patch adds additional benchmarking code to the page_pool benchmark.
The second patch has the actual change with a proper explanation and measurements. It remains to be debated if the whole BPF_RI_F_RF_NO_DIRECT mechanism should be deleted or only its use in xdp_return_frame_rx_napi().
There is still the unresolved issue of drivers that don't support page_pool NAPI recycling. This series could be extended to add that support. Otherwise those drivers would end up with slow path recycling for XDP.
[1] https://lore.kernel.org/all/8d165026-1477-46cb-94d4-a01e1da40833@kernel.org/ [2] https://lore.kernel.org/all/20250918084823.372000-1-dtatulea@nvidia.com/
Dragos Tatulea (2): page_pool: add benchmarking for napi-based recycling xdp: Delegate fast path return decision to page_pool
drivers/net/veth.c | 2 - include/linux/filter.h | 22 ----- include/net/xdp.h | 2 +- kernel/bpf/cpumap.c | 2 - net/bpf/test_run.c | 2 - net/core/filter.c | 2 +- net/core/xdp.c | 24 ++--- .../bench/page_pool/bench_page_pool_simple.c | 92 ++++++++++++++++++- 8 files changed, 104 insertions(+), 44 deletions(-)
This patch brings back the tasklet-based test code so that the benchmarks can run in softirq context.
One additional test is added which benchmarks the impact of page_pool_napi_local().
Signed-off-by: Dragos Tatulea dtatulea@nvidia.com --- .../bench/page_pool/bench_page_pool_simple.c | 92 ++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c index cb6468adbda4..84683c547814 100644 --- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c @@ -9,6 +9,7 @@ #include <linux/limits.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/netdevice.h> #include <net/page_pool/helpers.h>
#include "time_bench.h" @@ -16,6 +17,8 @@ static int verbose = 1; #define MY_POOL_SIZE 1024
+DEFINE_MUTEX(wait_for_tasklet); + /* Makes tests selectable. Useful for perf-record to analyze a single test. * Hint: Bash shells support writing binary number like: $((2#101010) * @@ -31,6 +34,10 @@ enum benchmark_bit { bit_run_bench_no_softirq01, bit_run_bench_no_softirq02, bit_run_bench_no_softirq03, + bit_run_bench_tasklet01, + bit_run_bench_tasklet02, + bit_run_bench_tasklet03, + bit_run_bench_tasklet04, };
#define bit(b) (1 << (b)) @@ -120,7 +127,12 @@ static void pp_fill_ptr_ring(struct page_pool *pp, int elems) kfree(array); }
-enum test_type { type_fast_path, type_ptr_ring, type_page_allocator }; +enum test_type { + type_fast_path, + type_napi_aware, + type_ptr_ring, + type_page_allocator, +};
/* Depends on compile optimizing this function */ static int time_bench_page_pool(struct time_bench_record *rec, void *data, @@ -132,6 +144,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data,
struct page_pool *pp; struct page *page; + struct napi_struct napi = {0};
struct page_pool_params pp_params = { .order = 0, @@ -141,6 +154,7 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, .dev = NULL, /* Only use for DMA mapping */ .dma_dir = DMA_BIDIRECTIONAL, }; + struct page_pool_stats stats = {0};
pp = page_pool_create(&pp_params); if (IS_ERR(pp)) { @@ -155,6 +169,11 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, else pr_warn("%s(): Cannot use page_pool fast-path\n", func);
+ if (type == type_napi_aware) { + napi.list_owner = smp_processor_id(); + page_pool_enable_direct_recycling(pp, &napi); + } + time_bench_start(rec); /** Loop to measure **/ for (i = 0; i < rec->loops; i++) { @@ -173,7 +192,13 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, page_pool_recycle_direct(pp, page);
} else if (type == type_ptr_ring) { - /* Normal return path */ + /* Normal return path, either direct or via ptr_ring */ + page_pool_put_page(pp, page, -1, false); + + } else if (type == type_napi_aware) { + /* NAPI-aware recycling: uses fast-path recycling if + * possible. + */ page_pool_put_page(pp, page, -1, false);
} else if (type == type_page_allocator) { @@ -188,6 +213,14 @@ static int time_bench_page_pool(struct time_bench_record *rec, void *data, } } time_bench_stop(rec, loops_cnt); + + if (type == type_napi_aware) { + page_pool_get_stats(pp, &stats); + if (stats.recycle_stats.cached < rec->loops) + pr_warn("%s(): NAPI-aware recycling wasn't used\n", + func); + } + out: page_pool_destroy(pp); return loops_cnt; @@ -211,6 +244,54 @@ static int time_bench_page_pool03_slow(struct time_bench_record *rec, return time_bench_page_pool(rec, data, type_page_allocator, __func__); }
+static int time_bench_page_pool04_napi_aware(struct time_bench_record *rec, + void *data) +{ + return time_bench_page_pool(rec, data, type_napi_aware, __func__); +} + +/* Testing page_pool requires running under softirq. + * + * Running under a tasklet satisfy this, as tasklets are built on top of + * softirq. + */ +static void pp_tasklet_handler(struct tasklet_struct *t) +{ + uint32_t nr_loops = loops; + + if (in_serving_softirq()) + pr_warn("%s(): in_serving_softirq fast-path\n", + __func__); // True + else + pr_warn("%s(): Cannot use page_pool fast-path\n", __func__); + + if (enabled(bit_run_bench_tasklet01)) + time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path", + NULL, time_bench_page_pool01_fast_path); + + if (enabled(bit_run_bench_tasklet02)) + time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring", + NULL, time_bench_page_pool02_ptr_ring); + + if (enabled(bit_run_bench_tasklet03)) + time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL, + time_bench_page_pool03_slow); + + if (enabled(bit_run_bench_tasklet04)) + time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware", + NULL, time_bench_page_pool04_napi_aware); + + mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */ +} +DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler); + +static void run_tasklet_tests(void) +{ + tasklet_enable(&pp_tasklet); + /* "Async" schedule tasklet, which runs on the CPU that schedule it */ + tasklet_schedule(&pp_tasklet); +} + static int run_benchmark_tests(void) { uint32_t nr_loops = loops; @@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
run_benchmark_tests();
+ mutex_lock(&wait_for_tasklet); + run_tasklet_tests(); + /* Sleep on mutex, waiting for tasklet to release */ + mutex_lock(&wait_for_tasklet); + return 0; } module_init(bench_page_pool_simple_module_init);
static void __exit bench_page_pool_simple_module_exit(void) { + tasklet_kill(&pp_tasklet); + if (verbose) pr_info("Unloaded\n"); }
diff --git a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c index cb6468adb..84683c547 100644 --- a/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c +++ b/tools/testing/selftests/net/bench/page_pool/bench_page_pool_simple.c @@ -16,6 +17,8 @@ static int verbose = 1; #define MY_POOL_SIZE 1024
+DEFINE_MUTEX(wait_for_tasklet);
[ ... ]
+/* Testing page_pool requires running under softirq.
- Running under a tasklet satisfy this, as tasklets are built on top of
- softirq.
- */
+static void pp_tasklet_handler(struct tasklet_struct *t) +{
- uint32_t nr_loops = loops;
- if (in_serving_softirq())
pr_warn("%s(): in_serving_softirq fast-path\n",__func__); // True- else
pr_warn("%s(): Cannot use page_pool fast-path\n", __func__);- if (enabled(bit_run_bench_tasklet01))
time_bench_loop(nr_loops, 0, "tasklet_page_pool01_fast_path",NULL, time_bench_page_pool01_fast_path);- if (enabled(bit_run_bench_tasklet02))
time_bench_loop(nr_loops, 0, "tasklet_page_pool02_ptr_ring",NULL, time_bench_page_pool02_ptr_ring);- if (enabled(bit_run_bench_tasklet03))
time_bench_loop(nr_loops, 0, "tasklet_page_pool03_slow", NULL,time_bench_page_pool03_slow);- if (enabled(bit_run_bench_tasklet04))
time_bench_loop(nr_loops, 0, "tasklet_page_pool04_napi_aware",NULL, time_bench_page_pool04_napi_aware);- mutex_unlock(&wait_for_tasklet); /* Module __init waiting on unlock */
^^^^^^^^^^^^^
Can mutex_unlock() be called from softirq context? The pp_tasklet_handler() function runs as a tasklet handler in softirq context, but mutexes are sleeping locks that must be locked and unlocked from process context, and by the same task that acquired them — here the lock is taken in module init and released by the tasklet. Would a completion (complete()/wait_for_completion()) be more appropriate here?
The synchronization pattern appears to be: bench_page_pool_simple_module_init()->mutex_lock(&wait_for_tasklet) bench_page_pool_simple_module_init()->tasklet_schedule(&pp_tasklet) bench_page_pool_simple_module_init()->mutex_lock(&wait_for_tasklet) [blocks] pp_tasklet_handler()->mutex_unlock(&wait_for_tasklet) [softirq context]
+} +DECLARE_TASKLET_DISABLED(pp_tasklet, pp_tasklet_handler);
+static void run_tasklet_tests(void) +{
- tasklet_enable(&pp_tasklet);
- /* "Async" schedule tasklet, which runs on the CPU that schedule it */
- tasklet_schedule(&pp_tasklet);
+}
[ ... ]
@@ -251,12 +332,19 @@ static int __init bench_page_pool_simple_module_init(void)
run_benchmark_tests();
- mutex_lock(&wait_for_tasklet);
- run_tasklet_tests();
- /* Sleep on mutex, waiting for tasklet to release */
- mutex_lock(&wait_for_tasklet);
- return 0;
}
--- AI reviewed your patch. Please fix the bug or email reply why it's not a bug. See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19165940352
linux-kselftest-mirror@lists.linaro.org