This patch and problem analysis are based on v4.19 LTS, but v5.4 LTS and earlier appear to be affected as well.
Hulk Robot reports a softlockup problem, see the following log:
[   41.463870] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [ksoftirqd/0:9]
[   41.509763] Modules linked in:
[   41.512295] CPU: 0 PID: 9 Comm: ksoftirqd/0 Not tainted 4.19.90 #13
[   41.516134] Hardware name: linux,dummy-virt (DT)
[   41.519182] pstate: 80c00005 (Nzcv daif +PAN +UAO)
[   41.522415] pc : perf_trace_buf_alloc+0x138/0x238
[   41.525583] lr : perf_trace_buf_alloc+0x138/0x238
[   41.528656] sp : ffff8000c137e880
[   41.531050] x29: ffff8000c137e880 x28: ffff20000850ced0
[   41.534759] x27: 0000000000000000 x26: ffff8000c137e9c0
[   41.538456] x25: ffff8000ce5c2ae0 x24: ffff200008358b08
[   41.542151] x23: 0000000000000000 x22: ffff2000084a50ac
[   41.545834] x21: ffff8000c137e880 x20: 000000000000001c
[   41.549516] x19: ffff7dffbfdf88e8 x18: 0000000000000000
[   41.553202] x17: 0000000000000000 x16: 0000000000000000
[   41.556892] x15: 1ffff00036e07805 x14: 0000000000000000
[   41.560592] x13: 0000000000000004 x12: 0000000000000000
[   41.564315] x11: 1fffefbff7fbf120 x10: ffff0fbff7fbf120
[   41.568003] x9 : dfff200000000000 x8 : ffff7dffbfdf8904
[   41.571699] x7 : 0000000000000000 x6 : ffff0fbff7fbf121
[   41.575398] x5 : ffff0fbff7fbf121 x4 : ffff0fbff7fbf121
[   41.579086] x3 : ffff20000850cdc8 x2 : 0000000000000008
[   41.582773] x1 : ffff8000c1376000 x0 : 0000000000000100
[   41.586495] Call trace:
[   41.588922]  perf_trace_buf_alloc+0x138/0x238
[   41.591912]  perf_ftrace_function_call+0x1ac/0x248
[   41.595123]  ftrace_ops_no_ops+0x3a4/0x488
[   41.597998]  ftrace_graph_call+0x0/0xc
[   41.600715]  rcu_dynticks_curr_cpu_in_eqs+0x14/0x70
[   41.603962]  rcu_is_watching+0xc/0x20
[   41.606635]  ftrace_ops_no_ops+0x240/0x488
[   41.609530]  ftrace_graph_call+0x0/0xc
[   41.612249]  __read_once_size_nocheck.constprop.0+0x1c/0x38
[   41.615905]  unwind_frame+0x140/0x358
[   41.618597]  walk_stackframe+0x34/0x60
[   41.621359]  __save_stack_trace+0x204/0x3b8
[   41.624328]  save_stack_trace+0x2c/0x38
[   41.627112]  __kasan_slab_free+0x120/0x228
[   41.630018]  kasan_slab_free+0x10/0x18
[   41.632752]  kfree+0x84/0x250
[   41.635107]  skb_free_head+0x70/0xb0
[   41.637772]  skb_release_data+0x3f8/0x730
[   41.640626]  skb_release_all+0x50/0x68
[   41.643350]  kfree_skb+0x84/0x278
[   41.645890]  kfree_skb_list+0x4c/0x78
[   41.648595]  __dev_queue_xmit+0x1a4c/0x23a0
[   41.651541]  dev_queue_xmit+0x28/0x38
[   41.654254]  ip6_finish_output2+0xeb0/0x1630
[   41.657261]  ip6_finish_output+0x2d8/0x7f8
[   41.660174]  ip6_output+0x19c/0x348
[   41.663850]  mld_sendpack+0x560/0x9e0
[   41.666564]  mld_ifc_timer_expire+0x484/0x8a8
[   41.669624]  call_timer_fn+0x68/0x4b0
[   41.672355]  expire_timers+0x168/0x498
[   41.675126]  run_timer_softirq+0x230/0x7a8
[   41.678052]  __do_softirq+0x2d0/0xba0
[   41.680763]  run_ksoftirqd+0x110/0x1a0
[   41.683512]  smpboot_thread_fn+0x31c/0x620
[   41.686429]  kthread+0x2c8/0x348
[   41.688927]  ret_from_fork+0x10/0x18
Looking into the above call stack, there is a recursive call into 'ftrace_graph_call', and the direct cause of the recursion is that 'rcu_dynticks_curr_cpu_in_eqs' is traced, see the following snippet:

  __read_once_size_nocheck.constprop.0
    ftrace_graph_call                  <-- 1. first call
      ......
      rcu_dynticks_curr_cpu_in_eqs
        ftrace_graph_call              <-- 2. recursive call here!!!
Comparing with the mainline kernel, commit ff5c4f5cad33 ("rcu/tree: Mark the idle relevant functions noinstr") marks the related functions as 'noinstr', which implies notrace and noinline and puts them in the .noinstr.text section. Link: https://lore.kernel.org/all/20200416114706.625340212@infradead.org/
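For reference, 'noinstr' in mainline expands to roughly the following (a simplified sketch only, not copied verbatim; the exact definition varies across versions and additionally disables KASAN/KCSAN instrumentation):

/* Simplified sketch of the mainline 'noinstr' attribute */
#define noinstr \
	noinline notrace __attribute__((__section__(".noinstr.text")))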
But we cannot directly backport that commit, because it seems to have many prerequisite patches. Instead, marking the functions 'notrace' where that commit marks them 'noinstr', and marking 'rcu_dynticks_curr_cpu_in_eqs' as '__always_inline', looks like it resolves the problem.
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---
 kernel/rcu/tree.c        | 22 +++++++++++-----------
 kernel/rcu/tree_plugin.h |  4 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f7e89c989df7..01bc3c1f93f9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -275,7 +275,7 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
  * Record entry into an extended quiescent state. This is only to be
  * called when not already in an extended quiescent state.
  */
-static void rcu_dynticks_eqs_enter(void)
+static notrace void rcu_dynticks_eqs_enter(void)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 	int seq;
@@ -298,7 +298,7 @@ static void rcu_dynticks_eqs_enter(void)
  * Record exit from an extended quiescent state. This is only to be
  * called from an extended quiescent state.
  */
-static void rcu_dynticks_eqs_exit(void)
+static notrace void rcu_dynticks_eqs_exit(void)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 	int seq;
@@ -343,7 +343,7 @@ static void rcu_dynticks_eqs_online(void)
  *
  * No ordering, as we are sampling CPU-local information.
  */
-bool rcu_dynticks_curr_cpu_in_eqs(void)
+static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -706,7 +706,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  * the possibility of usermode upcalls having messed up our count
  * of interrupt nesting level during the prior busy period.
  */
-static void rcu_eqs_enter(bool user)
+static notrace void rcu_eqs_enter(bool user)
 {
 	struct rcu_state *rsp;
 	struct rcu_data *rdp;
@@ -763,7 +763,7 @@ void rcu_idle_enter(void)
  * If you add or remove a call to rcu_user_enter(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-void rcu_user_enter(void)
+notrace void rcu_user_enter(void)
 {
 	lockdep_assert_irqs_disabled();
 	rcu_eqs_enter(true);
@@ -781,7 +781,7 @@ void rcu_user_enter(void)
  * If you add or remove a call to rcu_nmi_exit(), be sure to test
  * with CONFIG_RCU_EQS_DEBUG=y.
  */
-void rcu_nmi_exit(void)
+notrace void rcu_nmi_exit(void)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -829,7 +829,7 @@ void rcu_nmi_exit(void)
  * If you add or remove a call to rcu_irq_exit(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-void rcu_irq_exit(void)
+notrace void rcu_irq_exit(void)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -864,7 +864,7 @@ void rcu_irq_exit_irqson(void)
  * allow for the possibility of usermode upcalls messing up our count of
  * interrupt nesting level during the busy period that is just now starting.
  */
-static void rcu_eqs_exit(bool user)
+static notrace void rcu_eqs_exit(bool user)
 {
 	struct rcu_dynticks *rdtp;
 	long oldval;
@@ -914,7 +914,7 @@ void rcu_idle_exit(void)
  * If you add or remove a call to rcu_user_exit(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-void rcu_user_exit(void)
+void notrace rcu_user_exit(void)
 {
 	rcu_eqs_exit(1);
 }
@@ -932,7 +932,7 @@ void rcu_user_exit(void)
  * If you add or remove a call to rcu_nmi_enter(), be sure to test
  * with CONFIG_RCU_EQS_DEBUG=y.
  */
-void rcu_nmi_enter(void)
+notrace void rcu_nmi_enter(void)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 	long incby = 2;
@@ -982,7 +982,7 @@ void rcu_nmi_enter(void)
  * If you add or remove a call to rcu_irq_enter(), be sure to test with
  * CONFIG_RCU_EQS_DEBUG=y.
  */
-void rcu_irq_enter(void)
+notrace void rcu_irq_enter(void)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 5f6de49dc78e..568818bef28f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2677,7 +2677,7 @@ static void rcu_bind_gp_kthread(void)
 }
 /* Record the current task on dyntick-idle entry. */
-static void rcu_dynticks_task_enter(void)
+static notrace void rcu_dynticks_task_enter(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
@@ -2685,7 +2685,7 @@ static void rcu_dynticks_task_enter(void)
 }
 /* Record no current task on dyntick-idle exit. */
-static void rcu_dynticks_task_exit(void)
+static notrace void rcu_dynticks_task_exit(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
On Wed, Jul 13, 2022 at 06:20:09PM +0800, Zheng Yejian wrote:
[ ... quoted patch description, softlockup log, and diffstat snipped ... ]
Given that no one has noticed this on 4.19 yet, this change is very odd.
What changed to cause this to suddenly happen? How did you test this change, and as you did change the function to be __always_inline, what did that cause to happen?
I would need an ack from the maintainer of this code before I can take an out-of-tree patch.
thanks,
greg k-h
On Fri, 15 Jul 2022 16:39:12 +0200 Greg KH gregkh@linuxfoundation.org wrote:
On Wed, Jul 13, 2022 at 06:20:09PM +0800, Zheng Yejian wrote:
[ ... quoted patch description, softlockup log, and diffstat snipped ... ]
Given that no one has noticed this on 4.19 yet, this change is very odd.
What changed to cause this to suddenly happen? How did you test this change, and as you did change the function to be __always_inline, what did that cause to happen?
Hi Greg, Steve had similar questions, so I answered them together in: https://lore.kernel.org/lkml/20220716152313.46350-1-zhengyejian1@huawei.com/
There are some things that need to be discussed.
I would need an ack from the maintainer of this code before I can take an out-of-tree patch.
thanks,
greg k-h
On Sat, Jul 16, 2022 at 11:34:24PM +0800, Zheng Yejian wrote:
On Fri, 15 Jul 2022 16:39:12 +0200 Greg KH gregkh@linuxfoundation.org wrote:
On Wed, Jul 13, 2022 at 06:20:09PM +0800, Zheng Yejian wrote:
[ ... quoted patch description, softlockup log, and diffstat snipped ... ]
Given that no one has noticed this on 4.19 yet, this change is very odd.
What changed to cause this to suddenly happen? How did you test this change, and as you did change the function to be __always_inline, what did that cause to happen?
Hi Greg, Steve had similar questions, so I answered them together in: https://lore.kernel.org/lkml/20220716152313.46350-1-zhengyejian1@huawei.com/
There are some things that need to be discussed.
Great, I'll drop this from my patch queue, please work to find the real issue and solution here.
thanks,
greg k-h
On Wed, 13 Jul 2022 18:20:09 +0800 Zheng Yejian zhengyejian1@huawei.com wrote:
This patch and problem analysis are based on v4.19 LTS, but v5.4 LTS and earlier appear to be affected as well.
Hulk Robot reports a softlockup problem, see the following log:
[   41.463870] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [ksoftirqd/0:9]
This detects something that is spinning with preemption disabled but interrupts enabled.
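As a minimal illustration (a hypothetical sketch, not code from this report), a kernel path like the following would produce this kind of report: the CPU never schedules, but the timer interrupt still fires, so the watchdog can observe the stall:

#include <linux/preempt.h>
#include <asm/processor.h>

/* Hypothetical example: spin with preemption disabled but interrupts enabled. */
static void soft_lockup_example(void)
{
	preempt_disable();
	for (;;)
		cpu_relax();	/* never schedules -> soft lockup after watchdog_thresh */
}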
Looking into the above call stack, there is a recursive call into 'ftrace_graph_call', and the direct cause of the recursion is that 'rcu_dynticks_curr_cpu_in_eqs' is traced, see the following snippet:

  __read_once_size_nocheck.constprop.0
    ftrace_graph_call                  <-- 1. first call
      ......
      rcu_dynticks_curr_cpu_in_eqs
        ftrace_graph_call              <-- 2. recursive call here!!!
This is not the bug. That code can handle a recursion:
ftrace_graph_call is assembly that is converted to call
void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer)
{
	[..]
	bit = ftrace_test_recursion_trylock(ip, *parent);
	if (bit < 0)
		return;
This will stop the code as "bit" will be < 0 on the second call to ftrace_graph_call. If it was a real recursion issue, it would crash the machine when the recursion runs out of stack space.
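For clarity, the recursion guard works roughly like the sketch below (a simplified assumption: a single per-CPU flag, with preemption already disabled by the caller as it is in ftrace callbacks; the real ftrace_test_recursion_trylock() uses per-task bits keyed by context instead of one flag):

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, tracer_recursion);

/* First entry takes the flag; a nested entry from inside the tracer bails out. */
static int recursion_trylock(void)
{
	if (__this_cpu_read(tracer_recursion))
		return -1;
	__this_cpu_write(tracer_recursion, 1);
	return 0;
}

static void recursion_unlock(void)
{
	__this_cpu_write(tracer_recursion, 0);
}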
Comparing with the mainline kernel, commit ff5c4f5cad33 ("rcu/tree: Mark the idle relevant functions noinstr") marks the related functions as 'noinstr', which implies notrace and noinline and puts them in the .noinstr.text section. Link: https://lore.kernel.org/all/20200416114706.625340212@infradead.org/
But we cannot directly backport that commit, because it seems to have many prerequisite patches. Instead, marking the functions 'notrace' where that commit marks them 'noinstr', and marking 'rcu_dynticks_curr_cpu_in_eqs' as '__always_inline', looks like it resolves the problem.
That will not fix your problem.
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Can you reproduce this consistently without this patch, and then not so with this patch?
Or are you just assuming that this fixes a bug because you observed a recursion?
Please explain to me why this would cause the hang?
-- Steve
On Fri, 15 Jul 2022 18:06:07 -0400 Steven Rostedt rostedt@goodmis.org wrote:
On Wed, 13 Jul 2022 18:20:09 +0800 Zheng Yejian zhengyejian1@huawei.com wrote:
This patch and problem analysis are based on v4.19 LTS, but v5.4 LTS and earlier appear to be affected as well.
Hulk Robot reports a softlockup problem, see the following log:
[   41.463870] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [ksoftirqd/0:9]
This detects something that is spinning with preemption disabled but interrupts enabled.
Looking into the above call stack, there is a recursive call into 'ftrace_graph_call', and the direct cause of the recursion is that 'rcu_dynticks_curr_cpu_in_eqs' is traced, see the following snippet:

  __read_once_size_nocheck.constprop.0
    ftrace_graph_call                  <-- 1. first call
      ......
      rcu_dynticks_curr_cpu_in_eqs
        ftrace_graph_call              <-- 2. recursive call here!!!
This is not the bug. That code can handle a recursion:
ftrace_graph_call is assembly that is converted to call
void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer)
{
	[..]
	bit = ftrace_test_recursion_trylock(ip, *parent);
	if (bit < 0)
		return;
This will stop the code as "bit" will be < 0 on the second call to ftrace_graph_call. If it was a real recursion issue, it would crash the machine when the recursion runs out of stack space.
Yes, it was not a real recursion issue; the shell was still responsive but became much slower than normal, so maybe something went wrong with interrupt handling. I'll try to explain it below.
Comparing with the mainline kernel, commit ff5c4f5cad33 ("rcu/tree: Mark the idle relevant functions noinstr") marks the related functions as 'noinstr', which implies notrace and noinline and puts them in the .noinstr.text section. Link: https://lore.kernel.org/all/20200416114706.625340212@infradead.org/
But we cannot directly backport that commit, because it seems to have many prerequisite patches. Instead, marking the functions 'notrace' where that commit marks them 'noinstr', and marking 'rcu_dynticks_curr_cpu_in_eqs' as '__always_inline', looks like it resolves the problem.
That will not fix your problem.
But based on the reproduction steps I summarized, this patch happened to 'fix' the problem; it needs deeper analysis, please see below.
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Can you reproduce this consistently without this patch, and then not so with this patch?
Or are you just assuming that this fixes a bug because you observed a recursion?
Please explain to me why this would cause the hang?
-- Steve
I've confirmed that it can be consistently reproduced without this patch by:
1. compile the reproducing code which is placed at the end of this email;
2. compile the kernel and start qemu with the following options:
   qemu-system-aarch64 -m 8192 -smp 8 -chardev socket,id=SOCKSYZ,server=on,wait=off,host=localhost,port=15555 \
     -mon chardev=SOCKSYZ,mode=control -display none -serial stdio -no-reboot -name VM-3 -device virtio-rng-pci \
     -machine virt,accel=kvm,gic-version=3 -cpu host -device virtio-net-pci,netdev=net0 -kernel arch/arm64/boot/Image \
     -netdev user,id=net0,restrict=on,hostfwd=tcp:127.0.0.1:40001-:22 -snapshot -initrd rootfs.cpio.gz \
     -append "root=/dev/vda console=ttyAMA0 console=ttyAMA0 root=/dev/ram rdinit=/sbin/init nohz_full=1 earlycon=pl011,0x9000000"
3. execute the reproducing binary in one shell;
4. run the 'top' command in another shell, wait about 30 seconds; almost inevitably, the problem is reproduced.
When the problem is reproduced, the phenomena I observed are:
1. shell response is very slow when pressing anything;
2. the 'top' command refreshes info very slowly;
3. CPU time spent handling hardware & software interrupts shown by 'top' is abnormally high; note the following idle / irq / sirq fields, 4.19.252 is abnormal:
   4.19.252 (abnormal): CPU: 0.6% usr 23.2% sys 0.0% nic 0.7% idle 0.0% io 63.2% irq 12.1% sirq
   5.10.130 (normal):   CPU: 0.1% usr 57.9% sys 0.0% nic 2.5% idle 0.0% io 33.1% irq 6.1% sirq
4. sometimes there are no softlockup warnings.
Based on the above reproduction method, I did the following:
1. Bisected to find the patch that introduced the problem; currently I found that:
   1) on linux-4.19.y, it was introduced in v4.19.156~29 by commit 2de780dfbe1a ("ftrace: Handle tracing when switching between context")
   2) on linux-4.14.y, it was introduced in v4.14.205~26 by commit 4e3df0faba76 ("ftrace: Handle tracing when switching between context")
   3) on v5.4 and above, which also contain the above commit, I did not reproduce it, and I currently don't know why ???
2. Wrote this patch with reference to commit ff5c4f5cad33 ("rcu/tree: Mark the idle relevant functions noinstr") because at first I thought it was the fix, and the problem looked like it was fixed, see the following 'top' output:
   5.10.130 (normal):      CPU: 0.1% usr 57.9% sys 0.0% nic 2.5% idle 0.0% io 33.1% irq 6.1% sirq
   4.19.252 before patch:  CPU: 0.6% usr 23.2% sys 0.0% nic 0.7% idle 0.0% io 63.2% irq 12.1% sirq
   4.19.252 after patch:   CPU: 1.3% usr 60.3% sys 0.0% nic 1.7% idle 0.0% io 30.0% irq 6.5% sirq
So we may focus on 726b3d3f141f ("ftrace: Handle tracing when switching between context") to explain the problem and find a real resolution!!!
Reproducing code (reproduce.c) is as follows:

``` C
// autogenerated by syzkaller (https://github.com/google/syzkaller)
#define _GNU_SOURCE
#include <arpa/inet.h> #include <dirent.h> #include <endian.h> #include <errno.h> #include <fcntl.h> #include <net/if.h> #include <net/if_arp.h> #include <netinet/in.h> #include <sched.h> #include <signal.h> #include <stdarg.h> #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/ioctl.h> #include <sys/mount.h> #include <sys/prctl.h> #include <sys/resource.h> #include <sys/socket.h> #include <sys/stat.h> #include <sys/syscall.h> #include <sys/time.h> #include <sys/types.h> #include <sys/uio.h> #include <sys/wait.h> #include <time.h> #include <unistd.h>
#include <linux/capability.h> #include <linux/genetlink.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/if_link.h> #include <linux/if_tun.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/neighbour.h> #include <linux/net.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> #include <linux/tcp.h> #include <linux/veth.h>
#ifndef __NR_mmap #define __NR_mmap 222 #endif #ifndef __NR_perf_event_open #define __NR_perf_event_open 241 #endif #ifndef __NR_setsockopt #define __NR_setsockopt 208 #endif
static unsigned long long procid;
static void sleep_ms(uint64_t ms) { usleep(ms * 1000); }
static uint64_t current_time_ms(void) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts)) exit(1); return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000; }
#define BITMASK(bf_off, bf_len) (((1ull << (bf_len)) - 1) << (bf_off)) #define STORE_BY_BITMASK(type, htobe, addr, val, bf_off, bf_len) \ *(type*)(addr) = \ htobe((htobe(*(type*)(addr)) & ~BITMASK((bf_off), (bf_len))) | \ (((type)(val) << (bf_off)) & BITMASK((bf_off), (bf_len))))
static bool write_file(const char* file, const char* what, ...) { char buf[1024]; va_list args; va_start(args, what); vsnprintf(buf, sizeof(buf), what, args); va_end(args); buf[sizeof(buf) - 1] = 0; int len = strlen(buf); int fd = open(file, O_WRONLY | O_CLOEXEC); if (fd == -1) return false; if (write(fd, buf, len) != len) { int err = errno; close(fd); errno = err; return false; } close(fd); return true; }
struct nlmsg { char* pos; int nesting; struct nlattr* nested[8]; char buf[4096]; };
static void netlink_init(struct nlmsg* nlmsg, int typ, int flags, const void* data, int size) { memset(nlmsg, 0, sizeof(*nlmsg)); struct nlmsghdr* hdr = (struct nlmsghdr*)nlmsg->buf; hdr->nlmsg_type = typ; hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; memcpy(hdr + 1, data, size); nlmsg->pos = (char*)(hdr + 1) + NLMSG_ALIGN(size); }
static void netlink_attr(struct nlmsg* nlmsg, int typ, const void* data, int size) { struct nlattr* attr = (struct nlattr*)nlmsg->pos; attr->nla_len = sizeof(*attr) + size; attr->nla_type = typ; if (size > 0) memcpy(attr + 1, data, size); nlmsg->pos += NLMSG_ALIGN(attr->nla_len); }
static void netlink_nest(struct nlmsg* nlmsg, int typ) { struct nlattr* attr = (struct nlattr*)nlmsg->pos; attr->nla_type = typ; nlmsg->pos += sizeof(*attr); nlmsg->nested[nlmsg->nesting++] = attr; }
static void netlink_done(struct nlmsg* nlmsg) { struct nlattr* attr = nlmsg->nested[--nlmsg->nesting]; attr->nla_len = nlmsg->pos - (char*)attr; }
static int netlink_send_ext(struct nlmsg* nlmsg, int sock, uint16_t reply_type, int* reply_len, bool dofail) { if (nlmsg->pos > nlmsg->buf + sizeof(nlmsg->buf) || nlmsg->nesting) exit(1); struct nlmsghdr* hdr = (struct nlmsghdr*)nlmsg->buf; hdr->nlmsg_len = nlmsg->pos - nlmsg->buf; struct sockaddr_nl addr; memset(&addr, 0, sizeof(addr)); addr.nl_family = AF_NETLINK; ssize_t n = sendto(sock, nlmsg->buf, hdr->nlmsg_len, 0, (struct sockaddr*)&addr, sizeof(addr)); if (n != (ssize_t)hdr->nlmsg_len) { if (dofail) exit(1); return -1; } n = recv(sock, nlmsg->buf, sizeof(nlmsg->buf), 0); if (reply_len) *reply_len = 0; if (n < 0) { if (dofail) exit(1); return -1; } if (n < (ssize_t)sizeof(struct nlmsghdr)) { errno = EINVAL; if (dofail) exit(1); return -1; } if (hdr->nlmsg_type == NLMSG_DONE) return 0; if (reply_len && hdr->nlmsg_type == reply_type) { *reply_len = n; return 0; } if (n < (ssize_t)(sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr))) { errno = EINVAL; if (dofail) exit(1); return -1; } if (hdr->nlmsg_type != NLMSG_ERROR) { errno = EINVAL; if (dofail) exit(1); return -1; } errno = -((struct nlmsgerr*)(hdr + 1))->error; return -errno; }
static int netlink_send(struct nlmsg* nlmsg, int sock) { return netlink_send_ext(nlmsg, sock, 0, NULL, true); }
static int netlink_query_family_id(struct nlmsg* nlmsg, int sock, const char* family_name, bool dofail) { struct genlmsghdr genlhdr; memset(&genlhdr, 0, sizeof(genlhdr)); genlhdr.cmd = CTRL_CMD_GETFAMILY; netlink_init(nlmsg, GENL_ID_CTRL, 0, &genlhdr, sizeof(genlhdr)); netlink_attr(nlmsg, CTRL_ATTR_FAMILY_NAME, family_name, strnlen(family_name, GENL_NAMSIZ - 1) + 1); int n = 0; int err = netlink_send_ext(nlmsg, sock, GENL_ID_CTRL, &n, dofail); if (err < 0) { return -1; } uint16_t id = 0; struct nlattr* attr = (struct nlattr*)(nlmsg->buf + NLMSG_HDRLEN + NLMSG_ALIGN(sizeof(genlhdr))); for (; (char*)attr < nlmsg->buf + n; attr = (struct nlattr*)((char*)attr + NLMSG_ALIGN(attr->nla_len))) { if (attr->nla_type == CTRL_ATTR_FAMILY_ID) { id = *(uint16_t*)(attr + 1); break; } } if (!id) { errno = EINVAL; return -1; } recv(sock, nlmsg->buf, sizeof(nlmsg->buf), 0); return id; }
static int netlink_next_msg(struct nlmsg* nlmsg, unsigned int offset, unsigned int total_len) { struct nlmsghdr* hdr = (struct nlmsghdr*)(nlmsg->buf + offset); if (offset == total_len || offset + hdr->nlmsg_len > total_len) return -1; return hdr->nlmsg_len; }
static void netlink_add_device_impl(struct nlmsg* nlmsg, const char* type, const char* name) { struct ifinfomsg hdr; memset(&hdr, 0, sizeof(hdr)); netlink_init(nlmsg, RTM_NEWLINK, NLM_F_EXCL | NLM_F_CREATE, &hdr, sizeof(hdr)); if (name) netlink_attr(nlmsg, IFLA_IFNAME, name, strlen(name)); netlink_nest(nlmsg, IFLA_LINKINFO); netlink_attr(nlmsg, IFLA_INFO_KIND, type, strlen(type)); }
static void netlink_add_device(struct nlmsg* nlmsg, int sock, const char* type, const char* name) { netlink_add_device_impl(nlmsg, type, name); netlink_done(nlmsg); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static void netlink_add_veth(struct nlmsg* nlmsg, int sock, const char* name, const char* peer) { netlink_add_device_impl(nlmsg, "veth", name); netlink_nest(nlmsg, IFLA_INFO_DATA); netlink_nest(nlmsg, VETH_INFO_PEER); nlmsg->pos += sizeof(struct ifinfomsg); netlink_attr(nlmsg, IFLA_IFNAME, peer, strlen(peer)); netlink_done(nlmsg); netlink_done(nlmsg); netlink_done(nlmsg); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static void netlink_add_hsr(struct nlmsg* nlmsg, int sock, const char* name, const char* slave1, const char* slave2) { netlink_add_device_impl(nlmsg, "hsr", name); netlink_nest(nlmsg, IFLA_INFO_DATA); int ifindex1 = if_nametoindex(slave1); netlink_attr(nlmsg, IFLA_HSR_SLAVE1, &ifindex1, sizeof(ifindex1)); int ifindex2 = if_nametoindex(slave2); netlink_attr(nlmsg, IFLA_HSR_SLAVE2, &ifindex2, sizeof(ifindex2)); netlink_done(nlmsg); netlink_done(nlmsg); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static void netlink_add_linked(struct nlmsg* nlmsg, int sock, const char* type, const char* name, const char* link) { netlink_add_device_impl(nlmsg, type, name); netlink_done(nlmsg); int ifindex = if_nametoindex(link); netlink_attr(nlmsg, IFLA_LINK, &ifindex, sizeof(ifindex)); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static void netlink_add_vlan(struct nlmsg* nlmsg, int sock, const char* name, const char* link, uint16_t id, uint16_t proto) { netlink_add_device_impl(nlmsg, "vlan", name); netlink_nest(nlmsg, IFLA_INFO_DATA); netlink_attr(nlmsg, IFLA_VLAN_ID, &id, sizeof(id)); netlink_attr(nlmsg, IFLA_VLAN_PROTOCOL, &proto, sizeof(proto)); netlink_done(nlmsg); netlink_done(nlmsg); int ifindex = if_nametoindex(link); netlink_attr(nlmsg, IFLA_LINK, &ifindex, sizeof(ifindex)); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static void netlink_add_macvlan(struct nlmsg* nlmsg, int sock, const char* name, const char* link) { netlink_add_device_impl(nlmsg, "macvlan", name); netlink_nest(nlmsg, IFLA_INFO_DATA); uint32_t mode = MACVLAN_MODE_BRIDGE; netlink_attr(nlmsg, IFLA_MACVLAN_MODE, &mode, sizeof(mode)); netlink_done(nlmsg); netlink_done(nlmsg); int ifindex = if_nametoindex(link); netlink_attr(nlmsg, IFLA_LINK, &ifindex, sizeof(ifindex)); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static void netlink_add_geneve(struct nlmsg* nlmsg, int sock, const char* name, uint32_t vni, struct in_addr* addr4, struct in6_addr* addr6) { netlink_add_device_impl(nlmsg, "geneve", name); netlink_nest(nlmsg, IFLA_INFO_DATA); netlink_attr(nlmsg, IFLA_GENEVE_ID, &vni, sizeof(vni)); if (addr4) netlink_attr(nlmsg, IFLA_GENEVE_REMOTE, addr4, sizeof(*addr4)); if (addr6) netlink_attr(nlmsg, IFLA_GENEVE_REMOTE6, addr6, sizeof(*addr6)); netlink_done(nlmsg); netlink_done(nlmsg); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
#define IFLA_IPVLAN_FLAGS 2 #define IPVLAN_MODE_L3S 2 #undef IPVLAN_F_VEPA #define IPVLAN_F_VEPA 2
static void netlink_add_ipvlan(struct nlmsg* nlmsg, int sock, const char* name, const char* link, uint16_t mode, uint16_t flags) { netlink_add_device_impl(nlmsg, "ipvlan", name); netlink_nest(nlmsg, IFLA_INFO_DATA); netlink_attr(nlmsg, IFLA_IPVLAN_MODE, &mode, sizeof(mode)); netlink_attr(nlmsg, IFLA_IPVLAN_FLAGS, &flags, sizeof(flags)); netlink_done(nlmsg); netlink_done(nlmsg); int ifindex = if_nametoindex(link); netlink_attr(nlmsg, IFLA_LINK, &ifindex, sizeof(ifindex)); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static void netlink_device_change(struct nlmsg* nlmsg, int sock, const char* name, bool up, const char* master, const void* mac, int macsize, const char* new_name) { struct ifinfomsg hdr; memset(&hdr, 0, sizeof(hdr)); if (up) hdr.ifi_flags = hdr.ifi_change = IFF_UP; hdr.ifi_index = if_nametoindex(name); netlink_init(nlmsg, RTM_NEWLINK, 0, &hdr, sizeof(hdr)); if (new_name) netlink_attr(nlmsg, IFLA_IFNAME, new_name, strlen(new_name)); if (master) { int ifindex = if_nametoindex(master); netlink_attr(nlmsg, IFLA_MASTER, &ifindex, sizeof(ifindex)); } if (macsize) netlink_attr(nlmsg, IFLA_ADDRESS, mac, macsize); int err = netlink_send(nlmsg, sock); if (err < 0) { } }
static int netlink_add_addr(struct nlmsg* nlmsg, int sock, const char* dev, const void* addr, int addrsize) { struct ifaddrmsg hdr; memset(&hdr, 0, sizeof(hdr)); hdr.ifa_family = addrsize == 4 ? AF_INET : AF_INET6; hdr.ifa_prefixlen = addrsize == 4 ? 24 : 120; hdr.ifa_scope = RT_SCOPE_UNIVERSE; hdr.ifa_index = if_nametoindex(dev); netlink_init(nlmsg, RTM_NEWADDR, NLM_F_CREATE | NLM_F_REPLACE, &hdr, sizeof(hdr)); netlink_attr(nlmsg, IFA_LOCAL, addr, addrsize); netlink_attr(nlmsg, IFA_ADDRESS, addr, addrsize); return netlink_send(nlmsg, sock); }
static void netlink_add_addr4(struct nlmsg* nlmsg, int sock, const char* dev, const char* addr) { struct in_addr in_addr; inet_pton(AF_INET, addr, &in_addr); int err = netlink_add_addr(nlmsg, sock, dev, &in_addr, sizeof(in_addr)); if (err < 0) { } }
static void netlink_add_addr6(struct nlmsg* nlmsg, int sock, const char* dev, const char* addr) { struct in6_addr in6_addr; inet_pton(AF_INET6, addr, &in6_addr); int err = netlink_add_addr(nlmsg, sock, dev, &in6_addr, sizeof(in6_addr)); if (err < 0) { } }
static struct nlmsg nlmsg;
#define DEVLINK_FAMILY_NAME "devlink"
#define DEVLINK_CMD_PORT_GET 5 #define DEVLINK_ATTR_BUS_NAME 1 #define DEVLINK_ATTR_DEV_NAME 2 #define DEVLINK_ATTR_NETDEV_NAME 7
static struct nlmsg nlmsg2;
static void initialize_devlink_ports(const char* bus_name, const char* dev_name, const char* netdev_prefix) { struct genlmsghdr genlhdr; int len, total_len, id, err, offset; uint16_t netdev_index; int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); if (sock == -1) exit(1); int rtsock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (rtsock == -1) exit(1); id = netlink_query_family_id(&nlmsg, sock, DEVLINK_FAMILY_NAME, true); if (id == -1) goto error; memset(&genlhdr, 0, sizeof(genlhdr)); genlhdr.cmd = DEVLINK_CMD_PORT_GET; netlink_init(&nlmsg, id, NLM_F_DUMP, &genlhdr, sizeof(genlhdr)); netlink_attr(&nlmsg, DEVLINK_ATTR_BUS_NAME, bus_name, strlen(bus_name) + 1); netlink_attr(&nlmsg, DEVLINK_ATTR_DEV_NAME, dev_name, strlen(dev_name) + 1); err = netlink_send_ext(&nlmsg, sock, id, &total_len, true); if (err < 0) { goto error; } offset = 0; netdev_index = 0; while ((len = netlink_next_msg(&nlmsg, offset, total_len)) != -1) { struct nlattr* attr = (struct nlattr*)(nlmsg.buf + offset + NLMSG_HDRLEN + NLMSG_ALIGN(sizeof(genlhdr))); for (; (char*)attr < nlmsg.buf + offset + len; attr = (struct nlattr*)((char*)attr + NLMSG_ALIGN(attr->nla_len))) { if (attr->nla_type == DEVLINK_ATTR_NETDEV_NAME) { char* port_name; char netdev_name[IFNAMSIZ]; port_name = (char*)(attr + 1); snprintf(netdev_name, sizeof(netdev_name), "%s%d", netdev_prefix, netdev_index); netlink_device_change(&nlmsg2, rtsock, port_name, true, 0, 0, 0, netdev_name); break; } } offset += len; netdev_index++; } error: close(rtsock); close(sock); }
#define DEV_IPV4 "172.20.20.%d" #define DEV_IPV6 "fe80::%02x" #define DEV_MAC 0x00aaaaaaaaaa
static void netdevsim_add(unsigned int addr, unsigned int port_count) { char buf[16]; sprintf(buf, "%u %u", addr, port_count); if (write_file("/sys/bus/netdevsim/new_device", buf)) { snprintf(buf, sizeof(buf), "netdevsim%d", addr); initialize_devlink_ports("netdevsim", buf, "netdevsim"); } }
#define WG_GENL_NAME "wireguard" enum wg_cmd { WG_CMD_GET_DEVICE, WG_CMD_SET_DEVICE, }; enum wgdevice_attribute { WGDEVICE_A_UNSPEC, WGDEVICE_A_IFINDEX, WGDEVICE_A_IFNAME, WGDEVICE_A_PRIVATE_KEY, WGDEVICE_A_PUBLIC_KEY, WGDEVICE_A_FLAGS, WGDEVICE_A_LISTEN_PORT, WGDEVICE_A_FWMARK, WGDEVICE_A_PEERS, }; enum wgpeer_attribute { WGPEER_A_UNSPEC, WGPEER_A_PUBLIC_KEY, WGPEER_A_PRESHARED_KEY, WGPEER_A_FLAGS, WGPEER_A_ENDPOINT, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, WGPEER_A_LAST_HANDSHAKE_TIME, WGPEER_A_RX_BYTES, WGPEER_A_TX_BYTES, WGPEER_A_ALLOWEDIPS, WGPEER_A_PROTOCOL_VERSION, }; enum wgallowedip_attribute { WGALLOWEDIP_A_UNSPEC, WGALLOWEDIP_A_FAMILY, WGALLOWEDIP_A_IPADDR, WGALLOWEDIP_A_CIDR_MASK, };
static void netlink_wireguard_setup(void) { const char ifname_a[] = "wg0"; const char ifname_b[] = "wg1"; const char ifname_c[] = "wg2"; const char private_a[] = "\xa0\x5c\xa8\x4f\x6c\x9c\x8e\x38\x53\xe2\xfd\x7a\x70\xae\x0f\xb2\x0f\xa1" "\x52\x60\x0c\xb0\x08\x45\x17\x4f\x08\x07\x6f\x8d\x78\x43"; const char private_b[] = "\xb0\x80\x73\xe8\xd4\x4e\x91\xe3\xda\x92\x2c\x22\x43\x82\x44\xbb\x88\x5c" "\x69\xe2\x69\xc8\xe9\xd8\x35\xb1\x14\x29\x3a\x4d\xdc\x6e"; const char private_c[] = "\xa0\xcb\x87\x9a\x47\xf5\xbc\x64\x4c\x0e\x69\x3f\xa6\xd0\x31\xc7\x4a\x15" "\x53\xb6\xe9\x01\xb9\xff\x2f\x51\x8c\x78\x04\x2f\xb5\x42"; const char public_a[] = "\x97\x5c\x9d\x81\xc9\x83\xc8\x20\x9e\xe7\x81\x25\x4b\x89\x9f\x8e\xd9\x25" "\xae\x9f\x09\x23\xc2\x3c\x62\xf5\x3c\x57\xcd\xbf\x69\x1c"; const char public_b[] = "\xd1\x73\x28\x99\xf6\x11\xcd\x89\x94\x03\x4d\x7f\x41\x3d\xc9\x57\x63\x0e" "\x54\x93\xc2\x85\xac\xa4\x00\x65\xcb\x63\x11\xbe\x69\x6b"; const char public_c[] = "\xf4\x4d\xa3\x67\xa8\x8e\xe6\x56\x4f\x02\x02\x11\x45\x67\x27\x08\x2f\x5c" "\xeb\xee\x8b\x1b\xf5\xeb\x73\x37\x34\x1b\x45\x9b\x39\x22"; const uint16_t listen_a = 20001; const uint16_t listen_b = 20002; const uint16_t listen_c = 20003; const uint16_t af_inet = AF_INET; const uint16_t af_inet6 = AF_INET6; const struct sockaddr_in endpoint_b_v4 = { .sin_family = AF_INET, .sin_port = htons(listen_b), .sin_addr = {htonl(INADDR_LOOPBACK)}}; const struct sockaddr_in endpoint_c_v4 = { .sin_family = AF_INET, .sin_port = htons(listen_c), .sin_addr = {htonl(INADDR_LOOPBACK)}}; struct sockaddr_in6 endpoint_a_v6 = {.sin6_family = AF_INET6, .sin6_port = htons(listen_a)}; endpoint_a_v6.sin6_addr = in6addr_loopback; struct sockaddr_in6 endpoint_c_v6 = {.sin6_family = AF_INET6, .sin6_port = htons(listen_c)}; endpoint_c_v6.sin6_addr = in6addr_loopback; const struct in_addr first_half_v4 = {0}; const struct in_addr second_half_v4 = {(uint32_t)htonl(128 << 24)}; const struct in6_addr first_half_v6 = {{{0}}}; const struct in6_addr second_half_v6 = {{{0x80}}}; const uint8_t half_cidr = 1; const uint16_t persistent_keepalives[] = {1, 3, 7, 9, 14, 19}; struct genlmsghdr genlhdr = {.cmd = WG_CMD_SET_DEVICE, .version = 1}; int sock; int id, err; sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); if (sock == -1) { return; } id = netlink_query_family_id(&nlmsg, sock, WG_GENL_NAME, true); if (id == -1) goto error; netlink_init(&nlmsg, id, 0, &genlhdr, sizeof(genlhdr)); netlink_attr(&nlmsg, WGDEVICE_A_IFNAME, ifname_a, strlen(ifname_a) + 1); netlink_attr(&nlmsg, WGDEVICE_A_PRIVATE_KEY, private_a, 32); netlink_attr(&nlmsg, WGDEVICE_A_LISTEN_PORT, &listen_a, 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGDEVICE_A_PEERS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGPEER_A_PUBLIC_KEY, public_b, 32); netlink_attr(&nlmsg, WGPEER_A_ENDPOINT, &endpoint_b_v4, sizeof(endpoint_b_v4)); netlink_attr(&nlmsg, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, &persistent_keepalives[0], 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGPEER_A_ALLOWEDIPS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &first_half_v4, sizeof(first_half_v4)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet6, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &first_half_v6, sizeof(first_half_v6)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_done(&nlmsg); 
netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGPEER_A_PUBLIC_KEY, public_c, 32); netlink_attr(&nlmsg, WGPEER_A_ENDPOINT, &endpoint_c_v6, sizeof(endpoint_c_v6)); netlink_attr(&nlmsg, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, &persistent_keepalives[1], 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGPEER_A_ALLOWEDIPS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &second_half_v4, sizeof(second_half_v4)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet6, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &second_half_v6, sizeof(second_half_v6)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); err = netlink_send(&nlmsg, sock); if (err < 0) { } netlink_init(&nlmsg, id, 0, &genlhdr, sizeof(genlhdr)); netlink_attr(&nlmsg, WGDEVICE_A_IFNAME, ifname_b, strlen(ifname_b) + 1); netlink_attr(&nlmsg, WGDEVICE_A_PRIVATE_KEY, private_b, 32); netlink_attr(&nlmsg, WGDEVICE_A_LISTEN_PORT, &listen_b, 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGDEVICE_A_PEERS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGPEER_A_PUBLIC_KEY, public_a, 32); netlink_attr(&nlmsg, WGPEER_A_ENDPOINT, &endpoint_a_v6, sizeof(endpoint_a_v6)); netlink_attr(&nlmsg, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, &persistent_keepalives[2], 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGPEER_A_ALLOWEDIPS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &first_half_v4, sizeof(first_half_v4)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet6, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &first_half_v6, sizeof(first_half_v6)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGPEER_A_PUBLIC_KEY, public_c, 32); netlink_attr(&nlmsg, WGPEER_A_ENDPOINT, &endpoint_c_v4, sizeof(endpoint_c_v4)); netlink_attr(&nlmsg, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, &persistent_keepalives[3], 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGPEER_A_ALLOWEDIPS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &second_half_v4, sizeof(second_half_v4)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet6, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &second_half_v6, sizeof(second_half_v6)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); err = netlink_send(&nlmsg, sock); if (err < 0) { } netlink_init(&nlmsg, id, 0, &genlhdr, sizeof(genlhdr)); netlink_attr(&nlmsg, WGDEVICE_A_IFNAME, ifname_c, strlen(ifname_c) + 1); netlink_attr(&nlmsg, WGDEVICE_A_PRIVATE_KEY, private_c, 32); netlink_attr(&nlmsg, WGDEVICE_A_LISTEN_PORT, &listen_c, 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGDEVICE_A_PEERS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, 
WGPEER_A_PUBLIC_KEY, public_a, 32); netlink_attr(&nlmsg, WGPEER_A_ENDPOINT, &endpoint_a_v6, sizeof(endpoint_a_v6)); netlink_attr(&nlmsg, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, &persistent_keepalives[4], 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGPEER_A_ALLOWEDIPS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &first_half_v4, sizeof(first_half_v4)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet6, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &first_half_v6, sizeof(first_half_v6)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGPEER_A_PUBLIC_KEY, public_b, 32); netlink_attr(&nlmsg, WGPEER_A_ENDPOINT, &endpoint_b_v4, sizeof(endpoint_b_v4)); netlink_attr(&nlmsg, WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL, &persistent_keepalives[5], 2); netlink_nest(&nlmsg, NLA_F_NESTED | WGPEER_A_ALLOWEDIPS); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &second_half_v4, sizeof(second_half_v4)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_nest(&nlmsg, NLA_F_NESTED | 0); netlink_attr(&nlmsg, WGALLOWEDIP_A_FAMILY, &af_inet6, 2); netlink_attr(&nlmsg, WGALLOWEDIP_A_IPADDR, &second_half_v6, sizeof(second_half_v6)); netlink_attr(&nlmsg, WGALLOWEDIP_A_CIDR_MASK, &half_cidr, 1); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); netlink_done(&nlmsg); err = netlink_send(&nlmsg, sock); if (err < 0) { }
error: close(sock); } static void initialize_netdevices(void) { char netdevsim[16]; sprintf(netdevsim, "netdevsim%d", (int)procid); struct { const char* type; const char* dev; } devtypes[] = { {"ip6gretap", "ip6gretap0"}, {"bridge", "bridge0"}, {"vcan", "vcan0"}, {"bond", "bond0"}, {"team", "team0"}, {"dummy", "dummy0"}, {"nlmon", "nlmon0"}, {"caif", "caif0"}, {"batadv", "batadv0"}, {"vxcan", "vxcan1"}, {"netdevsim", netdevsim}, {"veth", 0}, {"xfrm", "xfrm0"}, {"wireguard", "wg0"}, {"wireguard", "wg1"}, {"wireguard", "wg2"}, }; const char* devmasters[] = {"bridge", "bond", "team", "batadv"}; struct { const char* name; int macsize; bool noipv6; } devices[] = { {"lo", ETH_ALEN}, {"sit0", 0}, {"bridge0", ETH_ALEN}, {"vcan0", 0, true}, {"tunl0", 0}, {"gre0", 0}, {"gretap0", ETH_ALEN}, {"ip_vti0", 0}, {"ip6_vti0", 0}, {"ip6tnl0", 0}, {"ip6gre0", 0}, {"ip6gretap0", ETH_ALEN}, {"erspan0", ETH_ALEN}, {"bond0", ETH_ALEN}, {"veth0", ETH_ALEN}, {"veth1", ETH_ALEN}, {"team0", ETH_ALEN}, {"veth0_to_bridge", ETH_ALEN}, {"veth1_to_bridge", ETH_ALEN}, {"veth0_to_bond", ETH_ALEN}, {"veth1_to_bond", ETH_ALEN}, {"veth0_to_team", ETH_ALEN}, {"veth1_to_team", ETH_ALEN}, {"veth0_to_hsr", ETH_ALEN}, {"veth1_to_hsr", ETH_ALEN}, {"hsr0", 0}, {"dummy0", ETH_ALEN}, {"nlmon0", 0}, {"vxcan0", 0, true}, {"vxcan1", 0, true}, {"caif0", ETH_ALEN}, {"batadv0", ETH_ALEN}, {netdevsim, ETH_ALEN}, {"xfrm0", ETH_ALEN}, {"veth0_virt_wifi", ETH_ALEN}, {"veth1_virt_wifi", ETH_ALEN}, {"virt_wifi0", ETH_ALEN}, {"veth0_vlan", ETH_ALEN}, {"veth1_vlan", ETH_ALEN}, {"vlan0", ETH_ALEN}, {"vlan1", ETH_ALEN}, {"macvlan0", ETH_ALEN}, {"macvlan1", ETH_ALEN}, {"ipvlan0", ETH_ALEN}, {"ipvlan1", ETH_ALEN}, {"veth0_macvtap", ETH_ALEN}, {"veth1_macvtap", ETH_ALEN}, {"macvtap0", ETH_ALEN}, {"macsec0", ETH_ALEN}, {"veth0_to_batadv", ETH_ALEN}, {"veth1_to_batadv", ETH_ALEN}, {"batadv_slave_0", ETH_ALEN}, {"batadv_slave_1", ETH_ALEN}, {"geneve0", ETH_ALEN}, {"geneve1", ETH_ALEN}, {"wg0", 0}, {"wg1", 0}, {"wg2", 0}, }; int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock == -1) exit(1); unsigned i; for (i = 0; i < sizeof(devtypes) / sizeof(devtypes[0]); i++) netlink_add_device(&nlmsg, sock, devtypes[i].type, devtypes[i].dev); for (i = 0; i < sizeof(devmasters) / (sizeof(devmasters[0])); i++) { char master[32], slave0[32], veth0[32], slave1[32], veth1[32]; sprintf(slave0, "%s_slave_0", devmasters[i]); sprintf(veth0, "veth0_to_%s", devmasters[i]); netlink_add_veth(&nlmsg, sock, slave0, veth0); sprintf(slave1, "%s_slave_1", devmasters[i]); sprintf(veth1, "veth1_to_%s", devmasters[i]); netlink_add_veth(&nlmsg, sock, slave1, veth1); sprintf(master, "%s0", devmasters[i]); netlink_device_change(&nlmsg, sock, slave0, false, master, 0, 0, NULL); netlink_device_change(&nlmsg, sock, slave1, false, master, 0, 0, NULL); } netlink_device_change(&nlmsg, sock, "bridge_slave_0", true, 0, 0, 0, NULL); netlink_device_change(&nlmsg, sock, "bridge_slave_1", true, 0, 0, 0, NULL); netlink_add_veth(&nlmsg, sock, "hsr_slave_0", "veth0_to_hsr"); netlink_add_veth(&nlmsg, sock, "hsr_slave_1", "veth1_to_hsr"); netlink_add_hsr(&nlmsg, sock, "hsr0", "hsr_slave_0", "hsr_slave_1"); netlink_device_change(&nlmsg, sock, "hsr_slave_0", true, 0, 0, 0, NULL); netlink_device_change(&nlmsg, sock, "hsr_slave_1", true, 0, 0, 0, NULL); netlink_add_veth(&nlmsg, sock, "veth0_virt_wifi", "veth1_virt_wifi"); netlink_add_linked(&nlmsg, sock, "virt_wifi", "virt_wifi0", "veth1_virt_wifi"); netlink_add_veth(&nlmsg, sock, "veth0_vlan", "veth1_vlan"); netlink_add_vlan(&nlmsg, sock, "vlan0", 
"veth0_vlan", 0, htons(ETH_P_8021Q)); netlink_add_vlan(&nlmsg, sock, "vlan1", "veth0_vlan", 1, htons(ETH_P_8021AD)); netlink_add_macvlan(&nlmsg, sock, "macvlan0", "veth1_vlan"); netlink_add_macvlan(&nlmsg, sock, "macvlan1", "veth1_vlan"); netlink_add_ipvlan(&nlmsg, sock, "ipvlan0", "veth0_vlan", IPVLAN_MODE_L2, 0); netlink_add_ipvlan(&nlmsg, sock, "ipvlan1", "veth0_vlan", IPVLAN_MODE_L3S, IPVLAN_F_VEPA); netlink_add_veth(&nlmsg, sock, "veth0_macvtap", "veth1_macvtap"); netlink_add_linked(&nlmsg, sock, "macvtap", "macvtap0", "veth0_macvtap"); netlink_add_linked(&nlmsg, sock, "macsec", "macsec0", "veth1_macvtap"); char addr[32]; sprintf(addr, DEV_IPV4, 14 + 10); struct in_addr geneve_addr4; if (inet_pton(AF_INET, addr, &geneve_addr4) <= 0) exit(1); struct in6_addr geneve_addr6; if (inet_pton(AF_INET6, "fc00::01", &geneve_addr6) <= 0) exit(1); netlink_add_geneve(&nlmsg, sock, "geneve0", 0, &geneve_addr4, 0); netlink_add_geneve(&nlmsg, sock, "geneve1", 1, 0, &geneve_addr6); netdevsim_add((int)procid, 4); netlink_wireguard_setup(); for (i = 0; i < sizeof(devices) / (sizeof(devices[0])); i++) { char addr[32]; sprintf(addr, DEV_IPV4, i + 10); netlink_add_addr4(&nlmsg, sock, devices[i].name, addr); if (!devices[i].noipv6) { sprintf(addr, DEV_IPV6, i + 10); netlink_add_addr6(&nlmsg, sock, devices[i].name, addr); } uint64_t macaddr = DEV_MAC + ((i + 10ull) << 40); netlink_device_change(&nlmsg, sock, devices[i].name, true, 0, &macaddr, devices[i].macsize, NULL); } close(sock); } static void initialize_netdevices_init(void) { int sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock == -1) exit(1); struct { const char* type; int macsize; bool noipv6; bool noup; } devtypes[] = { {"nr", 7, true}, {"rose", 5, true, true}, }; unsigned i; for (i = 0; i < sizeof(devtypes) / sizeof(devtypes[0]); i++) { char dev[32], addr[32]; sprintf(dev, "%s%d", devtypes[i].type, (int)procid); sprintf(addr, "172.30.%d.%d", i, (int)procid + 1); netlink_add_addr4(&nlmsg, sock, dev, addr); if (!devtypes[i].noipv6) { sprintf(addr, "fe88::%02x:%02x", i, (int)procid + 1); netlink_add_addr6(&nlmsg, sock, dev, addr); } int macsize = devtypes[i].macsize; uint64_t macaddr = 0xbbbbbb + ((unsigned long long)i << (8 * (macsize - 2))) + (procid << (8 * (macsize - 1))); netlink_device_change(&nlmsg, sock, dev, !devtypes[i].noup, 0, &macaddr, macsize, NULL); } close(sock); }
#define MAX_FDS 30
static void setup_common()
{
  if (mount(0, "/sys/fs/fuse/connections", "fusectl", 0, 0)) {
  }
}
static void setup_binderfs()
{
  if (mkdir("/dev/binderfs", 0777)) {
  }
  if (mount("binder", "/dev/binderfs", "binder", 0, NULL)) {
  }
  if (symlink("/dev/binderfs", "./binderfs")) {
  }
}
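/*
 * Note: the mount/mkdir/symlink calls above are deliberately best-effort
 * (their return values are ignored); /sys/fs/fuse/connections is only
 * needed for the fuse-abort fallback in kill_and_wait() below, and binderfs
 * is likely irrelevant to this particular lockup (nothing binder-related
 * appears in the call trace).
 */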
static void loop();
static void sandbox_common()
{
  prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
  setsid();
  struct rlimit rlim;
  rlim.rlim_cur = rlim.rlim_max = (200 << 20);
  setrlimit(RLIMIT_AS, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 32 << 20;
  setrlimit(RLIMIT_MEMLOCK, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 136 << 20;
  setrlimit(RLIMIT_FSIZE, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 1 << 20;
  setrlimit(RLIMIT_STACK, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 0;
  setrlimit(RLIMIT_CORE, &rlim);
  rlim.rlim_cur = rlim.rlim_max = 256;
  setrlimit(RLIMIT_NOFILE, &rlim);
  if (unshare(CLONE_NEWNS)) {
  }
  if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
  }
  if (unshare(CLONE_NEWIPC)) {
  }
  if (unshare(0x02000000)) { /* CLONE_NEWCGROUP */
  }
  if (unshare(CLONE_NEWUTS)) {
  }
  if (unshare(CLONE_SYSVSEM)) {
  }
  typedef struct {
    const char* name;
    const char* value;
  } sysctl_t;
  static const sysctl_t sysctls[] = {
      {"/proc/sys/kernel/shmmax", "16777216"},
      {"/proc/sys/kernel/shmall", "536870912"},
      {"/proc/sys/kernel/shmmni", "1024"},
      {"/proc/sys/kernel/msgmax", "8192"},
      {"/proc/sys/kernel/msgmni", "1024"},
      {"/proc/sys/kernel/msgmnb", "1024"},
      {"/proc/sys/kernel/sem", "1024 1048576 500 1024"},
  };
  unsigned i;
  for (i = 0; i < sizeof(sysctls) / sizeof(sysctls[0]); i++)
    write_file(sysctls[i].name, sysctls[i].value);
}
static int wait_for_loop(int pid)
{
  if (pid < 0)
    exit(1);
  int status = 0;
  while (waitpid(-1, &status, __WALL) != pid) {
  }
  return WEXITSTATUS(status);
}
static void drop_caps(void)
{
  struct __user_cap_header_struct cap_hdr = {};
  struct __user_cap_data_struct cap_data[2] = {};
  cap_hdr.version = _LINUX_CAPABILITY_VERSION_3;
  cap_hdr.pid = getpid();
  if (syscall(SYS_capget, &cap_hdr, &cap_data))
    exit(1);
  const int drop = (1 << CAP_SYS_PTRACE) | (1 << CAP_SYS_NICE);
  cap_data[0].effective &= ~drop;
  cap_data[0].permitted &= ~drop;
  cap_data[0].inheritable &= ~drop;
  if (syscall(SYS_capset, &cap_hdr, &cap_data))
    exit(1);
}
static int do_sandbox_none(void)
{
  if (unshare(CLONE_NEWPID)) {
  }
  int pid = fork();
  if (pid != 0)
    return wait_for_loop(pid);
  setup_common();
  sandbox_common();
  drop_caps();
  initialize_netdevices_init();
  if (unshare(CLONE_NEWNET)) {
  }
  initialize_netdevices();
  setup_binderfs();
  loop();
  exit(1);
}
static void kill_and_wait(int pid, int* status)
{
  kill(-pid, SIGKILL);
  kill(pid, SIGKILL);
  for (int i = 0; i < 100; i++) {
    if (waitpid(-1, status, WNOHANG | __WALL) == pid)
      return;
    usleep(1000);
  }
  /* Abort stuck fuse connections so the blocking waitpid() below cannot
   * hang on a child stuck in fuse. */
  DIR* dir = opendir("/sys/fs/fuse/connections");
  if (dir) {
    for (;;) {
      struct dirent* ent = readdir(dir);
      if (!ent)
        break;
      if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
        continue;
      char abort[300];
      snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort", ent->d_name);
      int fd = open(abort, O_WRONLY);
      if (fd == -1) {
        continue;
      }
      if (write(fd, abort, 1) < 0) {
      }
      close(fd);
    }
    closedir(dir);
  } else {
  }
  while (waitpid(-1, status, __WALL) != pid) {
  }
}
static void setup_test()
{
  prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
  setpgrp();
  write_file("/proc/self/oom_score_adj", "1000");
}
static void close_fds()
{
  for (int fd = 3; fd < MAX_FDS; fd++)
    close(fd);
}
static void execute_one(void);
#define WAIT_FLAGS __WALL
static void loop(void)
{
  int iter = 0;
  for (;; iter++) {
    int pid = fork();
    if (pid < 0)
      exit(1);
    if (pid == 0) {
      setup_test();
      execute_one();
      close_fds();
      exit(0);
    }
    int status = 0;
    uint64_t start = current_time_ms();
    for (;;) {
      if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
        break;
      sleep_ms(1);
      /* Give each child at most ~5 seconds before killing it. */
      if (current_time_ms() - start < 5000)
        continue;
      kill_and_wait(pid, &status);
      break;
    }
  }
}
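/*
 * execute_one() below is the actual test program. The raw stores build two
 * struct perf_event_attr images byte by byte in the mmapped region (at
 * 0x20000740 and 0x20000040) and pass them to perf_event_open(); the
 * trailing setsockopt() is issued on fd -1 and therefore fails. A readable
 * sketch of what these stores amount to follows after the code block.
 */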
void execute_one(void) { *(uint32_t*)0x20000740 = 2; *(uint32_t*)0x20000744 = 0x80; *(uint8_t*)0x20000748 = 1; *(uint8_t*)0x20000749 = 0; *(uint8_t*)0x2000074a = 0; *(uint8_t*)0x2000074b = 0; *(uint32_t*)0x2000074c = 0; *(uint64_t*)0x20000750 = 0; *(uint64_t*)0x20000758 = 0; *(uint64_t*)0x20000760 = 0; STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 0, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 1, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 2, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 3, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 4, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 5, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 6, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 7, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 8, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 9, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 10, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 11, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 12, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 13, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 14, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 15, 2); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 17, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 18, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 19, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 20, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 21, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 22, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 23, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 24, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 25, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 26, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 27, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 28, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 29, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 30, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 31, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 32, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 33, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 34, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 35, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 36, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 37, 1); STORE_BY_BITMASK(uint64_t, , 0x20000768, 0, 38, 26); *(uint32_t*)0x20000770 = 0; *(uint32_t*)0x20000774 = 0; *(uint64_t*)0x20000778 = 0; *(uint64_t*)0x20000780 = 0; *(uint64_t*)0x20000788 = 0; *(uint64_t*)0x20000790 = 0; *(uint32_t*)0x20000798 = 0; *(uint32_t*)0x2000079c = 0; *(uint64_t*)0x200007a0 = 0; *(uint32_t*)0x200007a8 = 0; *(uint16_t*)0x200007ac = 0; *(uint16_t*)0x200007ae = 0; *(uint32_t*)0x200007b0 = 0; *(uint32_t*)0x200007b4 = 0; *(uint64_t*)0x200007b8 = 0; syscall(__NR_perf_event_open, 0x20000740ul, -1, 0ul, -1, 0ul); { int i; for (i = 0; i < 32; i++) { syscall(__NR_perf_event_open, 0x20000740ul, -1, 0ul, -1, 0ul); } } *(uint32_t*)0x20000040 = 0; *(uint32_t*)0x20000044 = 0x80; *(uint8_t*)0x20000048 = 0; *(uint8_t*)0x20000049 = 0; *(uint8_t*)0x2000004a = 0; *(uint8_t*)0x2000004b = 0; *(uint32_t*)0x2000004c = 0; *(uint64_t*)0x20000050 = 0; *(uint64_t*)0x20000058 = 0; *(uint64_t*)0x20000060 = 0; STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 0, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 1, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 2, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 3, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 4, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 5, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 6, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 7, 1); 
STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 8, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 9, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 10, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 11, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 12, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 13, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 14, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 15, 2); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 17, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 18, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 19, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 20, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 21, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 22, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 23, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 24, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 25, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 26, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 27, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 28, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 29, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 30, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 31, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 32, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 33, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 34, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 35, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 36, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 37, 1); STORE_BY_BITMASK(uint64_t, , 0x20000068, 0, 38, 26); *(uint32_t*)0x20000070 = 0; *(uint32_t*)0x20000074 = 0; *(uint64_t*)0x20000078 = 0; *(uint64_t*)0x20000080 = 0; *(uint64_t*)0x20000088 = 0; *(uint64_t*)0x20000090 = 0; *(uint32_t*)0x20000098 = 0; *(uint32_t*)0x2000009c = 0; *(uint64_t*)0x200000a0 = 0; *(uint32_t*)0x200000a8 = 0; *(uint16_t*)0x200000ac = 0; *(uint16_t*)0x200000ae = 0; *(uint32_t*)0x200000b0 = 0; *(uint32_t*)0x200000b4 = 0; *(uint64_t*)0x200000b8 = 0; syscall(__NR_perf_event_open, 0x20000040ul, 0, 0ul, -1, 0ul); *(uint32_t*)0x20000000 = htobe32(0xe0000002); *(uint8_t*)0x20000004 = 0xac; *(uint8_t*)0x20000005 = 0x14; *(uint8_t*)0x20000006 = 0x14; *(uint8_t*)0x20000007 = 0xaa; *(uint32_t*)0x20000008 = 0; syscall(__NR_setsockopt, -1, 0, 0x23, 0x20000000ul, 0xcul); } int main(void) { syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul); syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul); for (procid = 0; procid < 8; procid++) { if (fork() == 0) { do_sandbox_none(); } } sleep(1000000); return 0; } ```
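For readers who do not want to decode the raw byte stores in execute_one(), the following is a minimal, hedged sketch of what they amount to, assuming the standard struct perf_event_attr layout (type at offset 0, size at 4, config at 8, everything else zeroed). The first attribute is type 2 (PERF_TYPE_TRACEPOINT) with config 1; given perf_ftrace_function_call and perf_trace_buf_alloc in the call trace above, that tracepoint id presumably resolves to ftrace's 'function' event, so opening it 33 times for all tasks on CPU 0 is what attaches a perf callback to every traced function, including rcu_dynticks_curr_cpu_in_eqs. This sketch only illustrates the reproducer's intent; it is not a replacement for the generated program above.
```
/* Readable approximation of execute_one(); assumes the usual perf_event_attr
 * layout. The tracepoint id 1 is taken verbatim from the reproducer and is
 * assumed (not verified) to be ftrace's "function" event on this kernel. */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static void execute_one_sketch(void)
{
  struct perf_event_attr attr;

  /* First attr (0x20000740): type=2, size=0x80, config=1, all flags 0. */
  memset(&attr, 0, sizeof(attr));
  attr.type = PERF_TYPE_TRACEPOINT; /* 2 */
  attr.size = 0x80;
  attr.config = 1; /* tracepoint id, assumed to be ftrace:function */
  /* Opened once, then 32 more times, for all tasks on CPU 0. */
  for (int i = 0; i < 33; i++)
    syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);

  /* Second attr (0x20000040): type=0 (PERF_TYPE_HARDWARE), config=0
   * (PERF_COUNT_HW_CPU_CYCLES), opened on the calling task, CPU 0. */
  memset(&attr, 0, sizeof(attr));
  attr.type = PERF_TYPE_HARDWARE;
  attr.size = 0x80;
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  syscall(__NR_perf_event_open, &attr, 0, 0, -1, 0);

  /* The final setsockopt() in the reproducer (level 0, optname 0x23, which
   * appears to be IP_ADD_MEMBERSHIP with group 224.0.0.2 and local address
   * 172.20.20.170) is issued on fd -1 and therefore fails with EBADF. */
}
```
The second (hardware cycles) event and the failing setsockopt() look incidental; the repeated tracepoint perf events are what exercise the traced RCU/ftrace path described above.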