When operating Cilium in netkit mode with BPF-based host routing, calls to bpf_redirect() cause a kernel panic.
[ 52.247646] BUG: kernel NULL pointer dereference, address: 0000000000000038
...
[ 52.247727] RIP: 0010:bpf_redirect+0x18/0x80
...
[ 52.247986] Call Trace:
[ 52.247990]  <TASK>
[ 52.248002]  ? show_regs+0x6c/0x80
[ 52.248024]  ? __die+0x24/0x80
[ 52.248029]  ? page_fault_oops+0x155/0x570
[ 52.248047]  ? fib_rules_lookup+0x112/0x270
[ 52.248056]  ? do_user_addr_fault+0x4b2/0x870
[ 52.248063]  ? exc_page_fault+0x82/0x1b0
[ 52.248090]  ? asm_exc_page_fault+0x27/0x30
[ 52.248103]  ? bpf_redirect+0x18/0x80
[ 52.248109]  bpf_prog_f0698aabaf44c832_tail_handle_ipv4+0x173f/0x2707
[ 52.248119]  ? sbitmap_find_bit+0xe3/0x270
[ 52.248129]  netkit_xmit+0x177/0x3c0
[ 52.248139]  dev_hard_start_xmit+0x62/0x1d0
[ 52.248149]  __dev_queue_xmit+0x241/0xf30
[ 52.248155]  ? alloc_skb_with_frags+0x60/0x280
[ 52.248164]  ? __check_object_size+0x2a2/0x310
[ 52.248173]  ? ip_generic_getfrag+0x63/0x110
[ 52.248181]  ip_finish_output2+0x2cf/0x560
[ 52.248187]  __ip_finish_output+0xb6/0x180
[ 52.248193]  ip_finish_output+0x29/0x120
[ 52.248198]  ip_output+0x5f/0x100
[ 52.248204]  ? __pfx_ip_finish_output+0x10/0x10
[ 52.248210]  ip_send_skb+0x98/0xb0
[ 52.248215]  udp_send_skb+0x146/0x370
Setting a breakpoint inside bpf_net_ctx_get_ri() confirms that current->bpf_net_context is NULL right before the panic.
(gdb) p $lx_current().bpf_net_context
$4 = (struct bpf_net_context *) 0x0 <fixed_percpu_data>
(gdb) disassemble bpf_redirect
Dump of assembler code for function bpf_redirect:
   0xffffffff81f085e0 <+0>:     nopl   0x0(%rax,%rax,1)
   0xffffffff81f085e5 <+5>:     mov    %gs:0x7e12d593(%rip),%rax
   0xffffffff81f085ed <+13>:    push   %rbp
   0xffffffff81f085ee <+14>:    mov    0x23d0(%rax),%rax
=> 0xffffffff81f085f5 <+21>:    mov    %rsp,%rbp
   0xffffffff81f085f8 <+24>:    mov    0x38(%rax),%edx
...
(gdb) continue
Continuing.
Thread 1 hit Breakpoint 1, panic ...
288 {
(gdb)
Commit 401cb7dae813 ("net: Reference bpf_redirect_info via task_struct on PREEMPT_RT.") recently moved bpf_redirect_info into bpf_net_context, a new member of task_struct. Currently, current->bpf_net_context is set and then cleared inside sch_handle_egress(), where tcx_run() and tc_run() execute, but it looks like netkit_xmit() was missed, leaving current->bpf_net_context uninitialized when it runs.

This patch ensures that current->bpf_net_context is initialized while running netkit_xmit().
Signed-off-by: Jordan Rife <jrife@google.com>
Fixes: 401cb7dae813 ("net: Reference bpf_redirect_info via task_struct on PREEMPT_RT.")
Cc: stable@vger.kernel.org
---
 drivers/net/netkit.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index d0036a856039..92ac0cb5a327 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -65,6 +65,7 @@ static struct netkit *netkit_priv(const struct net_device *dev)
static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) { + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct netkit *nk = netkit_priv(dev); enum netkit_action ret = READ_ONCE(nk->policy); netdev_tx_t ret_dev = NET_XMIT_SUCCESS; @@ -73,6 +74,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) int len = skb->len;
rcu_read_lock(); + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); peer = rcu_dereference(nk->peer); if (unlikely(!peer || !(peer->flags & IFF_UP) || !pskb_may_pull(skb, ETH_HLEN) || @@ -109,6 +111,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) ret_dev = NET_XMIT_DROP; break; } + bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); return ret_dev; }