[ Upstream commit 6043b794c7668c19dabc4a93c75b924a19474d59 ] [ Note: Fixed conflict due to unrelated change in inet_proto_csum_replace_by_diff. ]
During ILA address translations, the L4 checksums can be handled in different ways. One of them, adj-transport, consist in parsing the transport layer and updating any found checksum. This logic relies on inet_proto_csum_replace_by_diff and produces an incorrect skb->csum when in state CHECKSUM_COMPLETE.
This bug can be reproduced with a simple ILA to SIR mapping, assuming packets are received with CHECKSUM_COMPLETE:
$ ip a show dev eth0 14: eth0@if15: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 62:ae:35:9e:0f:8d brd ff:ff:ff:ff:ff:ff link-netnsid 0 inet6 3333:0:0:1::c078/64 scope global valid_lft forever preferred_lft forever inet6 fd00:10:244:1::c078/128 scope global nodad valid_lft forever preferred_lft forever inet6 fe80::60ae:35ff:fe9e:f8d/64 scope link proto kernel_ll valid_lft forever preferred_lft forever $ ip ila add loc_match fd00:10:244:1 loc 3333:0:0:1 \ csum-mode adj-transport ident-type luid dev eth0
Then I hit [fd00:10:244:1::c078]:8000 with a server listening only on [3333:0:0:1::c078]:8000. With the bug, the SYN packet is dropped with SKB_DROP_REASON_TCP_CSUM after inet_proto_csum_replace_by_diff changed skb->csum. The translation and drop are visible on pwru [1] traces:
IFACE TUPLE FUNC eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) ipv6_rcv eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) ip6_rcv_core eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) nf_hook_slow eth0:9 [fd00:10:244:3::3d8]:51420->[fd00:10:244:1::c078]:8000(tcp) inet_proto_csum_replace_by_diff eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) tcp_v6_early_demux eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_route_input eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_input eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_input_finish eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ip6_protocol_deliver_rcu eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) raw6_local_deliver eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) ipv6_raw_deliver eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) tcp_v6_rcv eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) __skb_checksum_complete eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) kfree_skb_reason(SKB_DROP_REASON_TCP_CSUM) eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) skb_release_head_state eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) skb_release_data eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) skb_free_head eth0:9 [fd00:10:244:3::3d8]:51420->[3333:0:0:1::c078]:8000(tcp) kfree_skbmem
This is happening because inet_proto_csum_replace_by_diff is updating skb->csum when it shouldn't. The L4 checksum is updated such that it "cancels" the IPv6 address change in terms of checksum computation, so the impact on skb->csum is null.
Note this would be different for an IPv4 packet since three fields would be updated: the IPv4 address, the IP checksum, and the L4 checksum. Two would cancel each other and skb->csum would still need to be updated to take the L4 checksum change into account.
This patch fixes it by passing an ipv6 flag to inet_proto_csum_replace_by_diff, to skip the skb->csum update if we're in the IPv6 case. Note the behavior of the only other user of inet_proto_csum_replace_by_diff, the BPF subsystem, is left as is in this patch and fixed in the subsequent patch.
With the fix, using the reproduction from above, I can confirm skb->csum is not touched by inet_proto_csum_replace_by_diff and the TCP SYN proceeds to the application after the ILA translation.
Link: https://github.com/cilium/pwru [1] Fixes: 65d7ab8de582 ("net: Identifier Locator Addressing module") Signed-off-by: Paul Chaignon paul.chaignon@gmail.com Acked-by: Daniel Borkmann daniel@iogearbox.net Link: https://patch.msgid.link/b5539869e3550d46068504feb02d37653d939c0b.1748509484... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Paul Chaignon paul.chaignon@gmail.com --- include/net/checksum.h | 2 +- net/core/filter.c | 2 +- net/core/utils.c | 4 ++-- net/ipv6/ila/ila_common.c | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/include/net/checksum.h b/include/net/checksum.h index d3b5d368a0ca..c975c76b4dd4 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -154,7 +154,7 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr); + __wsum diff, bool pseudohdr, bool ipv6);
static __always_inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, diff --git a/net/core/filter.c b/net/core/filter.c index 9d358fb865e2..65b7fb9c3d29 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1970,7 +1970,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL;
- inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, false); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); diff --git a/net/core/utils.c b/net/core/utils.c index 1f31a39236d5..d010fcf1dc08 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, EXPORT_SYMBOL(inet_proto_csum_replace16);
void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr) + __wsum diff, bool pseudohdr, bool ipv6) { if (skb->ip_summed != CHECKSUM_PARTIAL) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6) skb->csum = ~csum_add(diff, ~skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 95e9146918cc..b8d43ed4689d 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -86,7 +86,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb,
diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&th->check, skb, - diff, true); + diff, true, true); } break; case NEXTHDR_UDP: @@ -97,7 +97,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&uh->check, skb, - diff, true); + diff, true, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -111,7 +111,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb,
diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, - diff, true); + diff, true, true); } break; }
[ Upstream commit ead7f9b8de65632ef8060b84b0c55049a33cfea1 ] [ Note: Fixed conflict due to unrelated comment change. ]
In Cilium, we use bpf_csum_diff + bpf_l4_csum_replace to, among other things, update the L4 checksum after reverse SNATing IPv6 packets. That use case is however not currently supported and leads to invalid skb->csum values in some cases. This patch adds support for IPv6 address changes in bpf_l4_csum_update via a new flag.
When calling bpf_l4_csum_replace in Cilium, it ends up calling inet_proto_csum_replace_by_diff:
1: void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, 2: __wsum diff, bool pseudohdr) 3: { 4: if (skb->ip_summed != CHECKSUM_PARTIAL) { 5: csum_replace_by_diff(sum, diff); 6: if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) 7: skb->csum = ~csum_sub(diff, skb->csum); 8: } else if (pseudohdr) { 9: *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); 10: } 11: }
The bug happens when we're in the CHECKSUM_COMPLETE state. We've just updated one of the IPv6 addresses. The helper now updates the L4 header checksum on line 5. Next, it updates skb->csum on line 7. It shouldn't.
For an IPv6 packet, the updates of the IPv6 address and of the L4 checksum will cancel each other. The checksums are set such that computing a checksum over the packet including its checksum will result in a sum of 0. So the same is true here when we update the L4 checksum on line 5. We'll update it as to cancel the previous IPv6 address update. Hence skb->csum should remain untouched in this case.
The same bug doesn't affect IPv4 packets because, in that case, three fields are updated: the IPv4 address, the IP checksum, and the L4 checksum. The change to the IPv4 address and one of the checksums still cancel each other in skb->csum, but we're left with one checksum update and should therefore update skb->csum accordingly. That's exactly what inet_proto_csum_replace_by_diff does.
This special case for IPv6 L4 checksums is also described atop inet_proto_csum_replace16, the function we should be using in this case.
This patch introduces a new bpf_l4_csum_replace flag, BPF_F_IPV6, to indicate that we're updating the L4 checksum of an IPv6 packet. When the flag is set, inet_proto_csum_replace_by_diff will skip the skb->csum update.
Fixes: 7d672345ed295 ("bpf: add generic bpf_csum_diff helper") Signed-off-by: Paul Chaignon paul.chaignon@gmail.com Acked-by: Daniel Borkmann daniel@iogearbox.net Link: https://patch.msgid.link/96a6bc3a443e6f0b21ff7b7834000e17fb549e05.1748509484... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Paul Chaignon paul.chaignon@gmail.com --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 5 +++-- tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0bdeeabbc5a8..2ac62d5ed466 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1695,6 +1695,7 @@ union bpf_attr { * for updates resulting in a null checksum the value is set to * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates * the checksum is to be computed against a pseudo-header. + * Flag **BPF_F_IPV6** should be set for IPv6 packets. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more @@ -5106,6 +5107,7 @@ enum { BPF_F_PSEUDO_HDR = (1ULL << 4), BPF_F_MARK_MANGLED_0 = (1ULL << 5), BPF_F_MARK_ENFORCE = (1ULL << 6), + BPF_F_IPV6 = (1ULL << 7), };
/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ diff --git a/net/core/filter.c b/net/core/filter.c index 65b7fb9c3d29..169d9ba4e7a0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1951,10 +1951,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; + bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | - BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1970,7 +1971,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL;
- inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, false); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 54b8c899d21c..fe70f9ce8b00 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1695,6 +1695,7 @@ union bpf_attr { * for updates resulting in a null checksum the value is set to * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates * the checksum is to be computed against a pseudo-header. + * Flag **BPF_F_IPV6** should be set for IPv6 packets. * * This helper works in combination with **bpf_csum_diff**\ (), * which does not update the checksum in-place, but offers more @@ -5106,6 +5107,7 @@ enum { BPF_F_PSEUDO_HDR = (1ULL << 4), BPF_F_MARK_MANGLED_0 = (1ULL << 5), BPF_F_MARK_ENFORCE = (1ULL << 6), + BPF_F_IPV6 = (1ULL << 7), };
/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
linux-stable-mirror@lists.linaro.org