Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure.
--- Changes in v4: - Use the hash value of the saddr, daddr and protocol of outer IP header as encapsulation id. - Link to v3: https://lore.kernel.org/r/20250703-nf-flowtable-ipip-v3-0-880afd319b9f@kerne...
Changes in v3: - Add outer IP header sanity checks - target nf-next tree instead of net-next - Link to v2: https://lore.kernel.org/r/20250627-nf-flowtable-ipip-v2-0-c713003ce75b@kerne...
Changes in v2: - Introduce IPIP flowtable selftest - Link to v1: https://lore.kernel.org/r/20250623-nf-flowtable-ipip-v1-1-2853596e3941@kerne...
--- Lorenzo Bianconi (2): net: netfilter: Add IPIP flowtable SW acceleration selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest
include/linux/netdevice.h | 1 + net/ipv4/ipip.c | 25 +++++++++++ net/netfilter/nf_flow_table_ip.c | 48 +++++++++++++++++++++- net/netfilter/nft_flow_offload.c | 1 + .../selftests/net/netfilter/nft_flowtable.sh | 40 ++++++++++++++++++ 5 files changed, 113 insertions(+), 2 deletions(-) --- base-commit: d61f6cb6f6ef3c70d2ccc0d9c85c508cb8017da9 change-id: 20250623-nf-flowtable-ipip-1b3d7b08d067
Best regards,
Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure. IPIP SW acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device):
ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)
$ip addr show 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever
$ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1
$nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } }
chain forward { type filter hook forward priority filter; policy accept; meta l4proto { tcp, udp } flow add @ft } }
Reproducing the scenario described above using veths I got the following results: - TCP stream transmitted into the IPIP tunnel: - net-next: ~41Gbps - net-next + IPIP flowtable support: ~40Gbps - TCP stream received from the IPIP tunnel: - net-next: ~35Gbps - net-next + IPIP flowtable support: ~49Gbps
Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org --- include/linux/netdevice.h | 1 + net/ipv4/ipip.c | 25 +++++++++++++++++++++ net/netfilter/nf_flow_table_ip.c | 48 ++++++++++++++++++++++++++++++++++++++-- net/netfilter/nft_flow_offload.c | 1 + 4 files changed, 73 insertions(+), 2 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e49d8c98d284bd8f8b0494d1d612c0694de511a1..b0aece4f8b0c14892158176460c9c312f9137875 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -872,6 +872,7 @@ enum net_device_path_type { DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, + DEV_PATH_IPENCAP, };
struct net_device_path { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3e03af073a1ccc3d7597a998a515b6cfdded40b5..0d4d0af129fb25debf5fe76f42bd2c47b7e1616a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) return ip_tunnel_ctl(dev, p, cmd); }
+static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); + const struct iphdr *tiph = &tunnel->parms.iph; + struct rtable *rt; + + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, + RT_SCOPE_UNIVERSE); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + path->type = DEV_PATH_IPENCAP; + path->dev = ctx->dev; + path->encap.proto = htons(ETH_P_IP); + path->encap.id = jhash_3words(ntohl(tiph->saddr), ntohl(tiph->daddr), + IPPROTO_IPIP, 0); + + ctx->dev = rt->dst.dev; + ip_rt_put(rt); + + return 0; +} + static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, @@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, + .ndo_fill_forward_path = ipip_fill_forward_path, };
#define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8cd4cf7ae21120f1057c4fce5aaca4e3152ae76d..bfd4bb1841d088b4334f726acc9c57222a147eb3 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -147,6 +147,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, { struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; + struct iphdr *iph; int i = 0;
if (skb_vlan_tag_present(skb)) { @@ -165,6 +166,19 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; + case htons(ETH_P_IP): + if (!pskb_may_pull(skb, sizeof(*iph))) + break; + + iph = (struct iphdr *)skb_network_header(skb); + if (iph->protocol != IPPROTO_IPIP) + break; + + tuple->encap[i].proto = htons(ETH_P_IP); + tuple->encap[i].id = jhash_3words(ntohl(iph->daddr), + ntohl(iph->saddr), + IPPROTO_IPIP, 0); + break; } }
@@ -277,13 +291,37 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; }
+static bool nf_flow_ip4_encap_proto(struct sk_buff *skb, u16 *size) +{ + struct iphdr *iph; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return false; + + iph = (struct iphdr *)skb_network_header(skb); + *size = iph->ihl << 2; + + if (ip_is_fragment(iph) || unlikely(ip_has_options(*size))) + return false; + + if (iph->ttl <= 1) + return false; + + return iph->protocol == IPPROTO_IPIP; +} + static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto; + u16 size;
switch (skb->protocol) { + case htons(ETH_P_IP): + if (nf_flow_ip4_encap_proto(skb, &size)) + *offset += size; + return true; case htons(ETH_P_8021Q): if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) return false; @@ -310,6 +348,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, struct flow_offload_tuple_rhash *tuplehash) { struct vlan_hdr *vlan_hdr; + u16 size; int i;
for (i = 0; i < tuplehash->tuple.encap_num; i++) { @@ -331,6 +370,12 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } } + + if (skb->protocol == htons(ETH_P_IP) && + nf_flow_ip4_encap_proto(skb, &size)) { + skb_pull(skb, size); + skb_reset_network_header(skb); + } }
static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, @@ -357,8 +402,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {};
- if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL;
if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 225ff293cd50081a30fc82feeed5bb054f6387f0..4fe9a5e5dab839b17fc2acea835b72efccf7e1d9 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -108,6 +108,7 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, case DEV_PATH_DSA: case DEV_PATH_VLAN: case DEV_PATH_PPPOE: + case DEV_PATH_IPENCAP: info->indev = path->dev; if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
Lorenzo Bianconi lorenzo@kernel.org wrote:
- path->type = DEV_PATH_IPENCAP;
- path->dev = ctx->dev;
- path->encap.proto = htons(ETH_P_IP);
- path->encap.id = jhash_3words(ntohl(tiph->saddr), ntohl(tiph->daddr),
IPPROTO_IPIP, 0);
I think it would be better to have a helper. Else I think this needs a comment that explains it must be kept in sync with nf_flow_tuple_encap().
Or use __ipv4_addr_hash(tiph->saddr, (__force __u32)tiph->daddr). (loses IPPROTO_IPIP though).
@@ -165,6 +166,19 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break;
- case htons(ETH_P_IP):
if (!pskb_may_pull(skb, sizeof(*iph)))
break;
Is this needed? Caller does:
if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1;
and then populates the inner header: iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v4.s_addr = iph->saddr;
.... so I think this can rely on the outer header being available via skb_network_header().
tuple->encap[i].proto = htons(ETH_P_IP);
tuple->encap[i].id = jhash_3words(ntohl(iph->daddr),
ntohl(iph->saddr),
IPPROTO_IPIP, 0);
See above, I think this deserves a helper or a comment, or both.
+static bool nf_flow_ip4_encap_proto(struct sk_buff *skb, u16 *size) +{
- struct iphdr *iph;
- if (!pskb_may_pull(skb, sizeof(*iph)))
return false;
Nit: I think this could be 2 * sizeof() and a comment that we will also need the inner ip header later, might save one reallocation.
- iph = (struct iphdr *)skb_network_header(skb);
- *size = iph->ihl << 2;
I think this should be sanity tested vs. sizeof(iph).
static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto;
- u16 size;
switch (skb->protocol) {
- case htons(ETH_P_IP):
if (nf_flow_ip4_encap_proto(skb, &size))
*offset += size;
Nit: return nf_flow_ip4_encap_proto(skb, &offset) ?
Lorenzo Bianconi lorenzo@kernel.org wrote:
- path->type = DEV_PATH_IPENCAP;
- path->dev = ctx->dev;
- path->encap.proto = htons(ETH_P_IP);
- path->encap.id = jhash_3words(ntohl(tiph->saddr), ntohl(tiph->daddr),
IPPROTO_IPIP, 0);
I think it would be better to have a helper. Else I think this needs a comment that explains it must be kept in sync with nf_flow_tuple_encap().
Or use __ipv4_addr_hash(tiph->saddr, (__force __u32)tiph->daddr). (loses IPPROTO_IPIP though).
ack, I will fix it in v5.
@@ -165,6 +166,19 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break;
- case htons(ETH_P_IP):
if (!pskb_may_pull(skb, sizeof(*iph)))
break;
Is this needed? Caller does:
if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1;
and then populates the inner header: iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v4.s_addr = iph->saddr;
.... so I think this can rely on the outer header being available via skb_network_header().
I agree, I will fix it in v5.
tuple->encap[i].proto = htons(ETH_P_IP);
tuple->encap[i].id = jhash_3words(ntohl(iph->daddr),
ntohl(iph->saddr),
IPPROTO_IPIP, 0);
See above, I think this deserves a helper or a comment, or both.
+static bool nf_flow_ip4_encap_proto(struct sk_buff *skb, u16 *size) +{
- struct iphdr *iph;
- if (!pskb_may_pull(skb, sizeof(*iph)))
return false;
Nit: I think this could be 2 * sizeof() and a comment that we will also need the inner ip header later, might save one reallocation.
nf_flow_ip4_encap_proto() is used even for plain IP traffic but I guess we can assume the IP payload is at least 20B, right?
- iph = (struct iphdr *)skb_network_header(skb);
- *size = iph->ihl << 2;
I think this should be sanity tested vs. sizeof(iph).
I guess this is already done in ip_has_options(), agree?
static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto;
- u16 size;
switch (skb->protocol) {
- case htons(ETH_P_IP):
if (nf_flow_ip4_encap_proto(skb, &size))
*offset += size;
Nit: return nf_flow_ip4_encap_proto(skb, &offset) ?
ack, I will fix it in v5.
Regards, Lorenzo
Lorenzo Bianconi lorenzo@kernel.org wrote:
Lorenzo Bianconi lorenzo@kernel.org wrote:
+static bool nf_flow_ip4_encap_proto(struct sk_buff *skb, u16 *size) +{
- struct iphdr *iph;
- if (!pskb_may_pull(skb, sizeof(*iph)))
return false;
Nit: I think this could be 2 * sizeof() and a comment that we will also need the inner ip header later, might save one reallocation.
nf_flow_ip4_encap_proto() is used even for plain IP traffic but I guess we can assume the IP payload is at least 20B, right?
Oh, right, I missed that. But even if we have e.g. an ip header with an icmp header, then the postconditions are the same, no?
as-is: pskb_may_pull -> ok, then iph->protocol == IPPROTO_IPIP -> return false
with 2*iph: pskb_may_pull -> return false
... but I'll leave it up to you, if you prefer pskb_may_pull(skb, sizeof(*iph))) for clarity then lets keep it as-is.
- iph = (struct iphdr *)skb_network_header(skb);
- *size = iph->ihl << 2;
I think this should be sanity tested vs. sizeof(iph).
I guess this is already done in ip_has_options(), agree?
Indeed it is! Nevermind then :-)
Lorenzo Bianconi lorenzo@kernel.org wrote:
Lorenzo Bianconi lorenzo@kernel.org wrote:
+static bool nf_flow_ip4_encap_proto(struct sk_buff *skb, u16 *size) +{
- struct iphdr *iph;
- if (!pskb_may_pull(skb, sizeof(*iph)))
return false;
Nit: I think this could be 2 * sizeof() and a comment that we will also need the inner ip header later, might save one reallocation.
nf_flow_ip4_encap_proto() is used even for plain IP traffic but I guess we can assume the IP payload is at least 20B, right?
Oh, right, I missed that. But even if we have e.g. an ip header with an icmp header, then the postconditions are the same, no?
as-is: pskb_may_pull -> ok, then iph->protocol == IPPROTO_IPIP -> return false
with 2*iph: pskb_may_pull -> return false
... but I'll leave it up to you, if you prefer pskb_may_pull(skb, sizeof(*iph))) for clarity then lets keep it as-is.
I guess the point is we run nf_flow_skb_encap_protocol() not only for IPIP traffic but even for plain IP traffic (e.g. IP+UDP) in nf_flow_offload_lookup(). In particular, we run the following check in nf_flow_tuple_ip() for IP+UDP traffic:
pskb_may_pull(, 28)
That is less restrictive with respect to
pskb_may_pull(, 40)
I guess it is better to keep the original check in nf_flow_skb_encap_protocol(). What do you think?
Regards, Lorenzo
- iph = (struct iphdr *)skb_network_header(skb);
- *size = iph->ihl << 2;
I think this should be sanity tested vs. sizeof(iph).
I guess this is already done in ip_has_options(), agree?
Indeed it is! Nevermind then :-)
Lorenzo Bianconi lorenzo@kernel.org wrote:
I guess the point is we run nf_flow_skb_encap_protocol() not only for IPIP traffic but even for plain IP traffic (e.g. IP+UDP) in nf_flow_offload_lookup(). In particular, we run the following check in nf_flow_tuple_ip() for IP+UDP traffic:
pskb_may_pull(, 28)
That is less restrictive with respect to
pskb_may_pull(, 40)
It's in different functions? 40 bytes in the linear area is a prerequisite for IPIP, and a check failure will not drop the skb; we'll still progress onwards. But ok, it will be caught later too, so let's keep it as-is.
Introduce specific selftest for IPIP flowtable SW acceleration in nft_flowtable.sh
Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org --- .../selftests/net/netfilter/nft_flowtable.sh | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+)
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a4ee5496f2a17cedf1ee71214397012c7906650f..d1c9d3eeda2c9874008f9d6de6cabaabea79b9fb 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -519,6 +519,44 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then ip netns exec "$nsr1" nft list ruleset fi
+# IPIP tunnel test: +# Add IPIP tunnel interfaces and check flowtable acceleration. +test_ipip() { +if ! ip -net "$nsr1" link add name tun0 type ipip \ + local 192.168.10.1 remote 192.168.10.2 >/dev/null;then + echo "SKIP: could not add ipip tunnel" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" link set tun0 up +ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 +ip -net "$nsr2" link set tun0 up +ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr1" route change default via 192.168.100.2 +ip -net "$nsr2" route change default via 192.168.100.1 +ip -net "$ns2" route add default via 10.0.2.1 + +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward \ + 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Restore the previous configuration +ip -net "$nsr1" route change default via 192.168.10.2 +ip -net "$nsr2" route change default via 192.168.10.1 +ip -net "$ns2" route del default via 10.0.2.1 +} + # Another test: # Add bridge interface br0 to Router1, with NAT enabled. test_bridge() { @@ -604,6 +642,8 @@ ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad ip -net "$nsr1" link set up dev veth0 }
+test_ipip + test_bridge
KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1)
linux-kselftest-mirror@lists.linaro.org