The cb fields network_offset and inner_network_offset are used instead of skb->network_header throughout GRO.
These fields are then leveraged in the next commit to remove flush_id state from napi_gro_cb, and stateful code in {ipv6,inet}_gro_receive which may be unnecessarily complicated due to encapsulation support in GRO. These fields are checked in L4 instead.
3rd patch adds tests for different flush_id flows in GRO.
v8 -> v9: - rename skb_gro_network_offset to skb_gro_receive_network_offset for clarification - improved code readability in tests and gro_network_flush functions - v8: https://lore.kernel.org/all/20240506093550.128210-1-richardbgobert@gmail.com...
v7 -> v8: - Remove network_header use in gro - Re-send commits after the dependent patch to net was applied - v7: https://lore.kernel.org/all/20240412155533.115507-1-richardbgobert@gmail.com...
v6 -> v7: - Moved bug fixes to a separate submission in net - Added UDP fwd benchmark - v6: https://lore.kernel.org/all/20240410153423.107381-1-richardbgobert@gmail.com...
v5 -> v6: - Write inner_network_offset in vxlan and geneve - Ignore is_atomic when DF=0 - v5: https://lore.kernel.org/all/20240408141720.98832-1-richardbgobert@gmail.com/
v4 -> v5: - Add 1st commit - flush id checks in udp_gro_receive segment which can be backported by itself - Add TCP measurements for the 5th commit - Add flush id tests to ensure flush id logic is preserved in GRO - Simplify gro_inet_flush by removing a branch - v4: https://lore.kernel.org/all/202420325182543.87683-1-richardbgobert@gmail.com...
v3 -> v4: - Fix code comment and commit message typos - v3: https://lore.kernel.org/all/f939c84a-2322-4393-a5b0-9b1e0be8ed8e@gmail.com/
v2 -> v3: - Use napi_gro_cb instead of skb->{offset} - v2: https://lore.kernel.org/all/2ce1600b-e733-448b-91ac-9d0ae2b866a4@gmail.com/
v1 -> v2: - Pass p_off in *_gro_complete to fix UDP bug - Remove more conditionals and memory fetches from inet_gro_flush - v1: https://lore.kernel.org/netdev/e1d22505-c5f8-4c02-a997-64248480338b@gmail.co...
Richard Gobert (3): net: gro: use cb instead of skb->network_header net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment selftests/net: add flush id selftests
include/net/gro.h | 87 ++++++++++++++++--- net/core/gro.c | 3 - net/ipv4/af_inet.c | 45 +--------- net/ipv4/tcp_offload.c | 20 ++--- net/ipv4/udp_offload.c | 10 +-- net/ipv6/ip6_offload.c | 16 +--- net/ipv6/tcpv6_offload.c | 3 +- tools/testing/selftests/net/gro.c | 138 ++++++++++++++++++++++++++++++ 8 files changed, 227 insertions(+), 95 deletions(-)
This patch converts references of skb->network_header to napi_gro_cb's network_offset and inner_network_offset.
Signed-off-by: Richard Gobert richardbgobert@gmail.com --- include/net/gro.h | 9 +++++++-- net/ipv4/af_inet.c | 4 ---- net/ipv4/tcp_offload.c | 3 ++- net/ipv6/ip6_offload.c | 5 ++--- net/ipv6/tcpv6_offload.c | 3 ++- 5 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/include/net/gro.h b/include/net/gro.h index c1d4ca0463a1..3dafa0f31ae1 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -181,12 +181,17 @@ static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen, return ptr; }
+static inline int skb_gro_receive_network_offset(const struct sk_buff *skb) +{ + return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark]; +} + static inline void *skb_gro_network_header(const struct sk_buff *skb) { if (skb_gro_may_pull(skb, skb_gro_offset(skb))) - return skb_gro_header_fast(skb, skb_network_offset(skb)); + return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb));
- return skb_network_header(skb); + return skb->data + skb_gro_receive_network_offset(skb); }
static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a7bad18bc8b5..428196e1541f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1569,10 +1569,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); NAPI_GRO_CB(skb)->flush |= flush; - skb_set_network_header(skb, off); - /* The above will be needed by the transport layer if there is one - * immediately following this IP hdr. - */ NAPI_GRO_CB(skb)->inner_network_offset = off;
/* Note : No need to call skb_gro_postpull_rcsum() here, diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index fab0973f995b..b70ae50e658d 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -330,7 +330,8 @@ struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) { - const struct iphdr *iph = ip_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb);
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c8b909a9904f..288c7c6ea50f 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -67,7 +67,7 @@ static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) off += len; }
- skb_gro_pull(skb, off - skb_network_offset(skb)); + skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb)); return proto; }
@@ -236,7 +236,6 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, if (unlikely(!iph)) goto out;
- skb_set_network_header(skb, off); NAPI_GRO_CB(skb)->inner_network_offset = off;
flush += ntohs(iph->payload_len) != skb->len - hlen; @@ -260,7 +259,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, NAPI_GRO_CB(skb)->proto = proto;
flush--; - nlen = skb_network_header_len(skb); + nlen = skb_gro_offset(skb) - off;
list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 4b07d1e6c952..e70d60e0f86f 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -29,7 +29,8 @@ struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; + const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb);
th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP flows.
To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive
patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive
patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert richardbgobert@gmail.com --- include/net/gro.h | 78 +++++++++++++++++++++++++++++++++++++----- net/core/gro.c | 3 -- net/ipv4/af_inet.c | 41 +--------------------- net/ipv4/tcp_offload.c | 17 ++------- net/ipv4/udp_offload.c | 10 ++---- net/ipv6/ip6_offload.c | 11 ------ 6 files changed, 76 insertions(+), 84 deletions(-)
diff --git a/include/net/gro.h b/include/net/gro.h index 3dafa0f31ae1..e0939b4b6579 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -36,15 +36,15 @@ struct napi_gro_cb { /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush;
- /* Save the IP ID here and check when we get to the transport layer */ - u16 flush_id; - /* Number of segments aggregated. */ u16 count;
/* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto;
+ /* used to support CHECKSUM_COMPLETE for tunneling protocols */ + __wsum csum; + /* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 @@ -75,8 +75,8 @@ struct napi_gro_cb { /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1;
- /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; + /* Used to determine if ipid_offset can be ignored */ + u8 ip_fixedid:1;
/* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; @@ -85,9 +85,6 @@ struct napi_gro_cb { u8 is_flist:1; );
- /* used to support CHECKSUM_COMPLETE for tunneling protocols */ - __wsum csum; - /* L3 offsets */ union { struct { @@ -442,6 +439,71 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, skb_gro_len(skb), proto, 0)); }
+static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, + struct sk_buff *p, bool outer) +{ + const u32 id = ntohl(*(__be32 *)&iph->id); + const u32 id2 = ntohl(*(__be32 *)&iph2->id); + const u16 ipid_offset = (id >> 16) - (id2 >> 16); + const u16 count = NAPI_GRO_CB(p)->count; + const u32 df = id & IP_DF; + int flush; + + /* All fields must match except length and checksum. */ + flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); + + if (outer && df) + return flush; + + /* When we receive our second frame we can make a decision on if we + * continue this flow as an atomic flow with a fixed ID or if we use + * an incrementing ID. + */ + if (count == 1 && df && !ipid_offset) + NAPI_GRO_CB(p)->ip_fixedid = true; + + if (NAPI_GRO_CB(p)->ip_fixedid && df) + return flush | ipid_offset; + + return flush | (ipid_offset ^ count); +} + +static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) +{ + /* Version:4<Traffic_Class:8><Flow_Label:20> */ + __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2; + + /* Flush if Traffic Class fields are different. */ + return !!((first_word & htonl(0x0FF00000)) | + (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); +} + +static inline int __gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p, const u16 diff, + bool outer) +{ + const void *nh = th - diff; + const void *nh2 = th2 - diff; + + if (((struct iphdr *)nh)->version == 6) + return ipv6_gro_flush(nh, nh2); + else + return inet_gro_flush(nh, nh2, p, outer); +} + +static inline int gro_receive_network_flush(const void *th, const void *th2, + struct sk_buff *p, int off) +{ + const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; + int flush; + + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); + if (encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); + + return flush; +} + int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ diff --git a/net/core/gro.c b/net/core/gro.c index 99a45a5211c9..3e9422c23bc9 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -331,8 +331,6 @@ static void gro_list_prepare(const struct list_head *head, list_for_each_entry(p, head, list) { unsigned long diffs;
- NAPI_GRO_CB(p)->flush = 0; - if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; @@ -472,7 +470,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 428196e1541f..44564d009e95 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1482,7 +1482,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) struct sk_buff *p; unsigned int hlen; unsigned int off; - unsigned int id; int flush = 1; int proto;
@@ -1508,13 +1507,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) goto out;
NAPI_GRO_CB(skb)->proto = proto; - id = ntohl(*(__be32 *)&iph->id); - flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); - id >>= 16; + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
list_for_each_entry(p, head, list) { struct iphdr *iph2; - u16 flush_id;
if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -1531,43 +1527,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) NAPI_GRO_CB(p)->same_flow = 0; continue; } - - /* All fields must match except length and checksum. */ - NAPI_GRO_CB(p)->flush |= - (iph->ttl ^ iph2->ttl) | - (iph->tos ^ iph2->tos) | - ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); - - NAPI_GRO_CB(p)->flush |= flush; - - /* We need to store of the IP ID check to be included later - * when we can verify that this packet does in fact belong - * to a given flow. - */ - flush_id = (u16)(id - ntohs(iph2->id)); - - /* This bit of code makes it much easier for us to identify - * the cases where we are doing atomic vs non-atomic IP ID - * checks. Specifically an atomic check can return IP ID - * values 0 - 0xFFFF, while a non-atomic check can only - * return 0 or 0xFFFF. - */ - if (!NAPI_GRO_CB(p)->is_atomic || - !(iph->frag_off & htons(IP_DF))) { - flush_id ^= NAPI_GRO_CB(p)->count; - flush_id = flush_id ? 0xFFFF : 0; - } - - /* If the previous IP ID value was based on an atomic - * datagram we can overwrite the value and ignore it. - */ - if (NAPI_GRO_CB(skb)->is_atomic) - NAPI_GRO_CB(p)->flush_id = flush_id; - else - NAPI_GRO_CB(p)->flush_id |= flush_id; }
- NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); NAPI_GRO_CB(skb)->flush |= flush; NAPI_GRO_CB(skb)->inner_network_offset = off;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index b70ae50e658d..5f0af1338d62 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -232,9 +232,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) goto out_check_final;
found: - /* Include the IP ID check below from the inner most IP hdr */ - flush = NAPI_GRO_CB(p)->flush; - flush |= (__force int)(flags & TCP_FLAG_CWR); + flush = (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); @@ -242,16 +240,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i);
- /* When we receive our second frame we can made a decision on if we - * continue this flow as an atomic flow with a fixed ID or if we use - * an incrementing ID. - */ - if (NAPI_GRO_CB(p)->flush_id != 1 || - NAPI_GRO_CB(p)->count != 1 || - !NAPI_GRO_CB(p)->is_atomic) - flush |= NAPI_GRO_CB(p)->flush_id; - else - NAPI_GRO_CB(p)->is_atomic = false; + flush |= gro_receive_network_flush(th, th2, p, off);
mss = skb_shinfo(p)->gso_size;
@@ -338,7 +327,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (NAPI_GRO_CB(skb)->is_atomic * SKB_GSO_TCP_FIXEDID); + (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
tcp_gro_complete(skb); return 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8721fe5beca2..726565159dc7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -466,6 +466,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + int off = skb_gro_offset(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; @@ -505,14 +506,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return p; }
- flush = NAPI_GRO_CB(p)->flush; - - if (NAPI_GRO_CB(p)->flush_id != 1 || - NAPI_GRO_CB(p)->count != 1 || - !NAPI_GRO_CB(p)->is_atomic) - flush |= NAPI_GRO_CB(p)->flush_id; - else - NAPI_GRO_CB(p)->is_atomic = false; + flush = gro_receive_network_flush(uh, uh2, p, off);
/* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 288c7c6ea50f..bd5aff97d8b1 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -290,19 +290,8 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } - /* flush if Traffic Class fields are different */ - NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) | - (__force __be32)(iph->hop_limit ^ iph2->hop_limit)); - NAPI_GRO_CB(p)->flush |= flush; - - /* If the previous IP ID value was based on an atomic - * datagram we can overwrite the value and ignore it. - */ - if (NAPI_GRO_CB(skb)->is_atomic) - NAPI_GRO_CB(p)->flush_id = 0; }
- NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_postpull_rcsum(skb, iph, nlen);
On Tue, May 7, 2024 at 6:30 PM Richard Gobert richardbgobert@gmail.com wrote:
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP flows.
To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive
patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive
patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert richardbgobert@gmail.com
include/net/gro.h | 78 +++++++++++++++++++++++++++++++++++++----- net/core/gro.c | 3 -- net/ipv4/af_inet.c | 41 +--------------------- net/ipv4/tcp_offload.c | 17 ++------- net/ipv4/udp_offload.c | 10 ++---- net/ipv6/ip6_offload.c | 11 ------ 6 files changed, 76 insertions(+), 84 deletions(-)
diff --git a/include/net/gro.h b/include/net/gro.h index 3dafa0f31ae1..e0939b4b6579 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -36,15 +36,15 @@ struct napi_gro_cb { /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush;
/* Save the IP ID here and check when we get to the transport layer */
u16 flush_id;
/* Number of segments aggregated. */ u16 count; /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto;
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
/* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 @@ -75,8 +75,8 @@ struct napi_gro_cb { /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1;
/* Used to determine if flush_id can be ignored */
u8 is_atomic:1;
/* Used to determine if ipid_offset can be ignored */
u8 ip_fixedid:1; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4;
@@ -85,9 +85,6 @@ struct napi_gro_cb { u8 is_flist:1; );
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
/* L3 offsets */ union { struct {
@@ -442,6 +439,71 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, skb_gro_len(skb), proto, 0)); }
+static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
struct sk_buff *p, bool outer)
+{
const u32 id = ntohl(*(__be32 *)&iph->id);
const u32 id2 = ntohl(*(__be32 *)&iph2->id);
const u16 ipid_offset = (id >> 16) - (id2 >> 16);
const u16 count = NAPI_GRO_CB(p)->count;
const u32 df = id & IP_DF;
int flush;
/* All fields must match except length and checksum. */
flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
if (outer && df)
return flush;
/* When we receive our second frame we can make a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
if (count == 1 && df && !ipid_offset)
NAPI_GRO_CB(p)->ip_fixedid = true;
I could not find where NAPI_GRO_CB(p)->ip_fixedid was cleared, or initialized if/when GRO is fed with a GRO/GSO packet.
if (NAPI_GRO_CB(p)->ip_fixedid && df)
return flush | ipid_offset;
return flush | (ipid_offset ^ count);
+}
+static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) +{
/* <Version:4><Traffic_Class:8><Flow_Label:20> */
__be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
/* Flush if Traffic Class fields are different. */
return !!((first_word & htonl(0x0FF00000)) |
(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
+}
+static inline int __gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p, const u16 diff,
bool outer)
+{
const void *nh = th - diff;
const void *nh2 = th2 - diff;
if (((struct iphdr *)nh)->version == 6)
return ipv6_gro_flush(nh, nh2);
else
return inet_gro_flush(nh, nh2, p, outer);
+}
+static inline int gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p, int off)
+{
const bool encap_mark = NAPI_GRO_CB(p)->encap_mark;
int flush;
flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark);
if (encap_mark)
flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false);
return flush;
+}
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ diff --git a/net/core/gro.c b/net/core/gro.c index 99a45a5211c9..3e9422c23bc9 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -331,8 +331,6 @@ static void gro_list_prepare(const struct list_head *head, list_for_each_entry(p, head, list) { unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue;
@@ -472,7 +470,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 428196e1541f..44564d009e95 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1482,7 +1482,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) struct sk_buff *p; unsigned int hlen; unsigned int off;
unsigned int id; int flush = 1; int proto;
@@ -1508,13 +1507,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) goto out;
NAPI_GRO_CB(skb)->proto = proto;
id = ntohl(*(__be32 *)&iph->id);
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF)); list_for_each_entry(p, head, list) { struct iphdr *iph2;
u16 flush_id; if (!NAPI_GRO_CB(p)->same_flow) continue;
@@ -1531,43 +1527,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) NAPI_GRO_CB(p)->same_flow = 0; continue; }
/* All fields must match except length and checksum. */
NAPI_GRO_CB(p)->flush |=
(iph->ttl ^ iph2->ttl) |
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
NAPI_GRO_CB(p)->flush |= flush;
/* We need to store of the IP ID check to be included later
* when we can verify that this packet does in fact belong
* to a given flow.
*/
flush_id = (u16)(id - ntohs(iph2->id));
/* This bit of code makes it much easier for us to identify
* the cases where we are doing atomic vs non-atomic IP ID
* checks. Specifically an atomic check can return IP ID
* values 0 - 0xFFFF, while a non-atomic check can only
* return 0 or 0xFFFF.
*/
if (!NAPI_GRO_CB(p)->is_atomic ||
!(iph->frag_off & htons(IP_DF))) {
flush_id ^= NAPI_GRO_CB(p)->count;
flush_id = flush_id ? 0xFFFF : 0;
}
/* If the previous IP ID value was based on an atomic
* datagram we can overwrite the value and ignore it.
*/
if (NAPI_GRO_CB(skb)->is_atomic)
NAPI_GRO_CB(p)->flush_id = flush_id;
else
NAPI_GRO_CB(p)->flush_id |= flush_id; }
NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); NAPI_GRO_CB(skb)->flush |= flush; NAPI_GRO_CB(skb)->inner_network_offset = off;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index b70ae50e658d..5f0af1338d62 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -232,9 +232,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) goto out_check_final;
found:
/* Include the IP ID check below from the inner most IP hdr */
flush = NAPI_GRO_CB(p)->flush;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush = (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
@@ -242,16 +240,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i);
/* When we receive our second frame we can made a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
if (NAPI_GRO_CB(p)->flush_id != 1 ||
NAPI_GRO_CB(p)->count != 1 ||
!NAPI_GRO_CB(p)->is_atomic)
flush |= NAPI_GRO_CB(p)->flush_id;
else
NAPI_GRO_CB(p)->is_atomic = false;
flush |= gro_receive_network_flush(th, th2, p, off); mss = skb_shinfo(p)->gso_size;
@@ -338,7 +327,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
(NAPI_GRO_CB(skb)->is_atomic * SKB_GSO_TCP_FIXEDID);
(NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8721fe5beca2..726565159dc7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -466,6 +466,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb);
int off = skb_gro_offset(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p;
@@ -505,14 +506,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return p; }
flush = NAPI_GRO_CB(p)->flush;
if (NAPI_GRO_CB(p)->flush_id != 1 ||
NAPI_GRO_CB(p)->count != 1 ||
!NAPI_GRO_CB(p)->is_atomic)
flush |= NAPI_GRO_CB(p)->flush_id;
else
NAPI_GRO_CB(p)->is_atomic = false;
flush = gro_receive_network_flush(uh, uh2, p, off); /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 288c7c6ea50f..bd5aff97d8b1 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -290,19 +290,8 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, nlen - sizeof(struct ipv6hdr))) goto not_same_flow; }
/* flush if Traffic Class fields are different */
NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) |
(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
NAPI_GRO_CB(p)->flush |= flush;
/* If the previous IP ID value was based on an atomic
* datagram we can overwrite the value and ignore it.
*/
if (NAPI_GRO_CB(skb)->is_atomic)
NAPI_GRO_CB(p)->flush_id = 0; }
NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen);
-- 2.36.1
Eric Dumazet wrote:
On Tue, May 7, 2024 at 6:30 PM Richard Gobert richardbgobert@gmail.com wrote:
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP flows.
To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive
patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive
patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert richardbgobert@gmail.com
include/net/gro.h | 78 +++++++++++++++++++++++++++++++++++++----- net/core/gro.c | 3 -- net/ipv4/af_inet.c | 41 +--------------------- net/ipv4/tcp_offload.c | 17 ++------- net/ipv4/udp_offload.c | 10 ++---- net/ipv6/ip6_offload.c | 11 ------ 6 files changed, 76 insertions(+), 84 deletions(-)
diff --git a/include/net/gro.h b/include/net/gro.h index 3dafa0f31ae1..e0939b4b6579 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -36,15 +36,15 @@ struct napi_gro_cb { /* This is non-zero if the packet cannot be merged with the new skb. */ u16 flush;
/* Save the IP ID here and check when we get to the transport layer */
u16 flush_id;
/* Number of segments aggregated. */ u16 count; /* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */ u16 proto;
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
/* Used in napi_gro_cb::free */ #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 @@ -75,8 +75,8 @@ struct napi_gro_cb { /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1;
/* Used to determine if flush_id can be ignored */
u8 is_atomic:1;
/* Used to determine if ipid_offset can be ignored */
u8 ip_fixedid:1; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4;
@@ -85,9 +85,6 @@ struct napi_gro_cb { u8 is_flist:1; );
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
/* L3 offsets */ union { struct {
@@ -442,6 +439,71 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, skb_gro_len(skb), proto, 0)); }
+static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
struct sk_buff *p, bool outer)
+{
const u32 id = ntohl(*(__be32 *)&iph->id);
const u32 id2 = ntohl(*(__be32 *)&iph2->id);
const u16 ipid_offset = (id >> 16) - (id2 >> 16);
const u16 count = NAPI_GRO_CB(p)->count;
const u32 df = id & IP_DF;
int flush;
/* All fields must match except length and checksum. */
flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
if (outer && df)
return flush;
/* When we receive our second frame we can make a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
if (count == 1 && df && !ipid_offset)
NAPI_GRO_CB(p)->ip_fixedid = true;
I could not find where NAPI_GRO_CB(p)->ip_fixedid was cleared, or initialized if/when GRO is fed with a GRO/GSO packet.
Interesting, I think that is indeed a bug, that exists also in the current implementation. NAPI_GRO_CB(p)->ip_fixedid (is_atomic before we renamed it in this commit) is cleared as being part of NAPI_GRO_CB(skb)->zeroed in dev_gro_receive. However, in cases like RSC the HW should tell us if we continue to check against FIXEDID or not. Ideally we could initialized ip_fixedid accordingly to skb_shinfo(skb)->gso_type, but I looked around some drivers with RSC and it seems that not all of them support the SKB_GSO_TCP_FIXEDID gso_type.
For example in mlx5:
static void mlx5e_shampo_update_ipv4_tcp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4, struct mlx5_cqe64 *cqe, bool match) { ... skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; ... }
But in google's gve:
static int gve_rx_complete_rsc(struct sk_buff *skb, const struct gve_rx_compl_desc_dqo *desc, struct gve_ptype ptype) { ... switch (ptype.l3_type) { case GVE_L3_TYPE_IPV4: shinfo->gso_type = SKB_GSO_TCPV4; break; ... } And there is no other reference to SKB_GSO_TCP_FIXEDID around the driver.
I'm not sure if we could fix this issue in software GRO in the case where SKB_GSO_TCP_FIXEDID is not set, as we might not have enough information about the ip ids of the hw gro packets. Even if we could, without information from the driver it would add checks and cancel some of the offloading advantages for drivers that assign FIXEDID, so that is probably not ideal. This bug should be reproducible on hw with RSC support on current GRO implementation, as it is unrelated to this series, and be opened as a different patch to net.
if (NAPI_GRO_CB(p)->ip_fixedid && df)
return flush | ipid_offset;
return flush | (ipid_offset ^ count);
+}
+static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) +{
/* <Version:4><Traffic_Class:8><Flow_Label:20> */
__be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
/* Flush if Traffic Class fields are different. */
return !!((first_word & htonl(0x0FF00000)) |
(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
+}
+static inline int __gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p, const u16 diff,
bool outer)
+{
const void *nh = th - diff;
const void *nh2 = th2 - diff;
if (((struct iphdr *)nh)->version == 6)
return ipv6_gro_flush(nh, nh2);
else
return inet_gro_flush(nh, nh2, p, outer);
+}
+static inline int gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p, int off)
+{
const bool encap_mark = NAPI_GRO_CB(p)->encap_mark;
int flush;
flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark);
if (encap_mark)
flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false);
return flush;
+}
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ diff --git a/net/core/gro.c b/net/core/gro.c index 99a45a5211c9..3e9422c23bc9 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -331,8 +331,6 @@ static void gro_list_prepare(const struct list_head *head, list_for_each_entry(p, head, list) { unsigned long diffs;
NAPI_GRO_CB(p)->flush = 0;
if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue;
@@ -472,7 +470,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 428196e1541f..44564d009e95 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1482,7 +1482,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) struct sk_buff *p; unsigned int hlen; unsigned int off;
unsigned int id; int flush = 1; int proto;
@@ -1508,13 +1507,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) goto out;
NAPI_GRO_CB(skb)->proto = proto;
id = ntohl(*(__be32 *)&iph->id);
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF)); list_for_each_entry(p, head, list) { struct iphdr *iph2;
u16 flush_id; if (!NAPI_GRO_CB(p)->same_flow) continue;
@@ -1531,43 +1527,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) NAPI_GRO_CB(p)->same_flow = 0; continue; }
/* All fields must match except length and checksum. */
NAPI_GRO_CB(p)->flush |=
(iph->ttl ^ iph2->ttl) |
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
NAPI_GRO_CB(p)->flush |= flush;
/* We need to store of the IP ID check to be included later
* when we can verify that this packet does in fact belong
* to a given flow.
*/
flush_id = (u16)(id - ntohs(iph2->id));
/* This bit of code makes it much easier for us to identify
* the cases where we are doing atomic vs non-atomic IP ID
* checks. Specifically an atomic check can return IP ID
* values 0 - 0xFFFF, while a non-atomic check can only
* return 0 or 0xFFFF.
*/
if (!NAPI_GRO_CB(p)->is_atomic ||
!(iph->frag_off & htons(IP_DF))) {
flush_id ^= NAPI_GRO_CB(p)->count;
flush_id = flush_id ? 0xFFFF : 0;
}
/* If the previous IP ID value was based on an atomic
* datagram we can overwrite the value and ignore it.
*/
if (NAPI_GRO_CB(skb)->is_atomic)
NAPI_GRO_CB(p)->flush_id = flush_id;
else
NAPI_GRO_CB(p)->flush_id |= flush_id; }
NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); NAPI_GRO_CB(skb)->flush |= flush; NAPI_GRO_CB(skb)->inner_network_offset = off;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index b70ae50e658d..5f0af1338d62 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -232,9 +232,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) goto out_check_final;
found:
/* Include the IP ID check below from the inner most IP hdr */
flush = NAPI_GRO_CB(p)->flush;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush = (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
@@ -242,16 +240,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i);
/* When we receive our second frame we can made a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
if (NAPI_GRO_CB(p)->flush_id != 1 ||
NAPI_GRO_CB(p)->count != 1 ||
!NAPI_GRO_CB(p)->is_atomic)
flush |= NAPI_GRO_CB(p)->flush_id;
else
NAPI_GRO_CB(p)->is_atomic = false;
flush |= gro_receive_network_flush(th, th2, p, off); mss = skb_shinfo(p)->gso_size;
@@ -338,7 +327,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
(NAPI_GRO_CB(skb)->is_atomic * SKB_GSO_TCP_FIXEDID);
(NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 8721fe5beca2..726565159dc7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -466,6 +466,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb);
int off = skb_gro_offset(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p;
@@ -505,14 +506,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return p; }
flush = NAPI_GRO_CB(p)->flush;
if (NAPI_GRO_CB(p)->flush_id != 1 ||
NAPI_GRO_CB(p)->count != 1 ||
!NAPI_GRO_CB(p)->is_atomic)
flush |= NAPI_GRO_CB(p)->flush_id;
else
NAPI_GRO_CB(p)->is_atomic = false;
flush = gro_receive_network_flush(uh, uh2, p, off); /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 288c7c6ea50f..bd5aff97d8b1 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -290,19 +290,8 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, nlen - sizeof(struct ipv6hdr))) goto not_same_flow; }
/* flush if Traffic Class fields are different */
NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) |
(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
NAPI_GRO_CB(p)->flush |= flush;
/* If the previous IP ID value was based on an atomic
* datagram we can overwrite the value and ignore it.
*/
if (NAPI_GRO_CB(skb)->is_atomic)
NAPI_GRO_CB(p)->flush_id = 0; }
NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen);
-- 2.36.1
On Thu, May 9, 2024 at 8:58 PM Richard Gobert richardbgobert@gmail.com wrote:
Interesting, I think that is indeed a bug, that exists also in the current implementation. NAPI_GRO_CB(p)->ip_fixedid (is_atomic before we renamed it in this commit) is cleared as being part of NAPI_GRO_CB(skb)->zeroed in dev_gro_receive.
And the code there seems wrong.
A compiler can absolutely reorder things, I have seen this many times.
I would play safe here, to make sure NAPI_GRO_CB(skb)->is_atomic = 1; can not be lost.
diff --git a/net/core/gro.c b/net/core/gro.c index c7901253a1a8fc1e9425add77014e15b363a1623..6e4203ea4d54b8955a504e42633f7667740b796e 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -470,6 +470,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; + barrier(); NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1;
Eric Dumazet wrote:
On Thu, May 9, 2024 at 8:58 PM Richard Gobert richardbgobert@gmail.com wrote:
Interesting, I think that is indeed a bug, that exists also in the current implementation. NAPI_GRO_CB(p)->ip_fixedid (is_atomic before we renamed it in this commit) is cleared as being part of NAPI_GRO_CB(skb)->zeroed in dev_gro_receive.
And the code there seems wrong.
A compiler can absolutely reorder things, I have seen this many times.
I would play safe here, to make sure NAPI_GRO_CB(skb)->is_atomic = 1; can not be lost.
diff --git a/net/core/gro.c b/net/core/gro.c index c7901253a1a8fc1e9425add77014e15b363a1623..6e4203ea4d54b8955a504e42633f7667740b796e 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -470,6 +470,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
barrier(); NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1;
I removed NAPI_GRO_CB(skb)->is_atomic = 1 from dev_gro_receive in this series since it is not needed anymore, so going forward this issue is fixed. I understand your concern about previous versions, I could send a patch to net to settle this issue. WDYT?
Richard Gobert wrote:
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP flows.
To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive
patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive
patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert richardbgobert@gmail.com
+static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
struct sk_buff *p, bool outer)
+{
- const u32 id = ntohl(*(__be32 *)&iph->id);
- const u32 id2 = ntohl(*(__be32 *)&iph2->id);
- const u16 ipid_offset = (id >> 16) - (id2 >> 16);
- const u16 count = NAPI_GRO_CB(p)->count;
- const u32 df = id & IP_DF;
- int flush;
- /* All fields must match except length and checksum. */
- flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
- if (outer && df)
return flush;
if (flush) return 1;
To be able to avoid the two flush | below? Or to avoid adding a branch
if (flush | (outer && df)) return 1;
- /* When we receive our second frame we can make a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
- if (count == 1 && df && !ipid_offset)
NAPI_GRO_CB(p)->ip_fixedid = true;
- if (NAPI_GRO_CB(p)->ip_fixedid && df)
return flush | ipid_offset;
- return flush | (ipid_offset ^ count);
And then simply
if (NAPI_GRO_CB(p)->ip_fixedid) return ipid_offset; else return ipid_offset ^ count;
Since NAPI_GRO_CB(p)->ip_fixedid is only set if DF is set on the first two segments, and df ^ id2 & IP_DF is tested above, no need to test that again.
+}
Willem de Bruijn wrote:
Richard Gobert wrote:
{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags, iph->id, ...) against all packets in a loop. These flush checks are used in all merging UDP and TCP flows.
These checks need to be done only once and only against the found p skb, since they only affect flush and not same_flow.
This patch leverages correct network header offsets from the cb for both outer and inner network headers - allowing these checks to be done only once, in tcp_gro_receive and udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are more declarative and contained in inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
This results in less parsing code for non-loop flush tests for TCP and UDP flows.
To make sure results are not within noise range - I've made netfilter drop all TCP packets, and measured CPU performance in GRO (in this case GRO is responsible for about 50% of the CPU utilization).
perf top while replaying 64 parallel IP/TCP streams merging in GRO: (gro_receive_network_flush is compiled inline to tcp_gro_receive) net-next: 6.94% [kernel] [k] inet_gro_receive 3.02% [kernel] [k] tcp_gro_receive
patch applied: 4.27% [kernel] [k] tcp_gro_receive 4.22% [kernel] [k] inet_gro_receive
perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same results for any encapsulation, in this case inet_gro_receive is top offender in net-next) net-next: 10.09% [kernel] [k] inet_gro_receive 2.08% [kernel] [k] tcp_gro_receive
patch applied: 6.97% [kernel] [k] inet_gro_receive 3.68% [kernel] [k] tcp_gro_receive
Signed-off-by: Richard Gobert richardbgobert@gmail.com
+static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
struct sk_buff *p, bool outer)
+{
- const u32 id = ntohl(*(__be32 *)&iph->id);
- const u32 id2 = ntohl(*(__be32 *)&iph2->id);
- const u16 ipid_offset = (id >> 16) - (id2 >> 16);
- const u16 count = NAPI_GRO_CB(p)->count;
- const u32 df = id & IP_DF;
- int flush;
- /* All fields must match except length and checksum. */
- flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
- if (outer && df)
return flush;
if (flush) return 1;
To be able to avoid the two flush | below? Or to avoid adding a branch
if (flush | (outer && df)) return 1;
- /* When we receive our second frame we can make a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
- if (count == 1 && df && !ipid_offset)
NAPI_GRO_CB(p)->ip_fixedid = true;
- if (NAPI_GRO_CB(p)->ip_fixedid && df)
return flush | ipid_offset;
- return flush | (ipid_offset ^ count);
And then simply
if (NAPI_GRO_CB(p)->ip_fixedid) return ipid_offset; else return ipid_offset ^ count;
Since NAPI_GRO_CB(p)->ip_fixedid is only set if DF is set on the first two segments, and df ^ id2 & IP_DF is tested above, no need to test that again.
+}
I like the idea, it is more readable. We just need to return flush, and not 1 to make it correct (since flush could be 0 while outer && df set):
if (flush | (outer && df)) return flush;
Not setting NAPI_GRO_CB(p)->ip_fixedid when flush==1 is a slight change from the previous behaviour. AFAIU it is ok since it doesn't change GRO logic - p will be flushed from gro_list and NAPI_GRO_CB becomes irrelevant. Removing the DF check is nice, I also think we can avoid a branch while keeping the code readable as follows:
return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid);
Added flush id selftests to test different cases where DF flag is set or unset and id value changes in the following packets. All cases where the packets should coalesce or should not coalesce are tested.
Signed-off-by: Richard Gobert richardbgobert@gmail.com --- tools/testing/selftests/net/gro.c | 138 ++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+)
diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index 353e1e867fbb..e8fc94c2b9e5 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -93,6 +93,7 @@ static bool tx_socket = true; static int tcp_offset = -1; static int total_hdr_len = -1; static int ethhdr_proto = -1; +static const int num_flush_id_cases = 6;
static void vlog(const char *fmt, ...) { @@ -617,6 +618,113 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE); }
+static void fix_ip4_checksum(struct iphdr *iph) +{ + iph->check = 0; + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0); +} + +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) +{ + static char buf1[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf2[MAX_HDR_LEN + PAYLOAD_LEN]; + static char buf3[MAX_HDR_LEN + PAYLOAD_LEN]; + bool send_three = false; + struct iphdr *iph1; + struct iphdr *iph2; + struct iphdr *iph3; + + iph1 = (struct iphdr *)(buf1 + ETH_HLEN); + iph2 = (struct iphdr *)(buf2 + ETH_HLEN); + iph3 = (struct iphdr *)(buf3 + ETH_HLEN); + + create_packet(buf1, 0, 0, PAYLOAD_LEN, 0); + create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0); + create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0); + + switch (tcase) { + case 0: /* DF=1, Incrementing - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + break; + + case 1: /* DF=1, Fixed - should coalesce */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + break; + + case 2: /* DF=0, Incrementing - should coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(9); + break; + + case 3: /* DF=0, Fixed - should not coalesce */ + iph1->frag_off &= ~htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off &= ~htons(IP_DF); + iph2->id = htons(8); + break; + + case 4: /* DF=1, two packets incrementing, and one fixed - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(9); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + + case 5: /* DF=1, two packets fixed, and one incrementing - should + * coalesce only the first two packets + */ + iph1->frag_off |= htons(IP_DF); + iph1->id = htons(8); + + iph2->frag_off |= htons(IP_DF); + iph2->id = htons(8); + + iph3->frag_off |= htons(IP_DF); + iph3->id = htons(9); + send_three = true; + break; + } + + fix_ip4_checksum(iph1); + fix_ip4_checksum(iph2); + write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr); + write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr); + + if (send_three) { + fix_ip4_checksum(iph3); + write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr); + } +} + +static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt) +{ + for (int i = 0; i < num_flush_id_cases; i++) { + sleep(1); + send_flush_id_case(fd, daddr, i); + sleep(1); + write_packet(fd, fin_pkt, total_hdr_len, daddr); + } +} + static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2) { static char buf[MAX_HDR_LEN + PAYLOAD_LEN]; @@ -935,6 +1043,8 @@ static void gro_sender(void) send_fragment4(txfd, &daddr); sleep(1); write_packet(txfd, fin_pkt, total_hdr_len, &daddr); + + test_flush_id(txfd, &daddr, fin_pkt); } else if (proto == PF_INET6) { sleep(1); send_fragment6(txfd, &daddr); @@ -1061,6 +1171,34 @@ static void gro_receiver(void)
printf("fragmented ip4 doesn't coalesce: "); check_recv_pkts(rxfd, correct_payload, 2); + + /* is_atomic checks */ + printf("DF=1, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=1, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Incrementing - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); + + printf("DF=0, Fixed - should not coalesce: "); + correct_payload[0] = PAYLOAD_LEN; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); + + printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: "); + correct_payload[0] = PAYLOAD_LEN * 2; + correct_payload[1] = PAYLOAD_LEN; + check_recv_pkts(rxfd, correct_payload, 2); } else if (proto == PF_INET6) { /* GRO doesn't check for ipv6 hop limit when flushing. * Hence no corresponding test to the ipv4 case.
On Tue, 7 May 2024 18:23:45 +0200 Richard Gobert wrote:
The cb fields network_offset and inner_network_offset are used instead of skb->network_header throughout GRO.
These fields are then leveraged in the next commit to remove flush_id state from napi_gro_cb, and stateful code in {ipv6,inet}_gro_receive which may be unnecessarily complicated due to encapsulation support in GRO. These fields are checked in L4 instead.
3rd patch adds tests for different flush_id flows in GRO.
Doesn't apply to either tree, please rebase.
linux-kselftest-mirror@lists.linaro.org