From: Ilpo Järvinen ij@kernel.org
These three byte counters track IP ECN field payload byte sums for all arriving (acceptable) packets for ECT0, ECT1, and CE. The AccECN option (added by a later patch in the series) echoes these counters back to sender side; therefore, it is placed within the group of tcp_sock_write_txrx.
Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_txrx is increased from 95 + 4 to 107 + 4 and an extra 4-byte hole is created but will be exploited in later patches:
[BEFORE THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2640 4 */ u32 received_ce; /* 2644 4 */ u32 app_limited; /* 2648 4 */ u32 rcv_wnd; /* 2652 4 */ struct tcp_options_received rx_opt; /* 2656 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2680 0 */
[...] /* size: 3264, cachelines: 51, members: 169 */ }
[AFTER THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2640 4 */ u32 received_ce; /* 2644 4 */ u32 received_ecn_bytes[3];/* 2648 12 */ u32 app_limited; /* 2660 4 */ u32 rcv_wnd; /* 2664 4 */ struct tcp_options_received rx_opt; /* 2668 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2692 0 */ /* XXX 4 bytes hole, try to pack */
[...] /* size: 3264, cachelines: 51, members: 170 */ }
Signed-off-by: Ilpo Järvinen ij@kernel.org Signed-off-by: Neal Cardwell ncardwell@google.com Co-developed-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com Signed-off-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com
--- v8: - Add new helper function tcp_ecn_received_counters_payload() --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 4 +++ include/net/tcp.h | 12 ++----- include/net/tcp_ecn.h | 34 ++++++++++++++++++- net/ipv4/syncookies.c | 1 + net/ipv4/tcp.c | 3 +- net/ipv4/tcp_input.c | 7 ++-- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/syncookies.c | 1 + 9 files changed, 50 insertions(+), 15 deletions(-)
diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 22ac668fe6c7..804480d39132 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -102,6 +102,7 @@ u32 prr_out read_mostly read_m u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write +u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1d8301f2883c..0c2331e186e8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -306,6 +306,10 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ + u32 received_ecn_bytes[3]; /* received byte counters for three ECN + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, + * and INET_ECN_CE + */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index 2d9011557201..62e3ed0a1954 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -652,15 +652,6 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry * dst_feature(dst, RTAX_FEATURE_ECN); }
-/* AccECN specification, 5.1: [...] a server can determine that it - * negotiated AccECN as [...] if the ACK contains an ACE field with - * the value 0b010 to 0b111 (decimal 2 to 7). - */ -static inline bool cookie_accecn_ok(const struct tcphdr *th) -{ - return tcp_accecn_ace(th) > 0x1; -} - #if IS_ENABLED(CONFIG_BPF) static inline bool cookie_bpf_ok(struct sk_buff *skb) { @@ -981,6 +972,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0
/* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index eb9aaabc9866..1027edd3c387 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -176,7 +176,7 @@ static inline void tcp_accecn_third_ack(struct sock *sk,
/* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, u32 len) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -196,9 +196,29 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce += pcount; tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); + + if (len > 0) + tp->received_ecn_bytes[ecnfield - 1] += len; } }
+static inline void tcp_ecn_received_counters_payload(struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); +} + +/* AccECN specification, 5.1: [...] a server can determine that it + * negotiated AccECN as [...] if the ACK contains an ACE field with + * the value 0b010 to 0b111 (decimal 2 to 7). + */ +static inline bool cookie_accecn_ok(const struct tcphdr *th) +{ + return tcp_accecn_ace(th) > 0x1; +} + /* Used to form the ACE flags for SYN/ACK */ static inline u16 tcp_accecn_reflector_flags(u8 ect) { @@ -233,10 +253,22 @@ static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) return ace && ace != 0x3; }
+static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +} + static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); }
/* Used for make_synack to form the ACE flags */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index e0c1e951c53c..569befcf021b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <net/secure_seq.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/route.h>
static siphash_aligned_key_t syncookie_secret[2]; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 31326c462396..8b767f6f9524 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5123,6 +5123,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5130,7 +5131,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4);
/* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7ccfde9bcfda..eea790295e54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6138,7 +6138,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta);
- tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, 0);
/* We know that such packets are checksummed * on entry. @@ -6184,7 +6184,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb); @@ -6231,7 +6232,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); } - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb);
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 845b1c81b3b0..0a0ee33e38ad 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -473,7 +473,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f0b87a3268ae..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> +#include <net/tcp_ecn.h>
#define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)