From: Ilpo Järvinen ij@kernel.org
These three byte counters track IP ECN field payload byte sums for all arriving (acceptable) packets for ECT0, ECT1, and CE. The AccECN option (added by a later patch in the series) echoes these counters back to sender side; therefore, it is placed within the group of tcp_sock_write_txrx.
Below are the pahole outcomes before and after this patch, in which extra 4-byte hole is created but will be exploited later:
[BEFORE THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2640 4 */ u32 received_ce; /* 2644 4 */ u32 app_limited; /* 2648 4 */ u32 rcv_wnd; /* 2652 4 */ struct tcp_options_received rx_opt; /* 2656 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2680 0 */
[...] /* size: 3264, cachelines: 51, members: 169 */ }
[AFTER THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2640 4 */ u32 received_ce; /* 2644 4 */ u32 received_ecn_bytes[3];/* 2648 12 */ u32 app_limited; /* 2660 4 */ u32 rcv_wnd; /* 2664 4 */ struct tcp_options_received rx_opt; /* 2668 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2692 0 */ /* XXX 4 bytes hole, try to pack */
[...] /* size: 3264, cachelines: 51, members: 169 */ }
Signed-off-by: Ilpo Järvinen ij@kernel.org Signed-off-by: Neal Cardwell ncardwell@google.com Co-developed-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com Signed-off-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 4 ++++ include/net/tcp.h | 18 +++++++++++++++++- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_input.c | 13 +++++++++---- net/ipv4/tcp_minisocks.c | 3 ++- 6 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index eef8700085de..64306ae096cd 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -102,6 +102,7 @@ u32 prr_out read_mostly read_m u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write +u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b0e6a11a3fa1..427f2adab247 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -307,6 +307,10 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ + u32 received_ecn_bytes[3]; /* received byte counters for three ECN + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, + * and INET_ECN_CE + */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index 5fce4b49e132..101e07b8a089 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -467,7 +467,8 @@ static inline int tcp_accecn_extract_syn_ect(u8 ace) bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect); void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, u8 syn_ect_snt); -void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb); +void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, + u32 payload_len);
enum tcp_tw_status { TCP_TW_SUCCESS = 0, @@ -1035,6 +1036,20 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 + +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +}
/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is * attemped to be negotiated and requested for incoming connection @@ -1053,6 +1068,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); }
/* State flags for sacked in struct tcp_skb_cb */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b9340d1f2bc6..7396bfbe7ab4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5095,6 +5095,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5102,7 +5103,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 96 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 108 + 4);
/* RX read-write hotpath cache lines */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f0475f95e112..76fda2a63add 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6135,7 +6135,8 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t }
/* Updates Accurate ECN received counters from the received IP ECN field */ -void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, + u32 payload_len) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -6150,6 +6151,9 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) tp->received_ce += pcount; tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); + + if (payload_len > 0) + tp->received_ecn_bytes[ecnfield - 1] += payload_len; } }
@@ -6423,7 +6427,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta);
- tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, 0);
/* We know that such packets are checksummed * on entry. @@ -6469,7 +6473,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb); @@ -6516,7 +6521,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); } - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, len - th->doff * 4);
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 779a206a5ca6..3f8225bae49f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -497,10 +497,11 @@ static void tcp_ecn_openreq_child(struct sock *sk, struct tcp_sock *tp = tcp_sk(sk);
if (treq->accecn_ok) { + const struct tcphdr *th = (const struct tcphdr *)skb->data; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 :