On Sat, Jun 21, 2025 at 12:37 PM chia-yu.chang@nokia-bell-labs.com wrote:
From: Ilpo Järvinen ij@kernel.org
Accurate ECN negotiation parts based on the specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt
Accurate ECN is negotiated using ECE, CWR and AE flags in the TCP header. TCP falls back into using RFC3168 ECN if one of the ends supports only RFC3168-style ECN.
The AccECN negotiation includes reflecting IP ECN field value seen in SYN and SYNACK back using the same bits as negotiation to allow responding to SYN CE marks and to detect ECN field mangling. CE marks should not occur currently because SYN=1 segments are sent with Non-ECT in IP ECN field (but proposal exists to remove this restriction).
Reflecting SYN IP ECN field in SYNACK is relatively simple. Reflecting SYNACK IP ECN field in the final/third ACK of the handshake is more challenging. Linux TCP code is not well prepared for using the final/third ACK a signalling channel which makes things somewhat complicated here.
tcp_ecn sysctl can be used to select the highest ECN variant (Accurate ECN, ECN, No ECN) that is attemped to be negotiated and requested for incoming connection and outgoing connection: TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN, TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN, TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN.
After this patch, the size of tcp_request_sock remains unchanged and no new holes are added. Below are the pahole outcomes before and after this patch:
}
Signed-off-by: Ilpo Järvinen ij@kernel.org Co-developed-by: Olivier Tilmans olivier.tilmans@nokia.com Signed-off-by: Olivier Tilmans olivier.tilmans@nokia.com Co-developed-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com Signed-off-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com
.../networking/net_cachelines/tcp_sock.rst | 4 + include/linux/tcp.h | 9 +- include/net/tcp.h | 93 ++++++++- net/ipv4/syncookies.c | 3 + net/ipv4/sysctl_net_ipv4.c | 3 +- net/ipv4/tcp.c | 2 + net/ipv4/tcp_input.c | 178 ++++++++++++++++-- net/ipv4/tcp_ipv4.c | 5 +- net/ipv4/tcp_minisocks.c | 51 ++++- net/ipv4/tcp_output.c | 78 ++++++-- net/ipv6/syncookies.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 12 files changed, 384 insertions(+), 44 deletions(-)
diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 31313a9adccc..22ac668fe6c7 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -103,6 +103,10 @@ u32 delivered read_mostly read_w u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u8:2 syn_ect_snt write_mostly read_write +u8:2 syn_ect_rcv read_mostly read_write +u8:1 wait_third_ack read_write +u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 874e0e45dfad..1d8301f2883c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -168,6 +168,10 @@ struct tcp_request_sock { * after data-in-SYN. */ u8 syn_tos;
bool accecn_ok;
u8 syn_ect_snt: 2,
syn_ect_rcv: 2,
accecn_fail_mode:4;
#ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -375,7 +379,9 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */
unused:5;
syn_ect_snt:2, /* AccECN ECT memory, only */
syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */
wait_third_ack:1; /* Wait 3rd ACK in simultaneous open */ u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
@@ -391,6 +397,7 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */
u8 keepalive_probes; /* num of allowed keep alive probes */
u8 accecn_fail_mode:4; /* AccECN failure handling */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
/* RTT measurement */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 6cf5cea992e3..4d6325fa3f67 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -27,6 +27,7 @@ #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> #include <linux/bits.h> +#include <linux/bitfield.h>
#include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> @@ -234,6 +235,37 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
+/* tp->accecn_fail_mode */ +#define TCP_ACCECN_ACE_FAIL_SEND BIT(0) +#define TCP_ACCECN_ACE_FAIL_RECV BIT(1) +#define TCP_ACCECN_OPT_FAIL_SEND BIT(2) +#define TCP_ACCECN_OPT_FAIL_RECV BIT(3)
+static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) +{
return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
+}
+static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) +{
return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
+}
+static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) +{
return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
+}
+static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) +{
return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
+}
+static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) +{
tp->accecn_fail_mode |= mode;
+}
/* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ @@ -420,6 +452,23 @@ static inline u8 tcp_accecn_ace(const struct tcphdr *th) return (th->ae << 2) | (th->cwr << 1) | th->ece; }
+/* Infer the ECT value our SYN arrived with from the echoed ACE field */ +static inline int tcp_accecn_extract_syn_ect(u8 ace) +{
if (ace & 0x1)
return INET_ECN_ECT_1;
if (!(ace & 0x2))
return INET_ECN_ECT_0;
if (ace & 0x4)
return INET_ECN_CE;
return INET_ECN_NOT_ECT;
+}
+bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect); +void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb,
u8 syn_ect_snt);
+void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb);
enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, @@ -657,6 +706,15 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry * dst_feature(dst, RTAX_FEATURE_ECN); }
+/* AccECN specification, 5.1: [...] a server can determine that it
- negotiated AccECN as [...] if the ACK contains an ACE field with
- the value 0b010 to 0b111 (decimal 2 to 7).
- */
+static inline bool cookie_accecn_ok(const struct tcphdr *th) +{
return tcp_accecn_ace(th) > 0x1;
+}
#if IS_ENABLED(CONFIG_BPF) static inline bool cookie_bpf_ok(struct sk_buff *skb) { @@ -968,6 +1026,7 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR)
#define TCP_ACCECN_CEP_ACE_MASK 0x7 #define TCP_ACCECN_ACE_MAX_DELTA 6 @@ -977,6 +1036,19 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) */ #define TCP_ACCECN_CEP_INIT_OFFSET 5
+/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
- attemped to be negotiated and requested for incoming connection
- and outgoing connection, respectively.
- */
+enum tcp_ecn_mode {
TCP_ECN_IN_NOECN_OUT_NOECN = 0,
TCP_ECN_IN_ECN_OUT_ECN = 1,
TCP_ECN_IN_ECN_OUT_NOECN = 2,
TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,
TCP_ECN_IN_ACCECN_OUT_ECN = 4,
TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
+};
static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; @@ -1051,6 +1123,15 @@ struct tcp_skb_cb {
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+static inline u16 tcp_accecn_reflector_flags(u8 ect) +{
u32 flags = ect + 2;
if (ect == 3)
flags++;
A comment might help, I have no idea of what is going on here.
return FIELD_PREP(TCPHDR_ACE, flags);
+}
extern const struct inet_connection_sock_af_ops ipv4_specific;
#if IS_ENABLED(CONFIG_IPV6) @@ -1173,7 +1254,10 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED BIT(0) /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN BIT(1) -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) +/* Require successfully negotiated AccECN capability */ +#define TCP_CONG_NEEDS_ACCECN BIT(2) +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN | \
TCP_CONG_NEEDS_ACCECN)
union tcp_cc_info;
@@ -1305,6 +1389,13 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk) return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; }
+static inline bool tcp_ca_needs_accecn(const struct sock *sk) +{
const struct inet_connection_sock *icsk = inet_csk(sk);
return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ACCECN;
+}
static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 5459a78b9809..3a44eb9c1d1a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -403,6 +403,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk);
struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4;
@@ -428,6 +429,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) }
ireq = inet_rsk(req);
treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
@@ -482,6 +484,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a43010d726f..75ec1a599b52 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; +static int tcp_ecn_mode_max = 5;
/* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -728,7 +729,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
.extra2 = &tcp_ecn_mode_max,
Please change Documentation/networking/ip-sysctl.rst tcp_ecn accordingly ?
}, { .procname = "tcp_ecn_fallback",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e0e8d784b1c..e6d7b5420c88 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3392,6 +3392,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0;
tp->wait_third_ack = 0;
tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0fa3803b353d..c986452302cb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -411,14 +411,114 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } }
I do think this patch is too big and should be split.