On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote:
@@ -302,10 +303,13 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */
- u32 delivered_ecn_bytes[3];
This new fields do not belong to this cacheline group. I'm unsure they belong to fast-path at all. Also u32 will wrap-around very soon.
[...]
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dc8fdc80e16b..74ac8a5d2e00 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -298,6 +298,13 @@ struct tcp_info { __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */
- __u32 tcpi_received_ce; /* # of CE marks received */
- __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */
- __u32 tcpi_delivered_e0_bytes;
- __u32 tcpi_delivered_ce_bytes;
- __u32 tcpi_received_e1_bytes;
- __u32 tcpi_received_e0_bytes;
- __u32 tcpi_received_ce_bytes;
This will break uAPI: new fields must be addded at the end, or must fill existing holes. Also u32 set in stone in uAPI for a byte counter looks way too small.
@@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 122 + 6);
The above means an additional cacheline in fast-path WRT the current status. IMHO should be avoided.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bd7fc9bcf66..41e45b9aff3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> +#include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> #include <net/proto_memory.h> @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr return false; } +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
- we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
- */
+static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{
- switch (ecnfield) {
- case INET_ECN_NOT_ECT:
return 0; /* AccECN does not send counts of NOT_ECT */
- case INET_ECN_ECT_1:
return 1;
- case INET_ECN_CE:
return 2;
- case INET_ECN_ECT_0:
return 3;
- default:
WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
No WARN_ONCE() above please: either the 'ecnfield' data is masked vs INET_ECN_MASK and the WARN_ONCE should not be possible or a remote sender can deterministically trigger a WARN() which nowadays will in turn raise a CVE...
[...]
+static u32 tcp_accecn_field_init_offset(u8 ecnfield) +{
- switch (ecnfield) {
- case INET_ECN_NOT_ECT:
return 0; /* AccECN does not send counts of NOT_ECT */
- case INET_ECN_ECT_1:
return TCP_ACCECN_E1B_INIT_OFFSET;
- case INET_ECN_CE:
return TCP_ACCECN_CEB_INIT_OFFSET;
- case INET_ECN_ECT_0:
return TCP_ACCECN_E0B_INIT_OFFSET;
- default:
WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
Same as above.
- }
- return 0;
+}
+/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield,
bool order)
+{
- u8 tmp;
- optfield = order ? 2 - optfield : optfield;
- tmp = optfield + 2;
- return (tmp + (tmp >> 2)) & INET_ECN_MASK;
+}
+/* Handles AccECN option ECT and CE 24-bit byte counters update into
- the u32 value in tcp_sock. As we're processing TCP options, it is
- safe to access from - 1.
- */
+static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 init_offset) +{
- u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
0xFFFFFFU;
- u32 delta = (truncated - *cnt) & 0xFFFFFFU;
- /* If delta has the highest bit set (24th bit) indicating
* negative, sign extend to correct an estimation using
* sign_extend32(delta, 24 - 1)
*/
- delta = sign_extend32(delta, 23);
- *cnt += delta;
- return (s32)delta;
+}
+/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp,
const struct sk_buff *skb,
u32 delivered_bytes, int flag)
+{
- u8 estimate_ecnfield = tp->est_ecnfield;
- bool ambiguous_ecn_bytes_incr = false;
- bool first_changed = false;
- unsigned int optlen;
- unsigned char *ptr;
- bool order1, res;
- unsigned int i;
- if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
if (estimate_ecnfield) {
u8 ecnfield = estimate_ecnfield - 1;
tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
return true;
}
return false;
- }
- ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
- optlen = ptr[1] - 2;
This assumes optlen is greater then 2, but I don't see the relevant check. Are tcp options present at all?
- WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1);
Please, don't warn for arbitrary wrong data sent from the peer.
- order1 = (ptr[0] == TCPOPT_ACCECN1);
- ptr += 2;
- res = !!estimate_ecnfield;
- for (i = 0; i < 3; i++) {
if (optlen >= TCPOLEN_ACCECN_PERFIELD) {
u32 init_offset;
u8 ecnfield;
s32 delta;
u32 *cnt;
ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
init_offset = tcp_accecn_field_init_offset(ecnfield);
cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
if (delta) {
if (delta < 0) {
res = false;
ambiguous_ecn_bytes_incr = true;
}
if (ecnfield != estimate_ecnfield) {
if (!first_changed) {
tp->est_ecnfield = ecnfield;
first_changed = true;
} else {
res = false;
ambiguous_ecn_bytes_incr = true;
}
At least 2 indentation levels above the maximum readable.
[...]
@@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0;
- opt_rx->accecn = 0; opt_rx->saw_unknown = 0;
It would be good to be able to zero both 'accecn' and 'saw_unknown' with a single statement.
[...]
@@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); }
- if (OPTION_ACCECN & options) {
const u8 ect0_idx = INET_ECN_ECT_0 - 1;
const u8 ect1_idx = INET_ECN_ECT_1 - 1;
const u8 ce_idx = INET_ECN_CE - 1;
u32 e0b;
u32 e1b;
u32 ceb;
u8 len;
e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
len = TCPOLEN_ACCECN_BASE +
opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
if (opts->num_accecn_fields == 2) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
} else if (opts->num_accecn_fields == 1) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
leftover_bytes = ((e1b & 0xff) << 8) |
TCPOPT_NOP;
leftover_size = 1;
} else if (opts->num_accecn_fields == 0) {
leftover_bytes = (TCPOPT_ACCECN1 << 8) | len;
leftover_size = 2;
} else if (opts->num_accecn_fields == 3) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
The above chunck and the contents of patch 7 must be in the same patch. This split makes the review even harder.
[...]
@@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; }
- if (tcp_ecn_mode_accecn(tp) &&
sock_net(sk)->ipv4.sysctl_tcp_ecn_option) {
int saving = opts->num_sack_blocks > 0 ? 2 : 0;
int remaining = MAX_TCP_OPTION_SPACE - size;
AFACS the above means tcp_options_fit_accecn() must clear any already set options, but apparently it does not do so. Have you tested with something adding largish options like mptcp?
/P