On 5/14/25 3:56 PM, chia-yu.chang@nokia-bell-labs.com wrote:
This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112.
Note that I'm still concerned by the relevant increase of the cacheline groups size. My fear is that this change could defeat some/most of the benefist from the cacheline reorg for all tcp users.
Some additional feedback from Eric and/or Neal more than welcome!
A possible alternative could be placing all the accounting fields outside of all the fastpath cache groups, i.e. after __cacheline_group_end(tcp_sock_write_rx)
@@ -710,6 +713,8 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +#define NOP_LEFTOVER ((TCPOPT_NOP << 8) | TCPOPT_NOP)
/* Write previously computed TCP options to the packet.
- Beware: Something in the Internet is very sensitive to the ordering of
@@ -728,8 +733,10 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) {
- u16 leftover_bytes = NOP_LEFTOVER; /* replace next NOPs if avail */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */
- int leftover_size = 2;
if (tcp_key_is_md5(key)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -763,18 +770,64 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); }
- if (OPTION_ACCECN & options) {
const u8 ect0_idx = INET_ECN_ECT_0 - 1;
const u8 ect1_idx = INET_ECN_ECT_1 - 1;
const u8 ce_idx = INET_ECN_CE - 1;
u32 e0b;
u32 e1b;
u32 ceb;
u8 len;
e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
len = TCPOLEN_ACCECN_BASE +
opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
if (opts->num_accecn_fields == 2) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
} else if (opts->num_accecn_fields == 1) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
leftover_bytes = ((e1b & 0xff) << 8) |
TCPOPT_NOP;
leftover_size = 1;
} else if (opts->num_accecn_fields == 0) {
leftover_bytes = (TCPOPT_ACCECN1 << 8) | len;
leftover_size = 2;
} else if (opts->num_accecn_fields == 3) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
}
if (tp)
tp->accecn_minlen = 0;
- }
- if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
*ptr++ = htonl((leftover_bytes << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
leftover_bytes = NOP_LEFTOVER;
AFAICS here leftover_size could be == 1. why is not reset to 2?
More importantly this looks quite fragile and error prone. *Possibly* have a tcp_accecn_write_option() helper that would rewrite the existing option as needed could be simpler and less impacting for the existing code?
/P