diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 0323a2b6c..2fccae0df 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -97,6 +97,7 @@ struct sk_psock {
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
 	spinlock_t			ingress_lock;
+	ssize_t				ingress_size;
 	unsigned long			state;
 	struct list_head		link;
 	spinlock_t			link_lock;
[ ... ]
@@ -321,6 +322,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
+{
+	return READ_ONCE(psock->ingress_size);
+}
+
+static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
+{
+	WRITE_ONCE(psock->ingress_size, READ_ONCE(psock->ingress_size) + diff);
+}
+
Can sk_psock_inc_msg_size() race with itself? READ_ONCE()/WRITE_ONCE() only prevent load/store tearing; the read-modify-write as a whole is not atomic, so every caller must be serialized by the same lock. Looking at the call sites:
- sk_psock_queue_msg() holds ingress_lock when calling this
- sk_psock_dequeue_msg() holds ingress_lock when calling this
- __sk_psock_purge_ingress_msg() holds ingress_lock when calling this
- __sk_msg_recvmsg() does NOT hold ingress_lock when calling this
The __sk_msg_recvmsg() path runs under the socket lock via tcp_bpf_recvmsg()->lock_sock(), but sk_psock_queue_msg() can be called from softirq context (network RX -> BPF verdict -> sk_psock_verdict_apply() -> sk_psock_skb_ingress_self() -> sk_psock_queue_msg()) without the socket lock.
This means concurrent updates from different lock contexts can race:
Thread A (softirq, ingress_lock): reads size=1000
Thread B (process, socket lock): reads size=1000, subtracts 50, writes 950
Thread A: adds 100 to stale 1000, writes 1100
Result: size=1100 but should be 1050 (lost the -50 update)
The ingress_size counter would drift from the actual queue size, breaking the FIONREAD accuracy that this patch aims to provide.
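One possible fix, as an untested sketch rather than a reviewed patch: make the counter an atomic_long_t, so the read-modify-write is atomic regardless of which lock the caller holds:

	atomic_long_t			ingress_size;	/* in struct sk_psock */

static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
{
	/* plain atomic read, no lock required */
	return atomic_long_read(&psock->ingress_size);
}

static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
{
	/* atomic RMW: safe from ingress_lock and socket-lock callers alike */
	atomic_long_add(diff, &psock->ingress_size);
}

Alternatively, keep the plain ssize_t and take ingress_lock at the one call site that lacks it (see the sketch further down).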
@@ -329,6 +340,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 		list_add_tail(&msg->list, &psock->ingress_msg);
+		sk_psock_inc_msg_size(psock, msg->sg.size);
 		ret = true;
 	} else {
 		sk_msg_free(psock->sk, msg);
[ ... ]
@@ -345,18 +357,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
 	spin_lock_bh(&psock->ingress_lock);
 	msg = list_first_entry_or_null(&psock->ingress_msg,
 				       struct sk_msg, list);
-	if (msg)
+	if (msg) {
 		list_del(&msg->list);
+		sk_psock_inc_msg_size(psock, -msg->sg.size);
+	}
 	spin_unlock_bh(&psock->ingress_lock);
 	return msg;
 }
[ ... ]
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index d73e03f77..c959d52a6 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -455,6 +455,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg
 				atomic_sub(copy, &sk->sk_rmem_alloc);
 			}
 			msg_rx->sg.size -= copy;
+			sk_psock_inc_msg_size(psock, -copy);
This call to sk_psock_inc_msg_size() happens without holding ingress_lock, just the socket lock. Can this race with sk_psock_queue_msg() which holds ingress_lock but not the socket lock?
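If the counter stays a plain ssize_t, this site would need ingress_lock as well, e.g. (sketch only; assumes the _bh variant is as safe here as it is at the other call sites):

			msg_rx->sg.size -= copy;
			/* serialize the counter update against the softirq producer */
			spin_lock_bh(&psock->ingress_lock);
			sk_psock_inc_msg_size(psock, -copy);
			spin_unlock_bh(&psock->ingress_lock);

The atomic_long_t variant sketched above would avoid the extra locking here entirely.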
 			if (!sge->length) {
 				sk_msg_iter_var_next(i);

@@ -819,9 +820,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 		list_del(&msg->list);
 		if (!msg->skb)
 			atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
+		sk_psock_inc_msg_size(psock, -((ssize_t)msg->sg.size));
 		sk_msg_free(psock->sk, msg);
 		kfree(msg);
 	}
+	WARN_ON_ONCE(psock->ingress_size);
 }
[ ... ]
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19669112811