nvme-tcp uses a page_frag_cache to preallocate a PDU for each preallocated request of a block device. Block devices are created in parallel threads, so the page_frag_cache is used in a manner that is not thread-safe. This leads to incorrect refcounting of the backing pages and to their premature free.
This can be caught by the !sendpage_ok() warning inside the network stack:
WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310
 tcp_sendmsg_locked+0x782/0xce0
 tcp_sendmsg+0x27/0x40
 sock_sendmsg+0x8b/0xa0
 nvme_tcp_try_send_cmd_pdu+0x149/0x2a0

Then a random panic may occur.
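To make the race concrete, here is a minimal, hypothetical sketch of the unsafe pattern (shared_cache, scan_worker and the 128-byte size are illustrative, not driver code): page_frag_alloc() updates the cache's offset and page-reference bias with plain, non-atomic read-modify-write operations, so two threads allocating from the same cache can hand out overlapping fragments and unbalance the backing page's refcount.

  #include <linux/gfp.h>	/* page_frag_alloc()/page_frag_free(), pre-6.13 location */

  /* Hypothetical sketch, not driver code: one shared cache, no lock. */
  static struct page_frag_cache shared_cache;

  static int scan_worker(void *unused)
  {
  	/*
  	 * Run concurrently (e.g. one worker per namespace being scanned),
  	 * these calls race on shared_cache: offset and pagecnt_bias are
  	 * updated non-atomically, so fragments can overlap and page
  	 * references can be dropped more than once.
  	 */
  	void *pdu = page_frag_alloc(&shared_cache, 128,
  				    GFP_KERNEL | __GFP_ZERO);
  	if (!pdu)
  		return -ENOMEM;
  	page_frag_free(pdu);
  	return 0;
  }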
Fix this by switching from a per-queue page_frag_cache to a per-CPU page_frag_cache.
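In sketch form, the resulting allocation pattern looks roughly like the following (alloc_pdu/free_pdu are hypothetical helper names; the driver open-codes the calls, as the diff below shows). Freeing needs no change because page_frag_free() works on the fragment address alone and does not care which CPU's cache produced it.

  #include <linux/percpu.h>	/* DEFINE_PER_CPU(), this_cpu_ptr() */
  #include <linux/gfp.h>	/* page_frag_alloc()/page_frag_free() */

  /* Illustrative sketch of the per-CPU pattern applied by this patch. */
  static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);

  static void *alloc_pdu(size_t size)	/* hypothetical helper */
  {
  	/* Allocate from the cache of the CPU this task is running on. */
  	return page_frag_alloc(this_cpu_ptr(&pf_cache), size,
  			       GFP_KERNEL | __GFP_ZERO);
  }

  static void free_pdu(void *pdu)	/* hypothetical helper */
  {
  	/* Fragments are freed by address; no cache pointer is required. */
  	page_frag_free(pdu);
  }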
Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Chris Leech <cleech@redhat.com>
---
 drivers/nvme/host/tcp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..a4c4ace5be0f4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -174,7 +174,6 @@ struct nvme_tcp_queue {
 	__le32			recv_ddgst;
 	struct completion	tls_complete;
 	int			tls_err;
-	struct page_frag_cache	pf_cache;
 
 	void (*state_change)(struct sock *);
 	void (*data_ready)(struct sock *);
@@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
 
 static LIST_HEAD(nvme_tcp_ctrl_list);
 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
 static struct workqueue_struct *nvme_tcp_wq;
 static const struct blk_mq_ops nvme_tcp_mq_ops;
 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
@@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	req->pdu = page_frag_alloc(&queue->pf_cache,
+	req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
 				sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 				GFP_KERNEL | __GFP_ZERO);
 	if (!req->pdu)
@@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	async->pdu = page_frag_alloc(&queue->pf_cache,
+	async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
 		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 		GFP_KERNEL | __GFP_ZERO);
 	if (!async->pdu)
@@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
 		return;
 
-	page_frag_cache_drain(&queue->pf_cache);
+	page_frag_cache_drain(this_cpu_ptr(&pf_cache));
 
 	noreclaim_flag = memalloc_noreclaim_save();
 	/* ->sock will be released by fput() */