nvme uses a page_frag_cache to preallocate a PDU for each preallocated request of a block device. Block devices are created in parallel threads; consequently, the page_frag_cache is used in a non-thread-safe manner. That leads to incorrect refcounting of the backing pages and their premature free.

That can be caught by the !sendpage_ok check inside the network stack:

WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310
  tcp_sendmsg_locked+0x782/0xce0
  tcp_sendmsg+0x27/0x40
  sock_sendmsg+0x8b/0xa0
  nvme_tcp_try_send_cmd_pdu+0x149/0x2a0

Then random panics may occur.
Fix that by serializing the usage of page_frag_cache.
Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
---
 drivers/nvme/host/tcp.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..823e07759e0d3 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -145,6 +145,7 @@ struct nvme_tcp_queue {
 	struct mutex		queue_lock;
 	struct mutex		send_mutex;
+	struct mutex		pf_cache_lock;
 	struct llist_head	req_list;
 	struct list_head	send_list;
@@ -556,9 +557,11 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
+	mutex_lock(&queue->pf_cache_lock);
 	req->pdu = page_frag_alloc(&queue->pf_cache,
 			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 			GFP_KERNEL | __GFP_ZERO);
+	mutex_unlock(&queue->pf_cache_lock);
 	if (!req->pdu)
 		return -ENOMEM;
@@ -1420,9 +1423,11 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
+	mutex_lock(&queue->pf_cache_lock);
 	async->pdu = page_frag_alloc(&queue->pf_cache,
 			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 			GFP_KERNEL | __GFP_ZERO);
+	mutex_unlock(&queue->pf_cache_lock);
 	if (!async->pdu)
 		return -ENOMEM;
@@ -1450,6 +1455,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	kfree(queue->pdu);
 	mutex_destroy(&queue->send_mutex);
 	mutex_destroy(&queue->queue_lock);
+	mutex_destroy(&queue->pf_cache_lock);
 }
 
 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
@@ -1772,6 +1778,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	INIT_LIST_HEAD(&queue->send_list);
 	mutex_init(&queue->send_mutex);
 	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+	mutex_init(&queue->pf_cache_lock);
 
 	if (qid > 0)
 		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
@@ -1903,6 +1910,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 err_destroy_mutex:
 	mutex_destroy(&queue->send_mutex);
 	mutex_destroy(&queue->queue_lock);
+	mutex_destroy(&queue->pf_cache_lock);
 	return ret;
 }
On Mon, Sep 29, 2025 at 02:19:51PM +0300, Dmitry Bogdanov wrote:
> nvme uses a page_frag_cache to preallocate a PDU for each preallocated request of a block device. Block devices are created in parallel threads; consequently, the page_frag_cache is used in a non-thread-safe manner. That leads to incorrect refcounting of the backing pages and their premature free.
> 
> That can be caught by the !sendpage_ok check inside the network stack:
> 
> WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310
>   tcp_sendmsg_locked+0x782/0xce0
>   tcp_sendmsg+0x27/0x40
>   sock_sendmsg+0x8b/0xa0
>   nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
> 
> Then random panics may occur.
> 
> Fix that by serializing the usage of page_frag_cache.
Thank you for reporting this. I think we can fix it without blocking the async namespace scanning with a mutex, by switching from a per-queue page_frag_cache to per-cpu. There shouldn't be a need to keep the page_frag allocations isolated by queue anyway.
It would be great if you could test the patch which I'll send after this.
- Chris
> Cc: stable@vger.kernel.org # 6.12
> Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
> Signed-off-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
> ---
>  drivers/nvme/host/tcp.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 1413788ca7d52..823e07759e0d3 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -145,6 +145,7 @@ struct nvme_tcp_queue {
>  	struct mutex		queue_lock;
>  	struct mutex		send_mutex;
> +	struct mutex		pf_cache_lock;
>  	struct llist_head	req_list;
>  	struct list_head	send_list;
> @@ -556,9 +557,11 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
>  	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
> 
> +	mutex_lock(&queue->pf_cache_lock);
>  	req->pdu = page_frag_alloc(&queue->pf_cache,
>  			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>  			GFP_KERNEL | __GFP_ZERO);
> +	mutex_unlock(&queue->pf_cache_lock);
>  	if (!req->pdu)
>  		return -ENOMEM;
> @@ -1420,9 +1423,11 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
>  	struct nvme_tcp_request *async = &ctrl->async_req;
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
> 
> +	mutex_lock(&queue->pf_cache_lock);
>  	async->pdu = page_frag_alloc(&queue->pf_cache,
>  			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>  			GFP_KERNEL | __GFP_ZERO);
> +	mutex_unlock(&queue->pf_cache_lock);
>  	if (!async->pdu)
>  		return -ENOMEM;
> @@ -1450,6 +1455,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
>  	kfree(queue->pdu);
>  	mutex_destroy(&queue->send_mutex);
>  	mutex_destroy(&queue->queue_lock);
> +	mutex_destroy(&queue->pf_cache_lock);
>  }
> 
>  static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
> @@ -1772,6 +1778,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>  	INIT_LIST_HEAD(&queue->send_list);
>  	mutex_init(&queue->send_mutex);
>  	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
> +	mutex_init(&queue->pf_cache_lock);
> 
>  	if (qid > 0)
>  		queue->cmnd_capsule_len = nctrl->ioccsz * 16;
> @@ -1903,6 +1910,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>  err_destroy_mutex:
>  	mutex_destroy(&queue->send_mutex);
>  	mutex_destroy(&queue->queue_lock);
> +	mutex_destroy(&queue->pf_cache_lock);
>  	return ret;
>  }
> --
> 2.25.1
On Tue, Sep 30, 2025 at 11:31:26PM -0700, Chris Leech wrote:
> On Mon, Sep 29, 2025 at 02:19:51PM +0300, Dmitry Bogdanov wrote:
> > nvme uses a page_frag_cache to preallocate a PDU for each preallocated request of a block device. Block devices are created in parallel threads; consequently, the page_frag_cache is used in a non-thread-safe manner. That leads to incorrect refcounting of the backing pages and their premature free.
> > 
> > That can be caught by the !sendpage_ok check inside the network stack:
> > 
> > WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310
> >   tcp_sendmsg_locked+0x782/0xce0
> >   tcp_sendmsg+0x27/0x40
> >   sock_sendmsg+0x8b/0xa0
> >   nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
> > 
> > Then random panics may occur.
> > 
> > Fix that by serializing the usage of page_frag_cache.
> 
> Thank you for reporting this. I think we can fix it without blocking the async namespace scanning with a mutex, by switching from a per-queue page_frag_cache to per-cpu. There shouldn't be a need to keep the page_frag allocations isolated by queue anyway.
> 
> It would be great if you could test the patch which I'll send after this.
As I commented on your patch, a naive per-cpu cache solution is error-prone, and a complete solution would be unnecessarily difficult. Block device creation is not the data plane, it is the control plane, so there is no reason to use lockless algorithms there.
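To illustrate the concern, here is a minimal hypothetical sketch, not taken from either patch (alloc_pdu() is a made-up helper, and header locations vary by kernel version): with a bare per-CPU page_frag_cache, a GFP_KERNEL allocation may sleep and the task may be preempted or migrated, so two tasks can still end up inside page_frag_alloc() on the same cache.

#include <linux/gfp.h>
#include <linux/percpu.h>

/* Hypothetical sketch of the race with an unprotected per-CPU cache. */
static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);

static void *alloc_pdu(unsigned int size)
{
	/*
	 * GFP_KERNEL may sleep, and this_cpu_ptr() does not disable
	 * preemption here, so a second task scheduled on (or migrated to)
	 * the same CPU can enter page_frag_alloc() on the same
	 * struct page_frag_cache while the first caller is still inside it.
	 */
	return page_frag_alloc(this_cpu_ptr(&pf_cache), size,
			       GFP_KERNEL | __GFP_ZERO);
}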
My patch is simple and error-proof already, so I insist on this solution.
BR, Dmitry
nvme-tcp uses a page_frag_cache to preallocate a PDU for each preallocated request of a block device. Block devices are created in parallel threads; consequently, the page_frag_cache is used in a non-thread-safe manner. That leads to incorrect refcounting of the backing pages and their premature free.

That can be caught by the !sendpage_ok check inside the network stack:

WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310
  tcp_sendmsg_locked+0x782/0xce0
  tcp_sendmsg+0x27/0x40
  sock_sendmsg+0x8b/0xa0
  nvme_tcp_try_send_cmd_pdu+0x149/0x2a0

Then random panics may occur.
Fix that by switching from having a per-queue page_frag_cache to a per-cpu page_frag_cache.
Cc: stable@vger.kernel.org # 6.12
Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
Signed-off-by: Chris Leech <cleech@redhat.com>
---
 drivers/nvme/host/tcp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d52..a4c4ace5be0f4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -174,7 +174,6 @@ struct nvme_tcp_queue {
 	__le32			recv_ddgst;
 	struct completion	tls_complete;
 	int			tls_err;
-	struct page_frag_cache	pf_cache;
 
 	void (*state_change)(struct sock *);
 	void (*data_ready)(struct sock *);
@@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
 
 static LIST_HEAD(nvme_tcp_ctrl_list);
 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
 static struct workqueue_struct *nvme_tcp_wq;
 static const struct blk_mq_ops nvme_tcp_mq_ops;
 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
@@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	req->pdu = page_frag_alloc(&queue->pf_cache,
+	req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
 			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 			GFP_KERNEL | __GFP_ZERO);
 	if (!req->pdu)
@@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	async->pdu = page_frag_alloc(&queue->pf_cache,
+	async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
 			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 			GFP_KERNEL | __GFP_ZERO);
 	if (!async->pdu)
@@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
 		return;
 
-	page_frag_cache_drain(&queue->pf_cache);
+	page_frag_cache_drain(this_cpu_ptr(&pf_cache));
 
 	noreclaim_flag = memalloc_noreclaim_save();
 	/* ->sock will be released by fput() */
On Tue, Sep 30, 2025 at 11:34:14PM -0700, Chris Leech wrote:
> nvme-tcp uses a page_frag_cache to preallocate a PDU for each preallocated request of a block device. Block devices are created in parallel threads; consequently, the page_frag_cache is used in a non-thread-safe manner. That leads to incorrect refcounting of the backing pages and their premature free.
> 
> That can be caught by the !sendpage_ok check inside the network stack:
> 
> WARNING: CPU: 7 PID: 467 at ../net/core/skbuff.c:6931 skb_splice_from_iter+0xfa/0x310
>   tcp_sendmsg_locked+0x782/0xce0
>   tcp_sendmsg+0x27/0x40
>   sock_sendmsg+0x8b/0xa0
>   nvme_tcp_try_send_cmd_pdu+0x149/0x2a0
> 
> Then random panics may occur.
> 
> Fix that by switching from having a per-queue page_frag_cache to a per-cpu page_frag_cache.
> Cc: stable@vger.kernel.org # 6.12
> Fixes: 4e893ca81170 ("nvme_core: scan namespaces asynchronously")
> Reported-by: Dmitry Bogdanov <d.bogdanov@yadro.com>
> Signed-off-by: Chris Leech <cleech@redhat.com>
> ---
>  drivers/nvme/host/tcp.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 1413788ca7d52..a4c4ace5be0f4 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -174,7 +174,6 @@ struct nvme_tcp_queue {
>  	__le32			recv_ddgst;
>  	struct completion	tls_complete;
>  	int			tls_err;
> -	struct page_frag_cache	pf_cache;
> 
>  	void (*state_change)(struct sock *);
>  	void (*data_ready)(struct sock *);
> @@ -201,6 +200,7 @@ struct nvme_tcp_ctrl {
> 
>  static LIST_HEAD(nvme_tcp_ctrl_list);
>  static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
> +static DEFINE_PER_CPU(struct page_frag_cache, pf_cache);
>  static struct workqueue_struct *nvme_tcp_wq;
>  static const struct blk_mq_ops nvme_tcp_mq_ops;
>  static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
> @@ -556,7 +556,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
>  	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
> 
> -	req->pdu = page_frag_alloc(&queue->pf_cache,
> +	req->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
>  			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>  			GFP_KERNEL | __GFP_ZERO);
I am not an expert in the scheduler subsystem, but as far as I understand, a workqueue may execute its work items in parallel, up to max_active work items, on the same CPU. That means this solution does not fix the issue of parallel usage of the same variable. Can anyone comment on this?
>  	if (!req->pdu)
> @@ -1420,7 +1420,7 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
>  	struct nvme_tcp_request *async = &ctrl->async_req;
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
> 
> -	async->pdu = page_frag_alloc(&queue->pf_cache,
> +	async->pdu = page_frag_alloc(this_cpu_ptr(&pf_cache),
>  			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
>  			GFP_KERNEL | __GFP_ZERO);
This line is executed in a different (parallel) context compared to nvme_tcp_init_request.
>  	if (!async->pdu)
> @@ -1439,7 +1439,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
>  	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
>  		return;
> 
> -	page_frag_cache_drain(&queue->pf_cache);
> +	page_frag_cache_drain(this_cpu_ptr(&pf_cache));
This line is also definitely executed in another (parallel) context compared to nvme_tcp_init_request, and it frees pages that are still in use by other queues.
> 
>  	noreclaim_flag = memalloc_noreclaim_save();
>  	/* ->sock will be released by fput() */
> --
> 2.50.1
All in all, my patch with a mutex looks more appropriate and more error-proof.
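For comparison, here is a hypothetical sketch, not taken from either patch, of what a per-CPU variant with proper exclusion might need (struct pdu_frag, pdu_frag_init() and pdu_frag_alloc() are made-up names for illustration). The CPU id only spreads contention; a lock is still required because the task may sleep inside page_frag_alloc(GFP_KERNEL) or migrate to another CPU:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/smp.h>

/* Per-CPU buckets of {lock, cache}; the CPU id is only a bucket index. */
struct pdu_frag {
	struct mutex		lock;
	struct page_frag_cache	cache;
};
static DEFINE_PER_CPU(struct pdu_frag, pdu_frag);

static void pdu_frag_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		mutex_init(&per_cpu_ptr(&pdu_frag, cpu)->lock);
}

static void *pdu_frag_alloc(unsigned int size)
{
	/*
	 * raw_smp_processor_id() only picks a bucket; the task may sleep
	 * in page_frag_alloc() or migrate, so the mutex provides the
	 * actual exclusion.
	 */
	struct pdu_frag *pf = per_cpu_ptr(&pdu_frag, raw_smp_processor_id());
	void *ptr;

	mutex_lock(&pf->lock);
	ptr = page_frag_alloc(&pf->cache, size, GFP_KERNEL | __GFP_ZERO);
	mutex_unlock(&pf->lock);
	return ptr;
}

Draining such caches safely (for example at module exit instead of per-queue teardown) would add yet more code, which is what makes the per-CPU route more involved than a single mutex.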
BR, Dmitry