From: Martin Wilck mwilck@suse.com
We have observed a few crashes run_timer_softirq(), where a broken timer_list struct belonging to an anatt_timer was encountered. The broken structures look like this, and we see actually multiple ones attached to the same timer base:
crash> struct timer_list 0xffff92471bcfdc90 struct timer_list { entry = { next = 0xdead000000000122, // LIST_POISON2 pprev = 0x0 }, expires = 4296022933, function = 0xffffffffc06de5e0 <nvme_anatt_timeout>, flags = 20 }
If such a timer is encountered in run_timer_softirq(), the kernel crashes. The test scenario was an I/O load test with lots of NVMe controllers, some of which were removed and re-added on the storage side.
I think this may happen if the rdma recovery_work starts, in this call chain:
nvme_rdma_error_recovery_work() /* this stops all sorts of activity for the controller, but not the multipath-related work queue and timer */ nvme_rdma_reconnect_or_remove(ctrl) => kicks reconnect_work
work queue: reconnect_work
nvme_rdma_reconnect_ctrl_work() nvme_rdma_setup_ctrl() nvme_rdma_configure_admin_queue() nvme_init_identify() nvme_mpath_init() # this sets some fields of the timer_list without taking a lock timer_setup() nvme_read_ana_log() mod_timer() or del_timer_sync()
Similar for TCP. The idea for the patch is based on the observation that nvme_rdma_reset_ctrl_work() calls nvme_stop_ctrl()->nvme_mpath_stop(), whereas nvme_rdma_error_recovery_work() stops only the keepalive timer, but not the anatt timer. Also, nvme_mpath_init() is the only place where the anatt_timer structure is accessed without locking.
[The following paragraph was contributed by Chao Leng lengchao@huawei.com]
The process maybe:1.ana_work add the timer;2.error recovery occurs, in reconnecting, reinitialize the timer and call nvme_read_ana_log, nvme_read_ana_log may add the timer again. The same timer is added twice, crash will happens later.
This situation has actually been observed in a crash dump, where we found an anatt timer pending that had been started ~80s ago, despite a log message telling that the anatt timer for the same controller had timed out a few seconds ago. This could only be explained by the same timer having been attached multiple times.
Signed-off-by: Martin Wilck mwilck@suse.com Reviewed-by: Sagi Grimberg sagi@grimberg.me Reviewed-by: Chao Leng lengchao@huawei.com Cc: stable@vger.kernel.org
---- Changes in v4: Updated commit message with Chao Leng's analysis, as suggested by Daniel Wagner.
Changes in v3: Changed the subject line, as suggested by Sagi Grimberg
Changes in v2: Moved call to nvme_mpath_stop() further down, directly before the call of nvme_rdma_reconnect_or_remove() (Chao Leng) --- drivers/nvme/host/multipath.c | 1 + drivers/nvme/host/rdma.c | 1 + drivers/nvme/host/tcp.c | 1 + 3 files changed, 3 insertions(+)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a1d476e1ac02..c63dd5dfa7ff 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -586,6 +586,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl) del_timer_sync(&ctrl->anatt_timer); cancel_work_sync(&ctrl->ana_work); } +EXPORT_SYMBOL_GPL(nvme_mpath_stop);
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \ struct device_attribute subsys_attr_##_name = \ diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index be905d4fdb47..fc07a7b0dc1d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1202,6 +1202,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) return; }
+ nvme_mpath_stop(&ctrl->ctrl); nvme_rdma_reconnect_or_remove(ctrl); }
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a0f00cb8f9f3..46287b4f4d10 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2068,6 +2068,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) return; }
+ nvme_mpath_stop(ctrl); nvme_tcp_reconnect_or_remove(ctrl); }
Martin,
can you give this patch a spin and check if this solves your issue?
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0d0de3433f37..68f4d9d0ce58 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -780,6 +780,8 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { + size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; + size_t ana_log_size; int error;
/* check if multipath is enabled and we have the capability */ @@ -787,47 +789,45 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) return 0;
+ if (!ctrl->identified) { + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + INIT_WORK(&ctrl->ana_work, nvme_ana_work); + } + ctrl->anacap = id->anacap; ctrl->anatt = id->anatt; ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
- mutex_init(&ctrl->ana_lock); - timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); - ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + - ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); - ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); - - if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { + ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + + ctrl->max_namespaces * sizeof(__le32); + if (ana_log_size > max_transfer_size) { dev_err(ctrl->device, - "ANA log page size (%zd) larger than MDTS (%d).\n", - ctrl->ana_log_size, - ctrl->max_hw_sectors << SECTOR_SHIFT); + "ANA log page size (%zd) larger than MDTS (%zd).\n", + ana_log_size, max_transfer_size); dev_err(ctrl->device, "disabling ANA support.\n"); return 0; }
- INIT_WORK(&ctrl->ana_work, nvme_ana_work); - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); - if (!ctrl->ana_log_buf) { - error = -ENOMEM; - goto out; + if (ana_log_size > ctrl->ana_log_size) { + nvme_mpath_uninit(ctrl); + ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + return -ENOMEM; + ctrl->ana_log_size = ana_log_size; }
error = nvme_read_ana_log(ctrl); if (error) - goto out_free_ana_log_buf; - return 0; -out_free_ana_log_buf: - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = NULL; -out: + nvme_mpath_uninit(ctrl); return error; }
void nvme_mpath_uninit(struct nvme_ctrl *ctrl) { + nvme_mpath_stop(ctrl); kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL; }
On Thu, Apr 29, 2021 at 02:24:33PM +0200, Christoph Hellwig wrote:
Martin,
can you give this patch a spin and check if this solves your issue?
ping?
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0d0de3433f37..68f4d9d0ce58 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -780,6 +780,8 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) {
- size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
- size_t ana_log_size; int error;
/* check if multipath is enabled and we have the capability */ @@ -787,47 +789,45 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) return 0;
- if (!ctrl->identified) {
mutex_init(&ctrl->ana_lock);
timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
INIT_WORK(&ctrl->ana_work, nvme_ana_work);
- }
- ctrl->anacap = id->anacap; ctrl->anatt = id->anatt; ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
- mutex_init(&ctrl->ana_lock);
- timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
- ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
- ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
- if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
- ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
ctrl->max_namespaces * sizeof(__le32);
- if (ana_log_size > max_transfer_size) { dev_err(ctrl->device,
"ANA log page size (%zd) larger than MDTS (%d).\n",
ctrl->ana_log_size,
ctrl->max_hw_sectors << SECTOR_SHIFT);
"ANA log page size (%zd) larger than MDTS (%zd).\n",
dev_err(ctrl->device, "disabling ANA support.\n"); return 0; }ana_log_size, max_transfer_size);
- INIT_WORK(&ctrl->ana_work, nvme_ana_work);
- kfree(ctrl->ana_log_buf);
- ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
- if (!ctrl->ana_log_buf) {
error = -ENOMEM;
goto out;
- if (ana_log_size > ctrl->ana_log_size) {
nvme_mpath_uninit(ctrl);
ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
if (!ctrl->ana_log_buf)
return -ENOMEM;
}ctrl->ana_log_size = ana_log_size;
error = nvme_read_ana_log(ctrl); if (error)
goto out_free_ana_log_buf;
- return 0;
-out_free_ana_log_buf:
- kfree(ctrl->ana_log_buf);
- ctrl->ana_log_buf = NULL;
-out:
return error;nvme_mpath_uninit(ctrl);
} void nvme_mpath_uninit(struct nvme_ctrl *ctrl) {
- nvme_mpath_stop(ctrl); kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL;
}
---end quoted text---
Hello Christoph,
On Tue, 2021-05-04 at 09:52 +0200, Christoph Hellwig wrote:
On Thu, Apr 29, 2021 at 02:24:33PM +0200, Christoph Hellwig wrote:
Martin,
can you give this patch a spin and check if this solves your issue?
ping?
I've provided a test kernel with your patch to the SUSE partner in question, but it's hard to reproduce, the test will take time.
Regards Martin
linux-stable-mirror@lists.linaro.org