This patchset backports a series of ublk fixes from upstream to 6.14-stable.
Patch 7 fixes a race that can cause a kernel panic when the ublk server daemon is exiting.

It depends on patches 1-6, which simplify and improve IO canceling when the ublk server daemon is exiting, as described here:
https://lore.kernel.org/linux-block/20250416035444.99569-1-ming.lei@redhat.c...
Ming Lei (5):
  ublk: add helper of ublk_need_map_io()
  ublk: move device reset into ublk_ch_release()
  ublk: remove __ublk_quiesce_dev()
  ublk: simplify aborting ublk request
  ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd

Uday Shankar (2):
  ublk: properly serialize all FETCH_REQs
  ublk: improve detection and handling of ublk server exit

 drivers/block/ublk_drv.c | 550 +++++++++++++++++++++------------------
 1 file changed, 291 insertions(+), 259 deletions(-)
From: Ming Lei <ming.lei@redhat.com>
[ Upstream commit 1d781c0de08c0b35948ad4aaf609a4cc9995d9f6 ]
ublk_need_map_io() is more readable.
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250327095123.179113-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index ab06a7a064fb..4e81505179c6 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -594,6 +594,11 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
         return ubq->flags & UBLK_F_USER_COPY;
 }
 
+static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
+{
+        return !ublk_support_user_copy(ubq);
+}
+
 static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
 {
         /*
@@ -921,7 +926,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 {
         const unsigned int rq_bytes = blk_rq_bytes(req);
 
-        if (ublk_support_user_copy(ubq))
+        if (!ublk_need_map_io(ubq))
                 return rq_bytes;
 
         /*
@@ -945,7 +950,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
 {
         const unsigned int rq_bytes = blk_rq_bytes(req);
 
-        if (ublk_support_user_copy(ubq))
+        if (!ublk_need_map_io(ubq))
                 return rq_bytes;
 
         if (ublk_need_unmap_req(req)) {
@@ -1914,7 +1919,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
                 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
                         goto out;
 
-                if (!ublk_support_user_copy(ubq)) {
+                if (ublk_need_map_io(ubq)) {
                         /*
                          * FETCH_RQ has to provide IO buffer if NEED GET
                          * DATA is not enabled
@@ -1936,7 +1941,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
                 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
                         goto out;
 
-                if (!ublk_support_user_copy(ubq)) {
+                if (ublk_need_map_io(ubq)) {
                         /*
                          * COMMIT_AND_FETCH_REQ has to provide IO buffer if
                          * NEED GET DATA is not enabled or it is Read IO.
From: Uday Shankar <ushankar@purestorage.com>
[ Upstream commit b69b8edfb27dfa563cd53f590ec42b481f9eb174 ]
Most uring_cmds issued against ublk character devices are serialized because each command affects only one queue, and there is an early check which only allows a single task (the queue's ubq_daemon) to issue uring_cmds against that queue. However, this mechanism does not work for FETCH_REQs, since they are expected before the ubq_daemon is set.

Since FETCH_REQs are only used at initialization and not in the fast path, serialize them using the per-ublk-device mutex. This fixes a number of data races that were previously possible if a badly behaved ublk server decided to issue multiple FETCH_REQs against the same qid/tag concurrently.
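For illustration, here is a minimal sketch of the locking pattern, with a hypothetical helper name (fetch_one() is not the driver function; the authoritative version is the ublk_fetch() hunk below). The point is that the whole check-and-publish sequence for a FETCH_REQ sits under the per-device mutex, so two concurrent FETCH_REQs against the same qid/tag can no longer both pass the readiness checks:

/* Sketch only: serialize FETCH_REQ handling with the per-device mutex. */
static int fetch_one(struct ublk_device *ub, struct ublk_queue *ubq,
                     struct ublk_io *io, struct io_uring_cmd *cmd)
{
        int ret = -EINVAL;

        mutex_lock(&ub->mutex);
        if (ublk_queue_ready(ubq)) {
                /* queue is already fully set up */
                ret = -EBUSY;
        } else if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
                /* this tag has not been fetched yet: publish the cmd */
                ublk_fill_io_cmd(io, cmd, 0);
                ublk_mark_io_ready(ub, ubq);    /* called with mutex held */
                ret = 0;
        }
        mutex_unlock(&ub->mutex);
        return ret;
}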
Reported-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 77 +++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 28 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 4e81505179c6..9345a6d8dbd8 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1803,8 +1803,8 @@ static void ublk_nosrv_work(struct work_struct *work)
 
 /* device can only be started after all IOs are ready */
 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
+        __must_hold(&ub->mutex)
 {
-        mutex_lock(&ub->mutex);
         ubq->nr_io_ready++;
         if (ublk_queue_ready(ubq)) {
                 ubq->ubq_daemon = current;
@@ -1816,7 +1816,6 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
         }
         if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
                 complete_all(&ub->completion);
-        mutex_unlock(&ub->mutex);
 }
 
 static inline int ublk_check_cmd_op(u32 cmd_op)
@@ -1855,6 +1854,52 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
         io_uring_cmd_mark_cancelable(cmd, issue_flags);
 }
 
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
+                      struct ublk_io *io, __u64 buf_addr)
+{
+        struct ublk_device *ub = ubq->dev;
+        int ret = 0;
+
+        /*
+         * When handling FETCH command for setting up ublk uring queue,
+         * ub->mutex is the innermost lock, and we won't block for handling
+         * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
+         */
+        mutex_lock(&ub->mutex);
+        /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
+        if (ublk_queue_ready(ubq)) {
+                ret = -EBUSY;
+                goto out;
+        }
+
+        /* allow each command to be FETCHed at most once */
+        if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
+
+        if (ublk_need_map_io(ubq)) {
+                /*
+                 * FETCH_RQ has to provide IO buffer if NEED GET
+                 * DATA is not enabled
+                 */
+                if (!buf_addr && !ublk_need_get_data(ubq))
+                        goto out;
+        } else if (buf_addr) {
+                /* User copy requires addr to be unset */
+                ret = -EINVAL;
+                goto out;
+        }
+
+        ublk_fill_io_cmd(io, cmd, buf_addr);
+        ublk_mark_io_ready(ub, ubq);
+out:
+        mutex_unlock(&ub->mutex);
+        return ret;
+}
+
 static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
                                unsigned int issue_flags,
                                const struct ublksrv_io_cmd *ub_cmd)
@@ -1907,33 +1952,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
         ret = -EINVAL;
         switch (_IOC_NR(cmd_op)) {
         case UBLK_IO_FETCH_REQ:
-                /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
-                if (ublk_queue_ready(ubq)) {
-                        ret = -EBUSY;
-                        goto out;
-                }
-                /*
-                 * The io is being handled by server, so COMMIT_RQ is expected
-                 * instead of FETCH_REQ
-                 */
-                if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
-                        goto out;
-
-                if (ublk_need_map_io(ubq)) {
-                        /*
-                         * FETCH_RQ has to provide IO buffer if NEED GET
-                         * DATA is not enabled
-                         */
-                        if (!ub_cmd->addr && !ublk_need_get_data(ubq))
-                                goto out;
-                } else if (ub_cmd->addr) {
-                        /* User copy requires addr to be unset */
-                        ret = -EINVAL;
+                ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+                if (ret)
                         goto out;
-                }
-
-                ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
-                ublk_mark_io_ready(ub, ubq);
                 break;
         case UBLK_IO_COMMIT_AND_FETCH_REQ:
                 req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
From: Ming Lei <ming.lei@redhat.com>
[ Upstream commit 728cbac5fe219d3b8a21a0688a08f2b7f8aeda2b ]
ublk_ch_release() is called after the ublk char device is closed, when all uring_cmds are done, so it is perfectly fine to move the ublk device reset from ublk_ctrl_start_recovery() into ublk_ch_release().

This avoids holding the exiting daemon's task_struct for too long.
However, resetting the following ublk IO flags has to be deferred until the ublk io_uring queues are ready:

- ubq->canceling
  For requeuing IO in case of ublk_nosrv_dev_should_queue_io() before the device is recovered

- ubq->fail_io
  For failing IO in case of UBLK_F_USER_RECOVERY_FAIL_IO before the device is recovered

- ublk_io->flags
  For preventing use of io->cmd

This simplifies recovery considerably; a rough timeline of the resulting ordering is sketched below.
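The following is only an illustrative sketch of the two-stage reset (assuming the helper names from the hunks below; it is not part of the patch):

/*
 * 1) ublk server daemon exits -> ublk_ch_release() runs:
 *    ublk_reset_ch_dev() reinits every queue and drops the old
 *    daemon's task_struct reference.
 * 2) A new daemon opens the char device and FETCHes every tag.
 * 3) The last queue becomes ready -> ublk_mark_io_ready() calls
 *    ublk_reset_io_flags(), which clears ->canceling, ->fail_io and
 *    UBLK_IO_FLAG_CANCELED, then completes ub->completion.
 *
 * Clearing those flags any earlier would let IO flow before the new
 * daemon is able to serve it.
 */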
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 121 +++++++++++++++++++++++----------------
 1 file changed, 72 insertions(+), 49 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 9345a6d8dbd8..c619df880c72 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1043,7 +1043,7 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
 
 static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
 {
-        return ubq->ubq_daemon->flags & PF_EXITING;
+        return !ubq->ubq_daemon || ubq->ubq_daemon->flags & PF_EXITING;
 }
 
 /* todo: handle partial completion */
@@ -1440,6 +1440,37 @@ static const struct blk_mq_ops ublk_mq_ops = {
         .timeout        = ublk_timeout,
 };
 
+static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
+{
+        int i;
+
+        /* All old ioucmds have to be completed */
+        ubq->nr_io_ready = 0;
+
+        /*
+         * old daemon is PF_EXITING, put it now
+         *
+         * It could be NULL in case of closing one quisced device.
+         */
+        if (ubq->ubq_daemon)
+                put_task_struct(ubq->ubq_daemon);
+        /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
+        ubq->ubq_daemon = NULL;
+        ubq->timeout = false;
+
+        for (i = 0; i < ubq->q_depth; i++) {
+                struct ublk_io *io = &ubq->ios[i];
+
+                /*
+                 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch
+                 * io->cmd
+                 */
+                io->flags &= UBLK_IO_FLAG_CANCELED;
+                io->cmd = NULL;
+                io->addr = 0;
+        }
+}
+
 static int ublk_ch_open(struct inode *inode, struct file *filp)
 {
         struct ublk_device *ub = container_of(inode->i_cdev,
@@ -1451,10 +1482,26 @@ static int ublk_ch_open(struct inode *inode, struct file *filp)
         return 0;
 }
 
+static void ublk_reset_ch_dev(struct ublk_device *ub)
+{
+        int i;
+
+        for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+                ublk_queue_reinit(ub, ublk_get_queue(ub, i));
+
+        /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
+        ub->mm = NULL;
+        ub->nr_queues_ready = 0;
+        ub->nr_privileged_daemon = 0;
+}
+
 static int ublk_ch_release(struct inode *inode, struct file *filp)
 {
         struct ublk_device *ub = filp->private_data;
 
+        /* all uring_cmd has been done now, reset device & ubq */
+        ublk_reset_ch_dev(ub);
+
         clear_bit(UB_STATE_OPEN, &ub->state);
         return 0;
 }
@@ -1801,6 +1848,24 @@ static void ublk_nosrv_work(struct work_struct *work)
         ublk_cancel_dev(ub);
 }
 
+/* reset ublk io_uring queue & io flags */
+static void ublk_reset_io_flags(struct ublk_device *ub)
+{
+        int i, j;
+
+        for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+                struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+                /* UBLK_IO_FLAG_CANCELED can be cleared now */
+                spin_lock(&ubq->cancel_lock);
+                for (j = 0; j < ubq->q_depth; j++)
+                        ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
+                spin_unlock(&ubq->cancel_lock);
+                ubq->canceling = false;
+                ubq->fail_io = false;
+        }
+}
+
 /* device can only be started after all IOs are ready */
 static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
         __must_hold(&ub->mutex)
@@ -1814,8 +1879,12 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
                 if (capable(CAP_SYS_ADMIN))
                         ub->nr_privileged_daemon++;
         }
-        if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
+
+        if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
+                /* now we are ready for handling ublk io request */
+                ublk_reset_io_flags(ub);
                 complete_all(&ub->completion);
+        }
 }
 
 static inline int ublk_check_cmd_op(u32 cmd_op)
@@ -2866,42 +2935,15 @@ static int ublk_ctrl_set_params(struct ublk_device *ub,
         return ret;
 }
 
-static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
-{
-        int i;
-
-        WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
-
-        /* All old ioucmds have to be completed */
-        ubq->nr_io_ready = 0;
-        /* old daemon is PF_EXITING, put it now */
-        put_task_struct(ubq->ubq_daemon);
-        /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
-        ubq->ubq_daemon = NULL;
-        ubq->timeout = false;
-
-        for (i = 0; i < ubq->q_depth; i++) {
-                struct ublk_io *io = &ubq->ios[i];
-
-                /* forget everything now and be ready for new FETCH_REQ */
-                io->flags = 0;
-                io->cmd = NULL;
-                io->addr = 0;
-        }
-}
-
 static int ublk_ctrl_start_recovery(struct ublk_device *ub,
                 struct io_uring_cmd *cmd)
 {
         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
         int ret = -EINVAL;
-        int i;
 
         mutex_lock(&ub->mutex);
         if (ublk_nosrv_should_stop_dev(ub))
                 goto out_unlock;
-        if (!ub->nr_queues_ready)
-                goto out_unlock;
         /*
          * START_RECOVERY is only allowd after:
          *
@@ -2925,12 +2967,6 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
                 goto out_unlock;
         }
         pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
-        for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
-                ublk_queue_reinit(ub, ublk_get_queue(ub, i));
-        /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
-        ub->mm = NULL;
-        ub->nr_queues_ready = 0;
-        ub->nr_privileged_daemon = 0;
         init_completion(&ub->completion);
         ret = 0;
 out_unlock:
@@ -2944,7 +2980,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
         const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
         int ublksrv_pid = (int)header->data[0];
         int ret = -EINVAL;
-        int i;
 
         pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
                  __func__, ub->dev_info.nr_hw_queues, header->dev_id);
@@ -2964,22 +2999,10 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
                 goto out_unlock;
         }
         ub->dev_info.ublksrv_pid = ublksrv_pid;
+        ub->dev_info.state = UBLK_S_DEV_LIVE;
         pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
                         __func__, ublksrv_pid, header->dev_id);
-
-        blk_mq_quiesce_queue(ub->ub_disk->queue);
-        ub->dev_info.state = UBLK_S_DEV_LIVE;
-        for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
-                struct ublk_queue *ubq = ublk_get_queue(ub, i);
-
-                ubq->canceling = false;
-                ubq->fail_io = false;
-        }
-        blk_mq_unquiesce_queue(ub->ub_disk->queue);
-        pr_devel("%s: queue unquiesced, dev id %d.\n",
-                 __func__, header->dev_id);
         blk_mq_kick_requeue_list(ub->ub_disk->queue);
-
         ret = 0;
 out_unlock:
         mutex_unlock(&ub->mutex);
From: Uday Shankar <ushankar@purestorage.com>
[ Upstream commit 82a8a30c581bbbe653d33c6ce2ef67e3072c7f12 ]
There are currently two ways in which ublk server exit is detected by ublk_drv:
1. uring_cmd cancellation. If there are any outstanding uring_cmds
   which have not been completed to the ublk server when it exits,
   io_uring calls the uring_cmd callback with a special cancellation
   flag as the issuing task is exiting.

2. I/O timeout. This is needed in addition to the above to handle the
   "saturated queue" case, when all I/Os for a given queue are in the
   ublk server, and therefore there are no outstanding uring_cmds to
   cancel when the ublk server exits.
There are a couple of issues with this approach:
- It is complex and inelegant to have two methods to detect the same
  condition.

- The second method detects ublk server exit only after a long delay
  (~30s, the default timeout assigned by the block layer). This delays
  the nosrv behavior from kicking in and potential subsequent recovery
  of the device.
The second issue is brought to light by the new test_generic_06, which will be added in a following patch. It fails before this fix:
selftests: ublk: test_generic_06.sh
dev id is 0
dd: error writing '/dev/ublkb0': Input/output error
1+0 records in
0+0 records out
0 bytes copied, 30.0611 s, 0.0 kB/s
DEAD
dd took 31 seconds to exit (>= 5s tolerance)!
generic_06 : [FAIL]
Fix this by instead detecting and handling ublk server exit in the character file release callback. This has several advantages:
- This one place can handle both saturated and unsaturated queues.
  Thus, it replaces both preexisting methods of detecting ublk server
  exit.

- It runs quickly on ublk server exit - there is no 30s delay.

- It starts the process of removing task references in ublk_drv. This
  is needed if we want to relax restrictions in the driver like letting
  only one thread serve each queue.
There is also the disadvantage that the character file release callback can also be triggered by an intentional close of the file, which is a significant behavior change. Preexisting ublk servers (libublksrv) depend on the ability to open/close the file multiple times. To address this, only transition to a nosrv state if the file is released while the ublk device is live. This allows programs to open/close the file multiple times during setup.

It is still a behavior change if a ublk server decides to close/reopen the file while the device is LIVE (i.e. while it is responsible for serving I/O), but that would be highly unusual. This behavior is in line with what is done by FUSE, which is very similar to ublk in that a userspace daemon provides services traditionally provided by the kernel.
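As a hedged sketch of that gating (ublk_release_sketch() is hypothetical, locking and the quiesce/abort steps are elided; the authoritative logic is the ublk_ch_release() hunk below), a disk reference doubles as the liveness test:

/* Sketch only: distinguish setup-time close from ublk server exit. */
static int ublk_release_sketch(struct ublk_device *ub)
{
        struct gendisk *disk = ublk_get_disk(ub);

        if (!disk)
                return 0;       /* not LIVE: open/close during setup is a no-op */

        /* device was LIVE, so this close means the server is gone */
        if (ublk_nosrv_should_stop_dev(ub))
                ublk_stop_dev_unlocked(ub);               /* tear down the disk */
        else if (ublk_nosrv_dev_should_queue_io(ub))
                ub->dev_info.state = UBLK_S_DEV_QUIESCED; /* wait for recovery */
        else
                ub->dev_info.state = UBLK_S_DEV_FAIL_IO;  /* fail further IO */

        ublk_put_disk(disk);
        return 0;
}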
With this change in, the new test (and all other selftests, and all ublksrv tests) pass:
selftests: ublk: test_generic_06.sh
dev id is 0
dd: error writing '/dev/ublkb0': Input/output error
1+0 records in
0+0 records out
0 bytes copied, 0.0376731 s, 0.0 kB/s
DEAD
generic_04 : [PASS]
Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-6-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 223 ++++++++++++++++++++++-----------------
 1 file changed, 124 insertions(+), 99 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c619df880c72..652742db0396 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -194,8 +194,6 @@ struct ublk_device {
         struct completion       completion;
         unsigned int            nr_queues_ready;
         unsigned int            nr_privileged_daemon;
-
-        struct work_struct      nosrv_work;
 };
 
 /* header of ublk_params */
@@ -204,7 +202,10 @@ struct ublk_params_header {
         __u32   types;
 };
 
-static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
+
+static void ublk_stop_dev_unlocked(struct ublk_device *ub);
+static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
+static void __ublk_quiesce_dev(struct ublk_device *ub);
 
 static inline unsigned int ublk_req_build_flags(struct request *req);
 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
@@ -1306,8 +1307,6 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
 static enum blk_eh_timer_return ublk_timeout(struct request *rq)
 {
         struct ublk_queue *ubq = rq->mq_hctx->driver_data;
-        unsigned int nr_inflight = 0;
-        int i;
 
         if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
                 if (!ubq->timeout) {
@@ -1318,26 +1317,6 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
                 return BLK_EH_DONE;
         }
 
-        if (!ubq_daemon_is_dying(ubq))
-                return BLK_EH_RESET_TIMER;
-
-        for (i = 0; i < ubq->q_depth; i++) {
-                struct ublk_io *io = &ubq->ios[i];
-
-                if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
-                        nr_inflight++;
-        }
-
-        /* cancelable uring_cmd can't help us if all commands are in-flight */
-        if (nr_inflight == ubq->q_depth) {
-                struct ublk_device *ub = ubq->dev;
-
-                if (ublk_abort_requests(ub, ubq)) {
-                        schedule_work(&ub->nosrv_work);
-                }
-                return BLK_EH_DONE;
-        }
-
         return BLK_EH_RESET_TIMER;
 }
 
@@ -1495,13 +1474,105 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
         ub->nr_privileged_daemon = 0;
 }
 
+static struct gendisk *ublk_get_disk(struct ublk_device *ub)
+{
+        struct gendisk *disk;
+
+        spin_lock(&ub->lock);
+        disk = ub->ub_disk;
+        if (disk)
+                get_device(disk_to_dev(disk));
+        spin_unlock(&ub->lock);
+
+        return disk;
+}
+
+static void ublk_put_disk(struct gendisk *disk)
+{
+        if (disk)
+                put_device(disk_to_dev(disk));
+}
+
 static int ublk_ch_release(struct inode *inode, struct file *filp)
 {
         struct ublk_device *ub = filp->private_data;
+        struct gendisk *disk;
+        int i;
+
+        /*
+         * disk isn't attached yet, either device isn't live, or it has
+         * been removed already, so we needn't to do anything
+         */
+        disk = ublk_get_disk(ub);
+        if (!disk)
+                goto out;
+
+        /*
+         * All uring_cmd are done now, so abort any request outstanding to
+         * the ublk server
+         *
+         * This can be done in lockless way because ublk server has been
+         * gone
+         *
+         * More importantly, we have to provide forward progress guarantee
+         * without holding ub->mutex, otherwise control task grabbing
+         * ub->mutex triggers deadlock
+         *
+         * All requests may be inflight, so ->canceling may not be set, set
+         * it now.
+         */
+        for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+                struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+                ubq->canceling = true;
+                ublk_abort_queue(ub, ubq);
+        }
+        blk_mq_kick_requeue_list(disk->queue);
+
+        /*
+         * All infligh requests have been completed or requeued and any new
+         * request will be failed or requeued via `->canceling` now, so it is
+         * fine to grab ub->mutex now.
+         */
+        mutex_lock(&ub->mutex);
+
+        /* double check after grabbing lock */
+        if (!ub->ub_disk)
+                goto unlock;
+
+        /*
+         * Transition the device to the nosrv state. What exactly this
+         * means depends on the recovery flags
+         */
+        blk_mq_quiesce_queue(disk->queue);
+        if (ublk_nosrv_should_stop_dev(ub)) {
+                /*
+                 * Allow any pending/future I/O to pass through quickly
+                 * with an error. This is needed because del_gendisk
+                 * waits for all pending I/O to complete
+                 */
+                for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+                        ublk_get_queue(ub, i)->force_abort = true;
+                blk_mq_unquiesce_queue(disk->queue);
+
+                ublk_stop_dev_unlocked(ub);
+        } else {
+                if (ublk_nosrv_dev_should_queue_io(ub)) {
+                        __ublk_quiesce_dev(ub);
+                } else {
+                        ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
+                        for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+                                ublk_get_queue(ub, i)->fail_io = true;
+                }
+                blk_mq_unquiesce_queue(disk->queue);
+        }
+unlock:
+        mutex_unlock(&ub->mutex);
+        ublk_put_disk(disk);
 
         /* all uring_cmd has been done now, reset device & ubq */
         ublk_reset_ch_dev(ub);
-
+out:
         clear_bit(UB_STATE_OPEN, &ub->state);
         return 0;
 }
@@ -1597,37 +1668,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 }
 
 /* Must be called when queue is frozen */
-static bool ublk_mark_queue_canceling(struct ublk_queue *ubq)
+static void ublk_mark_queue_canceling(struct ublk_queue *ubq)
 {
-        bool canceled;
-
         spin_lock(&ubq->cancel_lock);
-        canceled = ubq->canceling;
-        if (!canceled)
+        if (!ubq->canceling)
                 ubq->canceling = true;
         spin_unlock(&ubq->cancel_lock);
-
-        return canceled;
 }
 
-static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
+static void ublk_start_cancel(struct ublk_queue *ubq)
 {
-        bool was_canceled = ubq->canceling;
-        struct gendisk *disk;
-
-        if (was_canceled)
-                return false;
-
-        spin_lock(&ub->lock);
-        disk = ub->ub_disk;
-        if (disk)
-                get_device(disk_to_dev(disk));
-        spin_unlock(&ub->lock);
+        struct ublk_device *ub = ubq->dev;
+        struct gendisk *disk = ublk_get_disk(ub);
 
         /* Our disk has been dead */
         if (!disk)
-                return false;
-
+                return;
         /*
          * Now we are serialized with ublk_queue_rq()
          *
@@ -1636,15 +1692,9 @@ static void ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
          * touch completed uring_cmd
          */
         blk_mq_quiesce_queue(disk->queue);
-        was_canceled = ublk_mark_queue_canceling(ubq);
-        if (!was_canceled) {
-                /* abort queue is for making forward progress */
-                ublk_abort_queue(ub, ubq);
-        }
+        ublk_mark_queue_canceling(ubq);
         blk_mq_unquiesce_queue(disk->queue);
-        put_device(disk_to_dev(disk));
-
-        return !was_canceled;
+        ublk_put_disk(disk);
 }
 
 static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
@@ -1668,6 +1718,17 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
 /*
  * The ublk char device won't be closed when calling cancel fn, so both
  * ublk device and queue are guaranteed to be live
+ *
+ * Two-stage cancel:
+ *
+ * - make every active uring_cmd done in ->cancel_fn()
+ *
+ * - aborting inflight ublk IO requests in ublk char device release handler,
+ *   which depends on 1st stage because device can only be closed iff all
+ *   uring_cmd are done
+ *
+ * Do _not_ try to acquire ub->mutex before all inflight requests are
+ * aborted, otherwise deadlock may be caused.
  */
 static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
                 unsigned int issue_flags)
@@ -1675,8 +1736,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
         struct ublk_queue *ubq = pdu->ubq;
         struct task_struct *task;
-        struct ublk_device *ub;
-        bool need_schedule;
         struct ublk_io *io;
 
         if (WARN_ON_ONCE(!ubq))
                 return;
@@ -1689,16 +1748,12 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
         if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
                 return;
 
-        ub = ubq->dev;
-        need_schedule = ublk_abort_requests(ub, ubq);
+        if (!ubq->canceling)
+                ublk_start_cancel(ubq);
 
         io = &ubq->ios[pdu->tag];
         WARN_ON_ONCE(io->cmd != cmd);
         ublk_cancel_cmd(ubq, io, issue_flags);
-
-        if (need_schedule) {
-                schedule_work(&ub->nosrv_work);
-        }
 }
 
 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1757,13 +1812,11 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
                         __func__, ub->dev_info.dev_id,
                         ub->dev_info.state == UBLK_S_DEV_LIVE ?
                         "LIVE" : "QUIESCED");
-        blk_mq_quiesce_queue(ub->ub_disk->queue);
         /* mark every queue as canceling */
         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
                 ublk_get_queue(ub, i)->canceling = true;
         ublk_wait_tagset_rqs_idle(ub);
         ub->dev_info.state = UBLK_S_DEV_QUIESCED;
-        blk_mq_unquiesce_queue(ub->ub_disk->queue);
 }
 
 static void ublk_force_abort_dev(struct ublk_device *ub)
@@ -1800,50 +1853,25 @@ static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
         return disk;
 }
 
-static void ublk_stop_dev(struct ublk_device *ub)
+static void ublk_stop_dev_unlocked(struct ublk_device *ub)
+        __must_hold(&ub->mutex)
 {
         struct gendisk *disk;
 
-        mutex_lock(&ub->mutex);
         if (ub->dev_info.state == UBLK_S_DEV_DEAD)
-                goto unlock;
+                return;
+
         if (ublk_nosrv_dev_should_queue_io(ub))
                 ublk_force_abort_dev(ub);
         del_gendisk(ub->ub_disk);
         disk = ublk_detach_disk(ub);
         put_disk(disk);
- unlock:
-        mutex_unlock(&ub->mutex);
-        ublk_cancel_dev(ub);
 }
 
-static void ublk_nosrv_work(struct work_struct *work)
+static void ublk_stop_dev(struct ublk_device *ub)
 {
-        struct ublk_device *ub =
-                container_of(work, struct ublk_device, nosrv_work);
-        int i;
-
-        if (ublk_nosrv_should_stop_dev(ub)) {
-                ublk_stop_dev(ub);
-                return;
-        }
-
         mutex_lock(&ub->mutex);
-        if (ub->dev_info.state != UBLK_S_DEV_LIVE)
-                goto unlock;
-
-        if (ublk_nosrv_dev_should_queue_io(ub)) {
-                __ublk_quiesce_dev(ub);
-        } else {
-                blk_mq_quiesce_queue(ub->ub_disk->queue);
-                ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
-                for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
-                        ublk_get_queue(ub, i)->fail_io = true;
-                }
-                blk_mq_unquiesce_queue(ub->ub_disk->queue);
-        }
-
- unlock:
+        ublk_stop_dev_unlocked(ub);
         mutex_unlock(&ub->mutex);
         ublk_cancel_dev(ub);
 }
@@ -2419,7 +2447,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
 static void ublk_remove(struct ublk_device *ub)
 {
         ublk_stop_dev(ub);
-        cancel_work_sync(&ub->nosrv_work);
         cdev_device_del(&ub->cdev, &ub->cdev_dev);
         ublk_put_device(ub);
         ublks_added--;
@@ -2693,7 +2720,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
                 goto out_unlock;
         mutex_init(&ub->mutex);
         spin_lock_init(&ub->lock);
-        INIT_WORK(&ub->nosrv_work, ublk_nosrv_work);
 
         ret = ublk_alloc_dev_number(ub, header->dev_id);
         if (ret < 0)
@@ -2828,7 +2854,6 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
 static int ublk_ctrl_stop_dev(struct ublk_device *ub)
 {
         ublk_stop_dev(ub);
-        cancel_work_sync(&ub->nosrv_work);
         return 0;
 }
From: Ming Lei <ming.lei@redhat.com>
[ Upstream commit 736b005b413a172670711ee17cab3c8ccab83223 ]
Remove __ublk_quiesce_dev() and open-code the update of the device state to QUIESCED.

Draining inflight requests in __ublk_quiesce_dev() is no longer needed, because all inflight requests are now aborted in the ublk char device release handler.

Setting ->canceling in __ublk_quiesce_dev() is no longer needed either, because it is now done unconditionally in ublk_ch_release().
Reviewed-by: Uday Shankar <ushankar@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 652742db0396..c3f576a9dbf2 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -205,7 +205,6 @@ struct ublk_params_header {
 
 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
-static void __ublk_quiesce_dev(struct ublk_device *ub);
 
 static inline unsigned int ublk_req_build_flags(struct request *req);
 static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
@@ -1558,7 +1557,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
                 ublk_stop_dev_unlocked(ub);
         } else {
                 if (ublk_nosrv_dev_should_queue_io(ub)) {
-                        __ublk_quiesce_dev(ub);
+                        /* ->canceling is set and all requests are aborted */
+                        ub->dev_info.state = UBLK_S_DEV_QUIESCED;
                 } else {
                         ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
                         for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
@@ -1804,21 +1804,6 @@ static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
         }
 }
 
-static void __ublk_quiesce_dev(struct ublk_device *ub)
-{
-        int i;
-
-        pr_devel("%s: quiesce ub: dev_id %d state %s\n",
-                        __func__, ub->dev_info.dev_id,
-                        ub->dev_info.state == UBLK_S_DEV_LIVE ?
-                        "LIVE" : "QUIESCED");
-        /* mark every queue as canceling */
-        for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
-                ublk_get_queue(ub, i)->canceling = true;
-        ublk_wait_tagset_rqs_idle(ub);
-        ub->dev_info.state = UBLK_S_DEV_QUIESCED;
-}
-
 static void ublk_force_abort_dev(struct ublk_device *ub)
 {
         int i;
From: Ming Lei <ming.lei@redhat.com>
[ Upstream commit e63d2228ef831af36f963b3ab8604160cfff84c1 ]
Now that ublk_abort_queue() has moved to the ublk char device release handler, and the request queue is effectively "quiesced" (either ->canceling was set from the uring_cmd cancel function, or all IOs are inflight and can't be completed by the ublk server), things become much easier:

- all uring_cmds are done, so there is no need to mark the io as UBLK_IO_FLAG_ABORTED for handling completion from uring_cmd

- the ublk char device is closed, so no one can hold an IO request reference any more, and we can simply complete the request, or requeue it in the ublk_nosrv_should_reissue_outstanding case.
Reviewed-by: Uday Shankar <ushankar@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-8-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 82 ++++++++++------------------------------
 1 file changed, 20 insertions(+), 62 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c3f576a9dbf2..6000147ac2a5 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -115,15 +115,6 @@ struct ublk_uring_cmd_pdu {
  */
 #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
 
-/*
- * IO command is aborted, so this flag is set in case of
- * !UBLK_IO_FLAG_ACTIVE.
- *
- * After this flag is observed, any pending or new incoming request
- * associated with this io command will be failed immediately
- */
-#define UBLK_IO_FLAG_ABORTED 0x04
-
 /*
  * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
  * get data buffer address from ublksrv.
@@ -1054,12 +1045,6 @@ static inline void __ublk_complete_rq(struct request *req)
         unsigned int unmapped_bytes;
         blk_status_t res = BLK_STS_OK;
 
-        /* called from ublk_abort_queue() code path */
-        if (io->flags & UBLK_IO_FLAG_ABORTED) {
-                res = BLK_STS_IOERR;
-                goto exit;
-        }
-
         /* failed read IO if nothing is read */
         if (!io->res && req_op(req) == REQ_OP_READ)
                 io->res = -EIO;
@@ -1109,47 +1094,6 @@ static void ublk_complete_rq(struct kref *ref)
         __ublk_complete_rq(req);
 }
 
-static void ublk_do_fail_rq(struct request *req)
-{
-        struct ublk_queue *ubq = req->mq_hctx->driver_data;
-
-        if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
-                blk_mq_requeue_request(req, false);
-        else
-                __ublk_complete_rq(req);
-}
-
-static void ublk_fail_rq_fn(struct kref *ref)
-{
-        struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
-                        ref);
-        struct request *req = blk_mq_rq_from_pdu(data);
-
-        ublk_do_fail_rq(req);
-}
-
-/*
- * Since ublk_rq_task_work_cb always fails requests immediately during
- * exiting, __ublk_fail_req() is only called from abort context during
- * exiting. So lock is unnecessary.
- *
- * Also aborting may not be started yet, keep in mind that one failed
- * request may be issued by block layer again.
- */
-static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
-                struct request *req)
-{
-        WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
-
-        if (ublk_need_req_ref(ubq)) {
-                struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
-                kref_put(&data->ref, ublk_fail_rq_fn);
-        } else {
-                ublk_do_fail_rq(req);
-        }
-}
-
 static void ubq_complete_io_cmd(struct ublk_io *io, int res,
                                 unsigned issue_flags)
 {
@@ -1639,10 +1583,26 @@ static void ublk_commit_completion(struct ublk_device *ub,
         ublk_put_req_ref(ubq, req);
 }
 
+static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+                struct request *req)
+{
+        WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
+
+        if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
+                blk_mq_requeue_request(req, false);
+        else {
+                io->res = -EIO;
+                __ublk_complete_rq(req);
+        }
+}
+
 /*
- * Called from ubq_daemon context via cancel fn, meantime quiesce ublk
- * blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon
- * context, so everything is serialized.
+ * Called from ublk char device release handler, when any uring_cmd is
+ * done, meantime request queue is "quiesced" since all inflight requests
+ * can't be completed because ublk server is dead.
+ *
+ * So no one can hold our request IO reference any more, simply ignore the
+ * reference, and complete the request immediately
  */
 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 {
@@ -1659,10 +1619,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
                  * will do it
                  */
                 rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
-                if (rq && blk_mq_request_started(rq)) {
-                        io->flags |= UBLK_IO_FLAG_ABORTED;
+                if (rq && blk_mq_request_started(rq))
                         __ublk_fail_req(ubq, io, rq);
-                }
         }
 }
From: Ming Lei <ming.lei@redhat.com>
[ Upstream commit f40139fde5278d81af3227444fd6e76a76b9506d ]
ublk_cancel_cmd() calls io_uring_cmd_done() to complete the uring_cmd, but task work for dispatching the request may already have been scheduled via io_uring_cmd_complete_in_task(), in which case a kernel crash can be triggered.

Fix it by not trying to cancel the command if the ublk block request has been started.
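For illustration, the race window looks roughly like this (a sketch, not driver code; the guard in the hunk below closes it by leaving started requests to their task work):

/*
 * queue path                          cancel path
 * ----------                          -----------
 * ublk_queue_rq()
 *   io_uring_cmd_complete_in_task()   ublk_cancel_cmd()
 *     -> task work scheduled            io_uring_cmd_done()
 *                                         -> uring_cmd completed
 * ... task work runs and touches the completed uring_cmd -> crash
 *
 * If blk_mq_request_started(req) is true, the cancel path backs off:
 * the request is either aborted (and the uring_cmd canceled on a later
 * pass) or finished by ublk_dispatch_req().
 */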
Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd")
Reported-by: Jared Holzman <jholzman@nvidia.com>
Tested-by: Jared Holzman <jholzman@nvidia.com>
Closes: https://lore.kernel.org/linux-block/d2179120-171b-47ba-b664-23242981ef19@nvi...
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250425013742.1079549-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6000147ac2a5..348c4feb7a2d 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1655,14 +1655,31 @@ static void ublk_start_cancel(struct ublk_queue *ubq)
         ublk_put_disk(disk);
 }
 
-static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
+static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
                 unsigned int issue_flags)
 {
+        struct ublk_io *io = &ubq->ios[tag];
+        struct ublk_device *ub = ubq->dev;
+        struct request *req;
         bool done;
 
         if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
                 return;
 
+        /*
+         * Don't try to cancel this command if the request is started for
+         * avoiding race between io_uring_cmd_done() and
+         * io_uring_cmd_complete_in_task().
+         *
+         * Either the started request will be aborted via __ublk_abort_rq(),
+         * then this uring_cmd is canceled next time, or it will be done in
+         * task work function ublk_dispatch_req() because io_uring guarantees
+         * that ublk_dispatch_req() is always called
+         */
+        req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+        if (req && blk_mq_request_started(req))
+                return;
+
         spin_lock(&ubq->cancel_lock);
         done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
         if (!done)
@@ -1694,7 +1711,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
         struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
         struct ublk_queue *ubq = pdu->ubq;
         struct task_struct *task;
-        struct ublk_io *io;
 
         if (WARN_ON_ONCE(!ubq))
                 return;
@@ -1709,9 +1725,8 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
         if (!ubq->canceling)
                 ublk_start_cancel(ubq);
 
-        io = &ubq->ios[pdu->tag];
-        WARN_ON_ONCE(io->cmd != cmd);
-        ublk_cancel_cmd(ubq, io, issue_flags);
+        WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd);
+        ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
 }
 
 static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1724,7 +1739,7 @@ static void ublk_cancel_queue(struct ublk_queue *ubq)
         int i;
 
         for (i = 0; i < ubq->q_depth; i++)
-                ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
+                ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
 }
 
 /* Cancel all pending commands, must be called after del_gendisk() returns */
On Wed, May 7, 2025 at 5:47 PM Jared Holzman <jholzman@nvidia.com> wrote:
> This patchset backports a series of ublk fixes from upstream to 6.14-stable.
>
> Patch 7 fixes a race that can cause a kernel panic when the ublk server daemon is exiting.
>
> It depends on patches 1-6, which simplify and improve IO canceling when the ublk server daemon is exiting, as described here:
>
> https://lore.kernel.org/linux-block/20250416035444.99569-1-ming.lei@redhat.c...
>
> Ming Lei (5):
>   ublk: add helper of ublk_need_map_io()
>   ublk: move device reset into ublk_ch_release()
>   ublk: remove __ublk_quiesce_dev()
>   ublk: simplify aborting ublk request
>   ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd
>
> Uday Shankar (2):
>   ublk: properly serialize all FETCH_REQs
>   ublk: improve detection and handling of ublk server exit
>
>  drivers/block/ublk_drv.c | 550 +++++++++++++++++++++------------------
>  1 file changed, 291 insertions(+), 259 deletions(-)
Looks fine to me:
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Thanks,
Ming