From: Ming Lei <ming.lei(a)redhat.com>
ublk_cancel_cmd() calls io_uring_cmd_done() to complete uring_cmd, but
we may have scheduled task work via io_uring_cmd_complete_in_task() for
dispatching request, then kernel crash can be triggered.
Fix it by not trying to canceling the command if ublk block request is
started.
Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd")
Reported-by: Jared Holzman <jholzman(a)nvidia.com>
Tested-by: Jared Holzman <jholzman(a)nvidia.com>
Closes: https://lore.kernel.org/linux-block/d2179120-171b-47ba-b664-23242981ef19@nv…
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20250425013742.1079549-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
drivers/block/ublk_drv.c | 27 +++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6000147ac2a5..348c4feb7a2d 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1655,14 +1655,31 @@ static void ublk_start_cancel(struct ublk_queue *ubq)
ublk_put_disk(disk);
}
-static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
+static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
unsigned int issue_flags)
{
+ struct ublk_io *io = &ubq->ios[tag];
+ struct ublk_device *ub = ubq->dev;
+ struct request *req;
bool done;
if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
return;
+ /*
+ * Don't try to cancel this command if the request is started for
+ * avoiding race between io_uring_cmd_done() and
+ * io_uring_cmd_complete_in_task().
+ *
+ * Either the started request will be aborted via __ublk_abort_rq(),
+ * then this uring_cmd is canceled next time, or it will be done in
+ * task work function ublk_dispatch_req() because io_uring guarantees
+ * that ublk_dispatch_req() is always called
+ */
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (req && blk_mq_request_started(req))
+ return;
+
spin_lock(&ubq->cancel_lock);
done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
if (!done)
@@ -1694,7 +1711,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct ublk_queue *ubq = pdu->ubq;
struct task_struct *task;
- struct ublk_io *io;
if (WARN_ON_ONCE(!ubq))
return;
@@ -1709,9 +1725,8 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
if (!ubq->canceling)
ublk_start_cancel(ubq);
- io = &ubq->ios[pdu->tag];
- WARN_ON_ONCE(io->cmd != cmd);
- ublk_cancel_cmd(ubq, io, issue_flags);
+ WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd);
+ ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
}
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1724,7 +1739,7 @@ static void ublk_cancel_queue(struct ublk_queue *ubq)
int i;
for (i = 0; i < ubq->q_depth; i++)
- ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
+ ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
}
/* Cancel all pending commands, must be called after del_gendisk() returns */
--
2.43.0
From: Ming Lei <ming.lei(a)redhat.com>
Now ublk_abort_queue() is moved to ublk char device release handler,
meantime our request queue is "quiesced" because either ->canceling was
set from uring_cmd cancel function or all IOs are inflight and can't be
completed by ublk server, things becomes easy much:
- all uring_cmd are done, so we needn't to mark io as UBLK_IO_FLAG_ABORTED
for handling completion from uring_cmd
- ublk char device is closed, no one can hold IO request reference any more,
so we can simply complete this request or requeue it for ublk_nosrv_should_reissue_outstanding.
Reviewed-by: Uday Shankar <ushankar(a)purestorage.com>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-8-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
drivers/block/ublk_drv.c | 82 ++++++++++------------------------------
1 file changed, 20 insertions(+), 62 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c3f576a9dbf2..6000147ac2a5 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -115,15 +115,6 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
-/*
- * IO command is aborted, so this flag is set in case of
- * !UBLK_IO_FLAG_ACTIVE.
- *
- * After this flag is observed, any pending or new incoming request
- * associated with this io command will be failed immediately
- */
-#define UBLK_IO_FLAG_ABORTED 0x04
-
/*
* UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
* get data buffer address from ublksrv.
@@ -1054,12 +1045,6 @@ static inline void __ublk_complete_rq(struct request *req)
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
- /* called from ublk_abort_queue() code path */
- if (io->flags & UBLK_IO_FLAG_ABORTED) {
- res = BLK_STS_IOERR;
- goto exit;
- }
-
/* failed read IO if nothing is read */
if (!io->res && req_op(req) == REQ_OP_READ)
io->res = -EIO;
@@ -1109,47 +1094,6 @@ static void ublk_complete_rq(struct kref *ref)
__ublk_complete_rq(req);
}
-static void ublk_do_fail_rq(struct request *req)
-{
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
-
- if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
- blk_mq_requeue_request(req, false);
- else
- __ublk_complete_rq(req);
-}
-
-static void ublk_fail_rq_fn(struct kref *ref)
-{
- struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
- ref);
- struct request *req = blk_mq_rq_from_pdu(data);
-
- ublk_do_fail_rq(req);
-}
-
-/*
- * Since ublk_rq_task_work_cb always fails requests immediately during
- * exiting, __ublk_fail_req() is only called from abort context during
- * exiting. So lock is unnecessary.
- *
- * Also aborting may not be started yet, keep in mind that one failed
- * request may be issued by block layer again.
- */
-static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
- struct request *req)
-{
- WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
-
- if (ublk_need_req_ref(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- kref_put(&data->ref, ublk_fail_rq_fn);
- } else {
- ublk_do_fail_rq(req);
- }
-}
-
static void ubq_complete_io_cmd(struct ublk_io *io, int res,
unsigned issue_flags)
{
@@ -1639,10 +1583,26 @@ static void ublk_commit_completion(struct ublk_device *ub,
ublk_put_req_ref(ubq, req);
}
+static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+ struct request *req)
+{
+ WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
+
+ if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
+ blk_mq_requeue_request(req, false);
+ else {
+ io->res = -EIO;
+ __ublk_complete_rq(req);
+ }
+}
+
/*
- * Called from ubq_daemon context via cancel fn, meantime quiesce ublk
- * blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon
- * context, so everything is serialized.
+ * Called from ublk char device release handler, when any uring_cmd is
+ * done, meantime request queue is "quiesced" since all inflight requests
+ * can't be completed because ublk server is dead.
+ *
+ * So no one can hold our request IO reference any more, simply ignore the
+ * reference, and complete the request immediately
*/
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
@@ -1659,10 +1619,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
* will do it
*/
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq && blk_mq_request_started(rq)) {
- io->flags |= UBLK_IO_FLAG_ABORTED;
+ if (rq && blk_mq_request_started(rq))
__ublk_fail_req(ubq, io, rq);
- }
}
}
}
--
2.43.0
From: Uday Shankar <ushankar(a)purestorage.com>
There are currently two ways in which ublk server exit is detected by
ublk_drv:
1. uring_cmd cancellation. If there are any outstanding uring_cmds which
have not been completed to the ublk server when it exits, io_uring
calls the uring_cmd callback with a special cancellation flag as the
issuing task is exiting.
2. I/O timeout. This is needed in addition to the above to handle the
"saturated queue" case, when all I/Os for a given queue are in the
ublk server, and therefore there are no outstanding uring_cmds to
cancel when the ublk server exits.
There are a couple of issues with this approach:
- It is complex and inelegant to have two methods to detect the same
condition
- The second method detects ublk server exit only after a long delay
(~30s, the default timeout assigned by the block layer). This delays
the nosrv behavior from kicking in and potential subsequent recovery
of the device.
The second issue is brought to light with the new test_generic_06 which
will be added in following patch. It fails before this fix:
selftests: ublk: test_generic_06.sh
dev id is 0
dd: error writing '/dev/ublkb0': Input/output error
1+0 records in
0+0 records out
0 bytes copied, 30.0611 s, 0.0 kB/s
DEAD
dd took 31 seconds to exit (>= 5s tolerance)!
generic_06 : [FAIL]
Fix this by instead detecting and handling ublk server exit in the
character file release callback. This has several advantages:
- This one place can handle both saturated and unsaturated queues. Thus,
it replaces both preexisting methods of detecting ublk server exit.
- It runs quickly on ublk server exit - there is no 30s delay.
- It starts the process of removing task references in ublk_drv. This is
needed if we want to relax restrictions in the driver like letting
only one thread serve each queue
There is also the disadvantage that the character file release callback
can also be triggered by intentional close of the file, which is a
significant behavior change. Preexisting ublk servers (libublksrv) are
dependent on the ability to open/close the file multiple times. To
address this, only transition to a nosrv state if the file is released
while the ublk device is live. This allows for programs to open/close
the file multiple times during setup. It is still a behavior change if a
ublk server decides to close/reopen the file while the device is LIVE
(i.e. while it is responsible for serving I/O), but that would be highly
unusual. This behavior is in line with what is done by FUSE, which is
very similar to ublk in that a userspace daemon is providing services
traditionally provided by the kernel.
With this change in, the new test (and all other selftests, and all
ublksrv tests) pass:
selftests: ublk: test_generic_06.sh
dev id is 0
dd: error writing '/dev/ublkb0': Input/output error
1+0 records in
0+0 records out
0 bytes copied, 0.0376731 s, 0.0 kB/s
DEAD
generic_04 : [PASS]
Signed-off-by: Uday Shankar <ushankar(a)purestorage.com>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-6-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
drivers/block/ublk_drv.c | 223 ++++++++++++++++++++++-----------------
1 file changed, 124 insertions(+), 99 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c619df880c72..652742db0396 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -194,8 +194,6 @@ struct ublk_device {
struct completion completion;
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
-
- struct work_struct nosrv_work;
};
/* header of ublk_params */
@@ -204,7 +202,10 @@ struct ublk_params_header {
__u32 types;
};
-static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
+
+static void ublk_stop_dev_unlocked(struct ublk_device *ub);
+static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
+static void __ublk_quiesce_dev(struct ublk_device *ub);
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
@@ -1306,8 +1307,6 @@ static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l)
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
- unsigned int nr_inflight = 0;
- int i;
if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
if (!ubq->timeout) {
@@ -1318,26 +1317,6 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
return BLK_EH_DONE;
}
- if (!ubq_daemon_is_dying(ubq))
- return BLK_EH_RESET_TIMER;
-
- for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
-
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
- nr_inflight++;
- }
-
- /* cancelable uring_cmd can't help us if all commands are in-flight */
- if (nr_inflight == ubq->q_depth) {
- struct ublk_device *ub = ubq->dev;
-
- if (ublk_abort_requests(ub, ubq)) {
- schedule_work(&ub->nosrv_work);
- }
- return BLK_EH_DONE;
- }
-
return BLK_EH_RESET_TIMER;
}
@@ -1495,13 +1474,105 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
ub->nr_privileged_daemon = 0;
}
+static struct gendisk *ublk_get_disk(struct ublk_device *ub)
+{
+ struct gendisk *disk;
+
+ spin_lock(&ub->lock);
+ disk = ub->ub_disk;
+ if (disk)
+ get_device(disk_to_dev(disk));
+ spin_unlock(&ub->lock);
+
+ return disk;
+}
+
+static void ublk_put_disk(struct gendisk *disk)
+{
+ if (disk)
+ put_device(disk_to_dev(disk));
+}
+
static int ublk_ch_release(struct inode *inode, struct file *filp)
{
struct ublk_device *ub = filp->private_data;
+ struct gendisk *disk;
+ int i;
+
+ /*
+ * disk isn't attached yet, either device isn't live, or it has
+ * been removed already, so we needn't to do anything
+ */
+ disk = ublk_get_disk(ub);
+ if (!disk)
+ goto out;
+
+ /*
+ * All uring_cmd are done now, so abort any request outstanding to
+ * the ublk server
+ *
+ * This can be done in lockless way because ublk server has been
+ * gone
+ *
+ * More importantly, we have to provide forward progress guarantee
+ * without holding ub->mutex, otherwise control task grabbing
+ * ub->mutex triggers deadlock
+ *
+ * All requests may be inflight, so ->canceling may not be set, set
+ * it now.
+ */
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ ubq->canceling = true;
+ ublk_abort_queue(ub, ubq);
+ }
+ blk_mq_kick_requeue_list(disk->queue);
+
+ /*
+ * All infligh requests have been completed or requeued and any new
+ * request will be failed or requeued via `->canceling` now, so it is
+ * fine to grab ub->mutex now.
+ */
+ mutex_lock(&ub->mutex);
+
+ /* double check after grabbing lock */
+ if (!ub->ub_disk)
+ goto unlock;
+
+ /*
+ * Transition the device to the nosrv state. What exactly this
+ * means depends on the recovery flags
+ */
+ blk_mq_quiesce_queue(disk->queue);
+ if (ublk_nosrv_should_stop_dev(ub)) {
+ /*
+ * Allow any pending/future I/O to pass through quickly
+ * with an error. This is needed because del_gendisk
+ * waits for all pending I/O to complete
+ */
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_get_queue(ub, i)->force_abort = true;
+ blk_mq_unquiesce_queue(disk->queue);
+
+ ublk_stop_dev_unlocked(ub);
+ } else {
+ if (ublk_nosrv_dev_should_queue_io(ub)) {
+ __ublk_quiesce_dev(ub);
+ } else {
+ ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+ ublk_get_queue(ub, i)->fail_io = true;
+ }
+ blk_mq_unquiesce_queue(disk->queue);
+ }
+unlock:
+ mutex_unlock(&ub->mutex);
+ ublk_put_disk(disk);
/* all uring_cmd has been done now, reset device & ubq */
ublk_reset_ch_dev(ub);
-
+out:
clear_bit(UB_STATE_OPEN, &ub->state);
return 0;
}
@@ -1597,37 +1668,22 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
}
/* Must be called when queue is frozen */
-static bool ublk_mark_queue_canceling(struct ublk_queue *ubq)
+static void ublk_mark_queue_canceling(struct ublk_queue *ubq)
{
- bool canceled;
-
spin_lock(&ubq->cancel_lock);
- canceled = ubq->canceling;
- if (!canceled)
+ if (!ubq->canceling)
ubq->canceling = true;
spin_unlock(&ubq->cancel_lock);
-
- return canceled;
}
-static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
+static void ublk_start_cancel(struct ublk_queue *ubq)
{
- bool was_canceled = ubq->canceling;
- struct gendisk *disk;
-
- if (was_canceled)
- return false;
-
- spin_lock(&ub->lock);
- disk = ub->ub_disk;
- if (disk)
- get_device(disk_to_dev(disk));
- spin_unlock(&ub->lock);
+ struct ublk_device *ub = ubq->dev;
+ struct gendisk *disk = ublk_get_disk(ub);
/* Our disk has been dead */
if (!disk)
- return false;
-
+ return;
/*
* Now we are serialized with ublk_queue_rq()
*
@@ -1636,15 +1692,9 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
* touch completed uring_cmd
*/
blk_mq_quiesce_queue(disk->queue);
- was_canceled = ublk_mark_queue_canceling(ubq);
- if (!was_canceled) {
- /* abort queue is for making forward progress */
- ublk_abort_queue(ub, ubq);
- }
+ ublk_mark_queue_canceling(ubq);
blk_mq_unquiesce_queue(disk->queue);
- put_device(disk_to_dev(disk));
-
- return !was_canceled;
+ ublk_put_disk(disk);
}
static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
@@ -1668,6 +1718,17 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
/*
* The ublk char device won't be closed when calling cancel fn, so both
* ublk device and queue are guaranteed to be live
+ *
+ * Two-stage cancel:
+ *
+ * - make every active uring_cmd done in ->cancel_fn()
+ *
+ * - aborting inflight ublk IO requests in ublk char device release handler,
+ * which depends on 1st stage because device can only be closed iff all
+ * uring_cmd are done
+ *
+ * Do _not_ try to acquire ub->mutex before all inflight requests are
+ * aborted, otherwise deadlock may be caused.
*/
static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
unsigned int issue_flags)
@@ -1675,8 +1736,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct ublk_queue *ubq = pdu->ubq;
struct task_struct *task;
- struct ublk_device *ub;
- bool need_schedule;
struct ublk_io *io;
if (WARN_ON_ONCE(!ubq))
@@ -1689,16 +1748,12 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
return;
- ub = ubq->dev;
- need_schedule = ublk_abort_requests(ub, ubq);
+ if (!ubq->canceling)
+ ublk_start_cancel(ubq);
io = &ubq->ios[pdu->tag];
WARN_ON_ONCE(io->cmd != cmd);
ublk_cancel_cmd(ubq, io, issue_flags);
-
- if (need_schedule) {
- schedule_work(&ub->nosrv_work);
- }
}
static inline bool ublk_queue_ready(struct ublk_queue *ubq)
@@ -1757,13 +1812,11 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
__func__, ub->dev_info.dev_id,
ub->dev_info.state == UBLK_S_DEV_LIVE ?
"LIVE" : "QUIESCED");
- blk_mq_quiesce_queue(ub->ub_disk->queue);
/* mark every queue as canceling */
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_get_queue(ub, i)->canceling = true;
ublk_wait_tagset_rqs_idle(ub);
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
- blk_mq_unquiesce_queue(ub->ub_disk->queue);
}
static void ublk_force_abort_dev(struct ublk_device *ub)
@@ -1800,50 +1853,25 @@ static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
return disk;
}
-static void ublk_stop_dev(struct ublk_device *ub)
+static void ublk_stop_dev_unlocked(struct ublk_device *ub)
+ __must_hold(&ub->mutex)
{
struct gendisk *disk;
- mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
- goto unlock;
+ return;
+
if (ublk_nosrv_dev_should_queue_io(ub))
ublk_force_abort_dev(ub);
del_gendisk(ub->ub_disk);
disk = ublk_detach_disk(ub);
put_disk(disk);
- unlock:
- mutex_unlock(&ub->mutex);
- ublk_cancel_dev(ub);
}
-static void ublk_nosrv_work(struct work_struct *work)
+static void ublk_stop_dev(struct ublk_device *ub)
{
- struct ublk_device *ub =
- container_of(work, struct ublk_device, nosrv_work);
- int i;
-
- if (ublk_nosrv_should_stop_dev(ub)) {
- ublk_stop_dev(ub);
- return;
- }
-
mutex_lock(&ub->mutex);
- if (ub->dev_info.state != UBLK_S_DEV_LIVE)
- goto unlock;
-
- if (ublk_nosrv_dev_should_queue_io(ub)) {
- __ublk_quiesce_dev(ub);
- } else {
- blk_mq_quiesce_queue(ub->ub_disk->queue);
- ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- ublk_get_queue(ub, i)->fail_io = true;
- }
- blk_mq_unquiesce_queue(ub->ub_disk->queue);
- }
-
- unlock:
+ ublk_stop_dev_unlocked(ub);
mutex_unlock(&ub->mutex);
ublk_cancel_dev(ub);
}
@@ -2419,7 +2447,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
static void ublk_remove(struct ublk_device *ub)
{
ublk_stop_dev(ub);
- cancel_work_sync(&ub->nosrv_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev);
ublk_put_device(ub);
ublks_added--;
@@ -2693,7 +2720,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
goto out_unlock;
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
- INIT_WORK(&ub->nosrv_work, ublk_nosrv_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2828,7 +2854,6 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
static int ublk_ctrl_stop_dev(struct ublk_device *ub)
{
ublk_stop_dev(ub);
- cancel_work_sync(&ub->nosrv_work);
return 0;
}
--
2.43.0
From: Uday Shankar <ushankar(a)purestorage.com>
Most uring_cmds issued against ublk character devices are serialized
because each command affects only one queue, and there is an early check
which only allows a single task (the queue's ubq_daemon) to issue
uring_cmds against that queue. However, this mechanism does not work for
FETCH_REQs, since they are expected before ubq_daemon is set. Since
FETCH_REQs are only used at initialization and not in the fast path,
serialize them using the per-ublk-device mutex. This fixes a number of
data races that were previously possible if a badly behaved ublk server
decided to issue multiple FETCH_REQs against the same qid/tag
concurrently.
Reported-by: Caleb Sander Mateos <csander(a)purestorage.com>
Signed-off-by: Uday Shankar <ushankar(a)purestorage.com>
Signed-off-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/20250416035444.99569-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
---
drivers/block/ublk_drv.c | 77 +++++++++++++++++++++++++---------------
1 file changed, 49 insertions(+), 28 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 4e81505179c6..9345a6d8dbd8 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1803,8 +1803,8 @@ static void ublk_nosrv_work(struct work_struct *work)
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
+ __must_hold(&ub->mutex)
{
- mutex_lock(&ub->mutex);
ubq->nr_io_ready++;
if (ublk_queue_ready(ubq)) {
ubq->ubq_daemon = current;
@@ -1816,7 +1816,6 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
}
if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
complete_all(&ub->completion);
- mutex_unlock(&ub->mutex);
}
static inline int ublk_check_cmd_op(u32 cmd_op)
@@ -1855,6 +1854,52 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
+ struct ublk_io *io, __u64 buf_addr)
+{
+ struct ublk_device *ub = ubq->dev;
+ int ret = 0;
+
+ /*
+ * When handling FETCH command for setting up ublk uring queue,
+ * ub->mutex is the innermost lock, and we won't block for handling
+ * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
+ */
+ mutex_lock(&ub->mutex);
+ /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
+ if (ublk_queue_ready(ubq)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* allow each command to be FETCHed at most once */
+ if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
+
+ if (ublk_need_map_io(ubq)) {
+ /*
+ * FETCH_RQ has to provide IO buffer if NEED GET
+ * DATA is not enabled
+ */
+ if (!buf_addr && !ublk_need_get_data(ubq))
+ goto out;
+ } else if (buf_addr) {
+ /* User copy requires addr to be unset */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ublk_fill_io_cmd(io, cmd, buf_addr);
+ ublk_mark_io_ready(ub, ubq);
+out:
+ mutex_unlock(&ub->mutex);
+ return ret;
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1907,33 +1952,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
ret = -EINVAL;
switch (_IOC_NR(cmd_op)) {
case UBLK_IO_FETCH_REQ:
- /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
- if (ublk_queue_ready(ubq)) {
- ret = -EBUSY;
- goto out;
- }
- /*
- * The io is being handled by server, so COMMIT_RQ is expected
- * instead of FETCH_REQ
- */
- if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
- goto out;
-
- if (ublk_need_map_io(ubq)) {
- /*
- * FETCH_RQ has to provide IO buffer if NEED GET
- * DATA is not enabled
- */
- if (!ub_cmd->addr && !ublk_need_get_data(ubq))
- goto out;
- } else if (ub_cmd->addr) {
- /* User copy requires addr to be unset */
- ret = -EINVAL;
+ ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
+ if (ret)
goto out;
- }
-
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_mark_io_ready(ub, ubq);
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
--
2.43.0
From: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
[ Upstream commit e56088a13708757da68ad035269d69b93ac8c389 ]
The public datasheets of the following Amlogic SoCs describe a typical
resistor value for the built-in pull up/down resistor:
- Meson8/8b/8m2: not documented
- GXBB (S905): 60 kOhm
- GXL (S905X): 60 kOhm
- GXM (S912): 60 kOhm
- G12B (S922X): 60 kOhm
- SM1 (S905D3): 60 kOhm
The public G12B and SM1 datasheets additionally state min and max
values:
- min value: 50 kOhm for both, pull-up and pull-down
- max value for the pull-up: 70 kOhm
- max value for the pull-down: 130 kOhm
Use 60 kOhm in the pinctrl-meson driver as well so it's shown in the
debugfs output. It may not be accurate for Meson8/8b/8m2 but in reality
60 kOhm is closer to the actual value than 1 Ohm.
Signed-off-by: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Reviewed-by: Neil Armstrong <neil.armstrong(a)linaro.org>
Link: https://lore.kernel.org/20250329190132.855196-1-martin.blumenstingl@googlem…
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/pinctrl/meson/pinctrl-meson.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c
index aba479a1150c8..f3b381370e5ed 100644
--- a/drivers/pinctrl/meson/pinctrl-meson.c
+++ b/drivers/pinctrl/meson/pinctrl-meson.c
@@ -480,7 +480,7 @@ static int meson_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin,
case PIN_CONFIG_BIAS_PULL_DOWN:
case PIN_CONFIG_BIAS_PULL_UP:
if (meson_pinconf_get_pull(pc, pin) == param)
- arg = 1;
+ arg = 60000;
else
return -EINVAL;
break;
--
2.39.5
From: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
[ Upstream commit e56088a13708757da68ad035269d69b93ac8c389 ]
The public datasheets of the following Amlogic SoCs describe a typical
resistor value for the built-in pull up/down resistor:
- Meson8/8b/8m2: not documented
- GXBB (S905): 60 kOhm
- GXL (S905X): 60 kOhm
- GXM (S912): 60 kOhm
- G12B (S922X): 60 kOhm
- SM1 (S905D3): 60 kOhm
The public G12B and SM1 datasheets additionally state min and max
values:
- min value: 50 kOhm for both, pull-up and pull-down
- max value for the pull-up: 70 kOhm
- max value for the pull-down: 130 kOhm
Use 60 kOhm in the pinctrl-meson driver as well so it's shown in the
debugfs output. It may not be accurate for Meson8/8b/8m2 but in reality
60 kOhm is closer to the actual value than 1 Ohm.
Signed-off-by: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Reviewed-by: Neil Armstrong <neil.armstrong(a)linaro.org>
Link: https://lore.kernel.org/20250329190132.855196-1-martin.blumenstingl@googlem…
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/pinctrl/meson/pinctrl-meson.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c
index 20683cd072bb0..ae72edba8a1f0 100644
--- a/drivers/pinctrl/meson/pinctrl-meson.c
+++ b/drivers/pinctrl/meson/pinctrl-meson.c
@@ -483,7 +483,7 @@ static int meson_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin,
case PIN_CONFIG_BIAS_PULL_DOWN:
case PIN_CONFIG_BIAS_PULL_UP:
if (meson_pinconf_get_pull(pc, pin) == param)
- arg = 1;
+ arg = 60000;
else
return -EINVAL;
break;
--
2.39.5
From: Chenyuan Yang <chenyuan0y(a)gmail.com>
[ Upstream commit a9a69c3b38c89d7992fb53db4abb19104b531d32 ]
Incorrect types are used as sizeof() arguments in devm_kcalloc().
It should be sizeof(dai_link_data) for link_data instead of
sizeof(snd_soc_dai_link).
This is found by our static analysis tool.
Signed-off-by: Chenyuan Yang <chenyuan0y(a)gmail.com>
Link: https://patch.msgid.link/20250406210854.149316-1-chenyuan0y@gmail.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
sound/soc/fsl/imx-card.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/soc/fsl/imx-card.c b/sound/soc/fsl/imx-card.c
index 2b64c0384b6bb..9b14cda56b068 100644
--- a/sound/soc/fsl/imx-card.c
+++ b/sound/soc/fsl/imx-card.c
@@ -517,7 +517,7 @@ static int imx_card_parse_of(struct imx_card_data *data)
if (!card->dai_link)
return -ENOMEM;
- data->link_data = devm_kcalloc(dev, num_links, sizeof(*link), GFP_KERNEL);
+ data->link_data = devm_kcalloc(dev, num_links, sizeof(*link_data), GFP_KERNEL);
if (!data->link_data)
return -ENOMEM;
--
2.39.5
From: Chenyuan Yang <chenyuan0y(a)gmail.com>
[ Upstream commit a9a69c3b38c89d7992fb53db4abb19104b531d32 ]
Incorrect types are used as sizeof() arguments in devm_kcalloc().
It should be sizeof(dai_link_data) for link_data instead of
sizeof(snd_soc_dai_link).
This is found by our static analysis tool.
Signed-off-by: Chenyuan Yang <chenyuan0y(a)gmail.com>
Link: https://patch.msgid.link/20250406210854.149316-1-chenyuan0y@gmail.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
sound/soc/fsl/imx-card.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/soc/fsl/imx-card.c b/sound/soc/fsl/imx-card.c
index c6d55b21f9496..11430f9f49968 100644
--- a/sound/soc/fsl/imx-card.c
+++ b/sound/soc/fsl/imx-card.c
@@ -517,7 +517,7 @@ static int imx_card_parse_of(struct imx_card_data *data)
if (!card->dai_link)
return -ENOMEM;
- data->link_data = devm_kcalloc(dev, num_links, sizeof(*link), GFP_KERNEL);
+ data->link_data = devm_kcalloc(dev, num_links, sizeof(*link_data), GFP_KERNEL);
if (!data->link_data)
return -ENOMEM;
--
2.39.5
From: Chenyuan Yang <chenyuan0y(a)gmail.com>
[ Upstream commit a9a69c3b38c89d7992fb53db4abb19104b531d32 ]
Incorrect types are used as sizeof() arguments in devm_kcalloc().
It should be sizeof(dai_link_data) for link_data instead of
sizeof(snd_soc_dai_link).
This is found by our static analysis tool.
Signed-off-by: Chenyuan Yang <chenyuan0y(a)gmail.com>
Link: https://patch.msgid.link/20250406210854.149316-1-chenyuan0y@gmail.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
sound/soc/fsl/imx-card.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/soc/fsl/imx-card.c b/sound/soc/fsl/imx-card.c
index 7128bcf3a743e..bb304de5cc38a 100644
--- a/sound/soc/fsl/imx-card.c
+++ b/sound/soc/fsl/imx-card.c
@@ -517,7 +517,7 @@ static int imx_card_parse_of(struct imx_card_data *data)
if (!card->dai_link)
return -ENOMEM;
- data->link_data = devm_kcalloc(dev, num_links, sizeof(*link), GFP_KERNEL);
+ data->link_data = devm_kcalloc(dev, num_links, sizeof(*link_data), GFP_KERNEL);
if (!data->link_data)
return -ENOMEM;
--
2.39.5
From: Chenyuan Yang <chenyuan0y(a)gmail.com>
[ Upstream commit a9a69c3b38c89d7992fb53db4abb19104b531d32 ]
Incorrect types are used as sizeof() arguments in devm_kcalloc().
It should be sizeof(dai_link_data) for link_data instead of
sizeof(snd_soc_dai_link).
This is found by our static analysis tool.
Signed-off-by: Chenyuan Yang <chenyuan0y(a)gmail.com>
Link: https://patch.msgid.link/20250406210854.149316-1-chenyuan0y@gmail.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
sound/soc/fsl/imx-card.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/soc/fsl/imx-card.c b/sound/soc/fsl/imx-card.c
index 93dbe40008c00..e5ae435171d68 100644
--- a/sound/soc/fsl/imx-card.c
+++ b/sound/soc/fsl/imx-card.c
@@ -516,7 +516,7 @@ static int imx_card_parse_of(struct imx_card_data *data)
if (!card->dai_link)
return -ENOMEM;
- data->link_data = devm_kcalloc(dev, num_links, sizeof(*link), GFP_KERNEL);
+ data->link_data = devm_kcalloc(dev, num_links, sizeof(*link_data), GFP_KERNEL);
if (!data->link_data)
return -ENOMEM;
--
2.39.5
From: Chenyuan Yang <chenyuan0y(a)gmail.com>
[ Upstream commit a9a69c3b38c89d7992fb53db4abb19104b531d32 ]
Incorrect types are used as sizeof() arguments in devm_kcalloc().
It should be sizeof(dai_link_data) for link_data instead of
sizeof(snd_soc_dai_link).
This is found by our static analysis tool.
Signed-off-by: Chenyuan Yang <chenyuan0y(a)gmail.com>
Link: https://patch.msgid.link/20250406210854.149316-1-chenyuan0y@gmail.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
sound/soc/fsl/imx-card.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/soc/fsl/imx-card.c b/sound/soc/fsl/imx-card.c
index 21f617f6f9fa8..566214cb3d60c 100644
--- a/sound/soc/fsl/imx-card.c
+++ b/sound/soc/fsl/imx-card.c
@@ -543,7 +543,7 @@ static int imx_card_parse_of(struct imx_card_data *data)
if (!card->dai_link)
return -ENOMEM;
- data->link_data = devm_kcalloc(dev, num_links, sizeof(*link), GFP_KERNEL);
+ data->link_data = devm_kcalloc(dev, num_links, sizeof(*link_data), GFP_KERNEL);
if (!data->link_data)
return -ENOMEM;
--
2.39.5
A new on by default warning in clang [1] aims to flags instances where
const variables without static or thread local storage or const members
in aggregate types are not initialized because it can lead to an
indeterminate value. This is quite noisy for the kernel due to
instances originating from header files such as:
drivers/gpu/drm/i915/gt/intel_ring.h:62:2: error: default initialization of an object of type 'typeof (ring->size)' (aka 'const unsigned int') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe]
62 | typecheck(typeof(ring->size), next);
| ^
include/linux/typecheck.h:10:9: note: expanded from macro 'typecheck'
10 | ({ type __dummy; \
| ^
include/net/ip.h:478:14: error: default initialization of an object of type 'typeof (rt->dst.expires)' (aka 'const unsigned long') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe]
478 | if (mtu && time_before(jiffies, rt->dst.expires))
| ^
include/linux/jiffies.h:138:26: note: expanded from macro 'time_before'
138 | #define time_before(a,b) time_after(b,a)
| ^
include/linux/jiffies.h:128:3: note: expanded from macro 'time_after'
128 | (typecheck(unsigned long, a) && \
| ^
include/linux/typecheck.h:11:12: note: expanded from macro 'typecheck'
11 | typeof(x) __dummy2; \
| ^
include/linux/list.h:409:27: warning: default initialization of an object of type 'union (unnamed union at include/linux/list.h:409:27)' with const member leaves the object uninitialized [-Wdefault-const-init-field-unsafe]
409 | struct list_head *next = smp_load_acquire(&head->next);
| ^
include/asm-generic/barrier.h:176:29: note: expanded from macro 'smp_load_acquire'
176 | #define smp_load_acquire(p) __smp_load_acquire(p)
| ^
arch/arm64/include/asm/barrier.h:164:59: note: expanded from macro '__smp_load_acquire'
164 | union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u; \
| ^
include/linux/list.h:409:27: note: member '__val' declared 'const' here
crypto/scatterwalk.c:66:22: error: default initialization of an object of type 'struct scatter_walk' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe]
66 | struct scatter_walk walk;
| ^
include/crypto/algapi.h:112:15: note: member 'addr' declared 'const' here
112 | void *const addr;
| ^
fs/hugetlbfs/inode.c:733:24: error: default initialization of an object of type 'struct vm_area_struct' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe]
733 | struct vm_area_struct pseudo_vma;
| ^
include/linux/mm_types.h:803:20: note: member 'vm_flags' declared 'const' here
803 | const vm_flags_t vm_flags;
| ^
Silencing the instances from typecheck.h is difficult because '= {}' is
not available in older but supported compilers and '= {0}' would cause
warnings about a literal 0 being treated as NULL. While it might be
possible to come up with a local hack to silence the warning for
clang-21+, it may not be worth it since -Wuninitialized will still
trigger if an uninitialized const variable is actually used.
In all audited cases of the "field" variant of the warning, the members
are either not used in the particular call path, modified through other
means such as memset() / memcpy() because the containing object is not
const, or are within a union with other non-const members.
Since this warning does not appear to have a high signal to noise ratio,
just disable it.
Cc: stable(a)vger.kernel.org
Link: https://github.com/llvm/llvm-project/commit/576161cb6069e2c7656a8ef530727a0… [1]
Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org>
Closes: https://lore.kernel.org/CA+G9fYuNjKcxFKS_MKPRuga32XbndkLGcY-PVuoSwzv6VWbY=w…
Reported-by: Marcus Seyfarth <m.seyfarth(a)gmail.com>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2088
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
Changes in v2:
- Disable -Wdefault-const-init-var-unsafe as well, as '= {}' does not
work in typecheck() for all supported compilers and it may not be
worth a local hack.
- Link to v1: https://lore.kernel.org/r/20250501-default-const-init-clang-v1-0-3d2c6c185d…
---
scripts/Makefile.extrawarn | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index d88acdf40855..fd649c68e198 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -37,6 +37,18 @@ KBUILD_CFLAGS += -Wno-gnu
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219
KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf)
KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf)
+
+# Clang may emit a warning when a const variable, such as the dummy variables
+# in typecheck(), or const member of an aggregate type are not initialized,
+# which can result in unexpected behavior. However, in many audited cases of
+# the "field" variant of the warning, this is intentional because the field is
+# never used within a particular call path, the field is within a union with
+# other non-const members, or the containing object is not const so the field
+# can be modified via memcpy() / memset(). While the variable warning also gets
+# disabled with this same switch, there should not be too much coverage lost
+# because -Wuninitialized will still flag when an uninitialized const variable
+# is used.
+KBUILD_CFLAGS += $(call cc-disable-warning, default-const-init-unsafe)
else
# gcc inanely warns about local variables called 'main'
---
base-commit: 92a09c47464d040866cf2b4cd052bc60555185fb
change-id: 20250430-default-const-init-clang-b6e21b8d03b6
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
The vfio-pci huge_fault handler doesn't make any attempt to insert a
mapping containing the faulting address, it only inserts mappings if the
faulting address and resulting pfn are aligned. This works in a lot of
cases, particularly in conjunction with QEMU where DMA mappings linearly
fault the mmap. However, there are configurations where we don't get
that linear faulting and pages are faulted on-demand.
The scenario reported in the bug below is such a case, where the physical
address width of the CPU is greater than that of the IOMMU, resulting in a
VM where guest firmware has mapped device MMIO beyond the address width of
the IOMMU. In this configuration, the MMIO is faulted on demand and
tracing indicates that occasionally the faults generate a VM_FAULT_OOM.
Given the use case, this results in a "error: kvm run failed Bad address",
killing the VM.
The host is not under memory pressure in this test, therefore it's
suspected that VM_FAULT_OOM is actually the result of a NULL return from
__pte_offset_map_lock() in the get_locked_pte() path from insert_pfn().
This suggests a potential race inserting a pte concurrent to a pmd, and
maybe indicates some deficiency in the mm layer properly handling such a
case.
Nevertheless, Peter noted the inconsistency of vfio-pci's huge_fault
handler where our mapping granularity depends on the alignment of the
faulting address relative to the order rather than aligning the faulting
address to the order to more consistently insert huge mappings. This
change not only uses the page tables more consistently and efficiently, but
as any fault to an aligned page results in the same mapping, the race
condition suspected in the VM_FAULT_OOM is avoided.
Reported-by: Adolfo <adolfotregosa(a)gmail.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=220057
Fixes: 09dfc8a5f2ce ("vfio/pci: Fallback huge faults for unaligned pfn")
Cc: stable(a)vger.kernel.org
Tested-by: Adolfo <adolfotregosa(a)gmail.com>
Co-developed-by: Peter Xu <peterx(a)redhat.com>
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com>
---
drivers/vfio/pci/vfio_pci_core.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 35f9046af315..6328c3a05bcd 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1646,14 +1646,14 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
{
struct vm_area_struct *vma = vmf->vma;
struct vfio_pci_core_device *vdev = vma->vm_private_data;
- unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+ unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
+ unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long pfn = vma_to_pfn(vma) + pgoff;
vm_fault_t ret = VM_FAULT_SIGBUS;
- pfn = vma_to_pfn(vma) + pgoff;
-
- if (order && (pfn & ((1 << order) - 1) ||
- vmf->address & ((PAGE_SIZE << order) - 1) ||
- vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
+ if (order && (addr < vma->vm_start ||
+ addr + (PAGE_SIZE << order) > vma->vm_end ||
+ pfn & ((1 << order) - 1))) {
ret = VM_FAULT_FALLBACK;
goto out;
}
--
2.48.1
On Tue, 06 May 2025 12:11:26 +0200,
Ezra Khuzadi wrote:
>
>
> Hi Takashi, Jaroslav, all maintainers,
>
> Could you please review it or let me know if any changes are needed? This is
> my first kernel patch as a student, and I’d appreciate any feedback.
I guess you submitted to a wrong address. The proper mailing list is
linux-sound(a)vger.kernel.org. Please try to resubmit.
And, make sure that your mailer doesn't break tabs and whitespaces.
You can test sending to yourself and verify that the submitted patch
is properly applicable beforehand.
thanks,
Takashi
>
> Thanks,
> Ezra Khuzadi
>
> On Wed, Apr 30, 2025 at 1:43 AM Ezra Khuzadi <ekhuzadi(a)uci.edu> wrote:
>
> sound/pci/hda/patch_realtek.c: add quirk for HP Spectre x360 15-eb0xxx
>
> Add subsystem ID 0x86e5 for HP Spectre x360 15-eb0xxx so that
> ALC285_FIXUP_HP_SPECTRE_X360_EB1 (GPIO amp-enable, mic-mute LED and
> pinconfigs) is applied.
>
> Tested on HP Spectre x360 15-eb0043dx (Vendor 0x10ec0285, Subsys
> 0x103c86e5)
> with legacy HDA driver and hda-verb toggles:
>
> $ cat /proc/asound/card0/codec#0 \
> | sed -n -e '1,5p;/Vendor Id:/p;/Subsystem Id:/p'
> Codec: Realtek ALC285
> Vendor Id: 0x10ec0285
> Subsystem Id: 0x103c86e5
>
> $ dmesg | grep -i realtek
> [ 5.828728] snd_hda_codec_realtek ehdaudio0D0: ALC285: picked fixup
> for PCI SSID 103c:86e5
>
> Signed-off-by: Ezra Khuzadi <ekhuzadi(a)uci.edu>
> Cc: stable(a)vger.kernel.org
>
> ---
> sound/pci/hda/patch_realtek.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
> index 877137cb09ac..82ad105e7fa9 100644
> --- a/sound/pci/hda/patch_realtek.c
> +++ b/sound/pci/hda/patch_realtek.c
> @@ -10563,6 +10563,7 @@ static const struct hda_quirk alc269_fixup_tbl[] =
> {
> SND_PCI_QUIRK(0x103c, 0x86c7, "HP Envy AiO 32",
> ALC274_FIXUP_HP_ENVY_GPIO),
> + SND_PCI_QUIRK(0x103c, 0x86e5, "HP Spectre x360 15-eb0xxx",
> ALC285_FIXUP_HP_SPECTRE_X360_EB1),
> SND_PCI_QUIRK(0x103c, 0x86e7, "HP Spectre x360 15-eb0xxx",
> ALC285_FIXUP_HP_SPECTRE_X360_EB1),
> SND_PCI_QUIRK(0x103c, 0x86e8, "HP Spectre x360 15-eb0xxx",
> ALC285_FIXUP_HP_SPECTRE_X360_EB1),
> SND_PCI_QUIRK(0x103c, 0x86f9, "HP Spectre x360 13-aw0xxx",
> ALC285_FIXUP_HP_SPECTRE_X360_MUTE_LED),
>
> On Wed, Apr 30, 2025 at 1:33 AM kernel test robot <lkp(a)intel.com> wrote:
> >
> > Hi,
> >
> > Thanks for your patch.
> >
> > FYI: kernel test robot notices the stable kernel rule is not satisfied.
> >
> > The check is based on
> https://urldefense.com/v3/__https://www.kernel.org/doc/html/latest/process/…
> >
> > Rule: add the tag "Cc: stable(a)vger.kernel.org" in the sign-off area to
> have the patch automatically included in the stable tree.
> > Subject: sound/pci/hda: add quirk for HP Spectre x360 15-eb0xxx
> > Link:
> https://urldefense.com/v3/__https://lore.kernel.org/stable/CAPXr0uxh0c_2b2-…
> >
> > --
> > 0-DAY CI Kernel Test Service
> >
> https://urldefense.com/v3/__https://github.com/intel/lkp-tests/wiki__;!!CzA…
> >
> >
> >
>
Hi,
Pasi Kallinen reported in Debian a regression with perf r5101c4
counter, initially it was found in
https://github.com/rr-debugger/rr/issues/3949 but said to be a kernel
problem.
On Tue, May 06, 2025 at 07:18:39PM +0300, Pasi Kallinen wrote:
> Package: src:linux
> Version: 6.12.25-1
> Severity: normal
> X-Debbugs-Cc: debian-amd64(a)lists.debian.org, paxed(a)alt.org
> User: debian-amd64(a)lists.debian.org
> Usertags: amd64
>
> Dear Maintainer,
>
> perf stat -e r5101c4 true
>
> reports "not supported".
>
> The counters worked in kernel 6.11.10.
>
> I first noticed this not working when updating to 6.12.22.
> Booting back to 6.11.10, the counters work correctly.
Does this ring a bell?
Would you be able to bisect the changes to identify where the
behaviour changed?
Regards,
Salvatore
If a driver is removed, the driver framework invokes the driver's
remove callback. A CAN driver's remove function calls
unregister_candev(), which calls net_device_ops::ndo_stop further down
in the call stack for interfaces which are in the "up" state.
The removal of the module causes a warning, as can_rx_offload_del()
deletes the NAPI, while it is still active, because the interface is
still up.
To fix the warning, first unregister the network interface, which
calls net_device_ops::ndo_stop, which disables the NAPI, and then call
can_rx_offload_del().
Fixes: 1be37d3b0414 ("can: m_can: fix periph RX path: use rx-offload to ensure skbs are sent from softirq context")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/20250502-can-rx-offload-del-v1-3-59a9b131589d@peng…
Reviewed-by: Markus Schneider-Pargmann <msp(a)baylibre.com>
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/m_can/m_can.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 326ede9d400f..c2c116ce1087 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -2463,9 +2463,9 @@ EXPORT_SYMBOL_GPL(m_can_class_register);
void m_can_class_unregister(struct m_can_classdev *cdev)
{
+ unregister_candev(cdev->net);
if (cdev->is_peripheral)
can_rx_offload_del(&cdev->offload);
- unregister_candev(cdev->net);
}
EXPORT_SYMBOL_GPL(m_can_class_unregister);
--
2.47.2
If a driver is removed, the driver framework invokes the driver's
remove callback. A CAN driver's remove function calls
unregister_candev(), which calls net_device_ops::ndo_stop further down
in the call stack for interfaces which are in the "up" state.
The removal of the module causes a warning, as can_rx_offload_del()
deletes the NAPI, while it is still active, because the interface is
still up.
To fix the warning, first unregister the network interface, which
calls net_device_ops::ndo_stop, which disables the NAPI, and then call
can_rx_offload_del().
Fixes: ff60bfbaf67f ("can: rockchip_canfd: add driver for Rockchip CAN-FD controller")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/20250502-can-rx-offload-del-v1-2-59a9b131589d@peng…
Reviewed-by: Markus Schneider-Pargmann <msp(a)baylibre.com>
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/rockchip/rockchip_canfd-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/can/rockchip/rockchip_canfd-core.c b/drivers/net/can/rockchip/rockchip_canfd-core.c
index 7107a37da36c..c3fb3176ce42 100644
--- a/drivers/net/can/rockchip/rockchip_canfd-core.c
+++ b/drivers/net/can/rockchip/rockchip_canfd-core.c
@@ -937,8 +937,8 @@ static void rkcanfd_remove(struct platform_device *pdev)
struct rkcanfd_priv *priv = platform_get_drvdata(pdev);
struct net_device *ndev = priv->ndev;
- can_rx_offload_del(&priv->offload);
rkcanfd_unregister(priv);
+ can_rx_offload_del(&priv->offload);
free_candev(ndev);
}
--
2.47.2
If a driver is removed, the driver framework invokes the driver's
remove callback. A CAN driver's remove function calls
unregister_candev(), which calls net_device_ops::ndo_stop further down
in the call stack for interfaces which are in the "up" state.
With the mcp251xfd driver the removal of the module causes the
following warning:
| WARNING: CPU: 0 PID: 352 at net/core/dev.c:7342 __netif_napi_del_locked+0xc8/0xd8
as can_rx_offload_del() deletes the NAPI, while it is still active,
because the interface is still up.
To fix the warning, first unregister the network interface, which
calls net_device_ops::ndo_stop, which disables the NAPI, and then call
can_rx_offload_del().
Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/20250502-can-rx-offload-del-v1-1-59a9b131589d@peng…
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
index 064d81c724f4..c30b04f8fc0d 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c
@@ -2198,8 +2198,8 @@ static void mcp251xfd_remove(struct spi_device *spi)
struct mcp251xfd_priv *priv = spi_get_drvdata(spi);
struct net_device *ndev = priv->ndev;
- can_rx_offload_del(&priv->offload);
mcp251xfd_unregister(priv);
+ can_rx_offload_del(&priv->offload);
spi->max_speed_hz = priv->spi_max_speed_hz_orig;
free_candev(ndev);
}
--
2.47.2